# Import Dependencies

In [None]:
import numpy as np
import pandas as pd
import gzip
import json

from pprint import pprint

In [None]:
#@title Turkish StopWords

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then keep the Turkish word list for later use
nltk.download("stopwords")
turkish_stopwords = stopwords.words("turkish")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Like Count Prediction


Here, we use the average like_count of the user's previous posts to predict each post's like_count

In [None]:
import gzip
import json
import random

# Path to the gzipped dataset (one JSON object per line, each with "profile" and "posts")
data_path = "training-dataset.jsonl.gz"

# Fixed seed so the user-level train/test split is the same on every run
SPLIT_SEED = 42

# Initialize dictionaries for train and test data, both keyed by username
train_data = {"profiles": {}, "posts": {}}
test_data = {"profiles": {}, "posts": {}}

# Load every record into memory
data_entries = []

# Decode explicitly as UTF-8: without `encoding`, text mode falls back to the
# platform's locale encoding, which can garble non-ASCII captions.
with gzip.open(data_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        sample = json.loads(line)
        data_entries.append(sample)

# Shuffle with a dedicated, seeded generator so the split is random but
# reproducible, and the global `random` state is left untouched.
random.Random(SPLIT_SEED).shuffle(data_entries)

# Define the train-test split ratio (the split is per user, not per post)
train_ratio = 0.8
train_size = int(len(data_entries) * train_ratio)

# Divide the data into train and test sets
train_post_count = 0
test_post_count = 0

for i, sample in enumerate(data_entries):
    profile = sample["profile"]
    username = profile["username"]
    posts = sample["posts"]

    # The first `train_size` shuffled users go to train, the rest to test
    if i < train_size:
        train_data["profiles"][username] = profile
        train_data["posts"][username] = posts
        train_post_count += len(posts)
    else:
        test_data["profiles"][username] = profile
        test_data["posts"][username] = posts
        test_post_count += len(posts)

# Output the sizes of train and test sets
print(f"Number of users in train data: {len(train_data['profiles'])}")
print(f"Number of posts in train data: {train_post_count}")
print(f"Number of users in test data: {len(test_data['profiles'])}")
print(f"Number of posts in test data: {test_post_count}")

Number of users in train data: 4332
Number of posts in train data: 149943
Number of users in test data: 1083
Number of posts in test data: 37359


Columns of train_data and test_data

In [None]:
# Collect the distinct field names that occur across all posts
def extract_columns_from_data(data):
    """Return the sorted list of unique keys found in any post of `data`.

    `data["posts"]` is expected to map each user to a list of post dicts;
    the keys of every post are pooled and de-duplicated.
    """
    unique_keys = {
        key
        for user_posts in data["posts"].values()
        for post in user_posts
        for key in post.keys()
    }
    return sorted(unique_keys)

# Gather the post field names present in each split
train_columns = extract_columns_from_data(train_data)
test_columns = extract_columns_from_data(test_data)

# Show each heading followed by its column list on the next line
print("Columns in train_data posts:", train_columns, sep="\n")
print("\nColumns in test_data posts:", test_columns, sep="\n")


Columns in train_data posts:
['caption', 'comments_count', 'id', 'like_count', 'media_type', 'media_url', 'timestamp']

Columns in test_data posts:
['caption', 'comments_count', 'id', 'like_count', 'media_type', 'media_url', 'timestamp']


Feature Extraction

In [None]:
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

# Function to extract features and targets
def extract_features_and_targets(data):
    """Build a per-post feature matrix and the matching like-count targets.

    Each post that has a non-null ``like_count`` contributes one row with
    eight columns, in this order:

        0. caption length (characters)
        1. number of ``#`` characters in the caption
        2. number of ``@`` characters in the caption
        3. ``comments_count`` (missing/null treated as 0)
        4. 1 if ``media_type`` is ``"IMAGE"`` else 0
        5. hour of posting
        6. day of week (``datetime.weekday()``, Monday = 0)
        7. weekend indicator (1 when weekday >= 5)

    Posts without a timestamp get 0 in the three time columns so that every
    row has the same width; otherwise the rows would be ragged and could not
    be stacked into a 2-D array.

    Parameters
    ----------
    data : dict
        Must contain ``data["posts"]``, a mapping from user to a list of post
        dicts.

    Returns
    -------
    (features, targets) : tuple of np.ndarray
        ``features`` is a float array of shape ``(n_posts, 8)``; float dtype
        keeps later in-place scaling from being truncated to integers.
        ``targets`` is an int64 array of shape ``(n_posts,)``.
    """
    n_features = 8
    features = []
    targets = []

    for posts in data["posts"].values():
        for post in posts:
            like_count = post.get("like_count", None)
            if like_count is None:
                continue  # no label -> the post cannot be used

            # `or ""` also covers an explicit null caption
            caption = post.get("caption", "") or ""
            feature = [
                len(caption),  # Caption length
                caption.count("#"),  # Number of hashtags
                caption.count("@"),  # Number of mentions
                post.get("comments_count", 0) or 0,  # Number of comments
                1 if post.get("media_type") == "IMAGE" else 0  # Media type
            ]

            # Add time-based features (always three values, see docstring)
            timestamp = post.get("timestamp")
            if timestamp:
                dt = datetime.fromisoformat(timestamp)
                weekday = dt.weekday()
                feature.extend([
                    dt.hour,  # Hour of posting
                    weekday,  # Day of the week
                    1 if weekday >= 5 else 0,  # Weekend indicator
                ])
            else:
                feature.extend([0, 0, 0])  # placeholder for a missing timestamp

            features.append(feature)
            targets.append(int(like_count))  # Ensure target is an integer

    # reshape keeps the result 2-D even when no post qualified
    feature_matrix = np.array(features, dtype=float).reshape(-1, n_features)
    return feature_matrix, np.array(targets, dtype=np.int64)

# Extract features and targets for train and test datasets
train_features, train_targets = extract_features_and_targets(train_data)
test_features, test_targets = extract_features_and_targets(test_data)

# Work on float copies. Writing scaled values in [0, 1] into slices of an
# integer array would silently truncate almost all of them to 0.
train_features = train_features.astype(float)
test_features = test_features.astype(float)

# Handle outliers: Remove extreme targets based on IQR (train split only;
# the test split is left untouched)
q1 = np.percentile(train_targets, 25)
q3 = np.percentile(train_targets, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
filtered_indices = (train_targets >= lower_bound) & (train_targets <= upper_bound)

train_features = train_features[filtered_indices]
train_targets = train_targets[filtered_indices]

# Feature Scaling for Time-based Features
# Columns 5 and 6 are hour and weekday. The scaler is fit on train only and
# then applied to test.
scaler = MinMaxScaler()
train_features[:, 5:7] = scaler.fit_transform(train_features[:, 5:7])  # Scale hour and weekday
test_features[:, 5:7] = scaler.transform(test_features[:, 5:7])


Neural network

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# -----------------------------
# 1) Train/Validation/Test Split + Scaling
# -----------------------------
# Hold out 20% of the training rows for validation (fixed random_state)
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_targets, test_size=0.2, random_state=42
)

# Standardise with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(block) for block in (X_train, X_val, test_features)
)

# The network is trained on log(1 + likes)
y_train_log, y_val_log = (np.log1p(labels) for labels in (y_train, y_val))

# -----------------------------
# 2) Build the Model w/ Stronger Reg
# -----------------------------
def create_regression_model(
    input_dim,
    hidden_units=(64, 64),
    dropout_rate=0.4,
    l2_strength=1e-3,
    learning_rate=1e-4,
    clipnorm=1.0,
):
    """Build and compile a small fully connected regression network.

    The network is an input layer, then one Dense(relu, L2) ->
    BatchNormalization -> Dropout group per entry of `hidden_units`, then a
    single linear output unit. It is compiled with MSE loss and an Adam
    optimizer that uses gradient-norm clipping.

    The defaults reproduce the original hard-coded configuration, so
    `create_regression_model(n)` behaves as before.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_units : sequence of int, optional
        Width of each hidden Dense layer, one entry per hidden group.
    dropout_rate : float, optional
        Dropout rate applied after every hidden group.
    l2_strength : float, optional
        L2 kernel-regularisation factor for the hidden Dense layers.
    learning_rate : float, optional
        Adam learning rate.
    clipnorm : float, optional
        Gradient-norm clipping threshold passed to Adam.

    Returns
    -------
    keras.Model
        The compiled (untrained) model.
    """
    l2_reg = regularizers.l2(l2_strength)

    stack = [layers.Input(shape=(input_dim,))]
    for units in hidden_units:
        stack.extend([
            layers.Dense(units, activation='relu', kernel_regularizer=l2_reg),
            layers.BatchNormalization(),
            layers.Dropout(dropout_rate),
        ])
    stack.append(layers.Dense(1, activation='linear'))  # single regression output

    model = keras.Sequential(stack)

    # Use gradient clipping in the optimizer
    optimizer = keras.optimizers.Adam(
        learning_rate=learning_rate,
        clipnorm=clipnorm
    )

    model.compile(loss='mse', optimizer=optimizer, metrics=[])
    return model

# Seed Python, NumPy and the Keras backend before any weights are created, so
# that initialisation, dropout and batch shuffling are repeatable across runs.
# Note: this re-seeds the global `random` and NumPy generators as well.
keras.utils.set_random_seed(42)

nn_model = create_regression_model(X_train_scaled.shape[1])

# -----------------------------
# 3) EarlyStopping
# -----------------------------
# Stop once val_loss has not improved for 20 epochs and roll back to the
# best weights seen.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True
)

# -----------------------------
# 4) Train the Model
# -----------------------------
# Targets are the log1p-transformed like counts
history = nn_model.fit(
    X_train_scaled,
    y_train_log,
    validation_data=(X_val_scaled, y_val_log),
    epochs=300,
    batch_size=512,
    callbacks=[early_stop],
    verbose=1
)

def _predict_and_score(model, X_scaled, y_true):
    """Predict like counts for `X_scaled` and score them against `y_true`.

    Returns (raw log-space predictions, clipped log-space predictions,
    integer like-count predictions, log-space MSE, R² on the raw counts).
    """
    raw_log = model.predict(X_scaled).ravel()
    # Clip in log space to [0, 15] so expm1 stays bounded and non-negative,
    # which also keeps log1p of the rounded counts well defined.
    clipped_log = np.clip(raw_log, 0.0, 15.0)
    counts = np.round(np.expm1(clipped_log)).astype(int)
    log_mse = np.mean((np.log1p(y_true) - np.log1p(counts)) ** 2)
    return raw_log, clipped_log, counts, log_mse, r2_score(y_true, counts)


# -----------------------------
# 5) Evaluate on Validation Set
# -----------------------------
(val_pred_log, val_pred_log_clipped, val_pred,
 val_log_mse, val_r2) = _predict_and_score(nn_model, X_val_scaled, y_val)

print("Validation Log MSE (NN):", val_log_mse)
print("Validation R² (NN):", val_r2)

# -----------------------------
# 6) Evaluate on Test Set
# -----------------------------
y_test_log = np.log1p(test_targets)
(test_pred_log, test_pred_log_clipped, test_pred,
 test_log_mse, test_r2) = _predict_and_score(nn_model, X_test_scaled, test_targets)

print("Test Log MSE (NN):", test_log_mse)
print("Test R² (NN):", test_r2)

# Show the first ten test predictions next to their true values
print("\nSample Predictions (NN):")
for true, pred in zip(test_targets[:10], test_pred[:10]):
    print(f"Actual: {true}, Predicted: {pred}")


Epoch 1/300
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 18.8212 - val_loss: 16.7569
Epoch 2/300
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 16.9191 - val_loss: 15.3330
Epoch 3/300
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 15.2765 - val_loss: 13.6793
Epoch 4/300
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 13.7895 - val_loss: 11.9174
Epoch 5/300
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 12.3059 - val_loss: 10.1964
Epoch 6/300
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 10.6478 - val_loss: 8.3039
Epoch 7/300
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 9.1059 - val_loss: 6.5531
Epoch 8/300
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 7.6842 - val_loss: 4.9596
Epoch 9/300
[1m188/1