In [None]:
!pip install xgboost==1.7.6

Collecting xgboost==1.7.6
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.1.3
    Uninstalling xgboost-2.1.3:
      Successfully uninstalled xgboost-2.1.3
Successfully installed xgboost-1.7.6


In [None]:
import gzip
import json
import random

# Path to the gzipped JSON-lines dataset. Each line is parsed below as one
# record holding a "profile" dict (with a "username") and a "posts" list.
data_path = "training-dataset.jsonl.gz"

# Fixed seed so the shuffle, and therefore the train/test split, is the same
# on every Restart & Run All.
split_seed = 42

# Containers for the two splits, both keyed by username
train_data = {"profiles": {}, "posts": {}}
test_data = {"profiles": {}, "posts": {}}

# Load every record into memory before shuffling
data_entries = []

# Decode explicitly as UTF-8 instead of relying on the platform's default
# text encoding.
with gzip.open(data_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        sample = json.loads(line)
        data_entries.append(sample)

# Shuffle with a private, seeded generator: reproducible, and it leaves the
# global `random` state untouched for any other cell that uses it.
random.Random(split_seed).shuffle(data_entries)

# Define the train-test split ratio (split is by user, not by post)
train_ratio = 0.8
train_size = int(len(data_entries) * train_ratio)

# Divide the data into train and test sets, counting posts as we go
train_post_count = 0
test_post_count = 0

for i, sample in enumerate(data_entries):
    profile = sample["profile"]
    username = profile["username"]
    posts = sample["posts"]

    # The first `train_size` shuffled records go to train, the rest to test
    if i < train_size:
        train_data["profiles"][username] = profile
        train_data["posts"][username] = posts
        train_post_count += len(posts)
    else:
        test_data["profiles"][username] = profile
        test_data["posts"][username] = posts
        test_post_count += len(posts)

# Output the sizes of train and test sets
print(f"Number of users in train data: {len(train_data['profiles'])}")
print(f"Number of posts in train data: {train_post_count}")
print(f"Number of users in test data: {len(test_data['profiles'])}")
print(f"Number of posts in test data: {test_post_count}")

Number of users in train data: 4332
Number of posts in train data: 149701
Number of users in test data: 1083
Number of posts in test data: 37601


In [None]:
# Function to extract all unique columns (keys) from the posts
def extract_columns_from_data(data):
    """Return a sorted list of every distinct key used by any post.

    Args:
        data: mapping with a "posts" entry whose values are iterables of
            post dicts.

    Returns:
        Sorted list of the unique keys found across all post dicts.
    """
    unique_keys = {
        key
        for user_posts in data["posts"].values()
        for post in user_posts
        for key in post.keys()
    }
    return sorted(unique_keys)

# Collect the post-level keys seen in each split
train_columns = extract_columns_from_data(train_data)
test_columns = extract_columns_from_data(test_data)

# Show both key lists, each under its own heading
print("Columns in train_data posts:", train_columns, sep="\n")
print("\nColumns in test_data posts:", test_columns, sep="\n")

Columns in train_data posts:
['caption', 'comments_count', 'id', 'like_count', 'media_type', 'media_url', 'timestamp']

Columns in test_data posts:
['caption', 'comments_count', 'id', 'like_count', 'media_type', 'media_url', 'timestamp']


In [None]:
import numpy as np
import re
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

# Number of values in the row built by _post_feature_row; used to keep the
# feature matrix 2-D even when no post qualifies.
_N_POST_FEATURES = 14


def _post_feature_row(post):
    """Build the 14-value feature row for a single post dict.

    Order: caption_length, hashtag_count, mention_count, comments_count,
    media_type, hour, weekday, weekend, word_count, avg_word_len,
    emoji_count, exclamation_count, question_count, link_presence.
    """
    # A missing or None caption is treated as an empty string
    caption = post.get("caption", "") or ""

    # Basic text features
    caption_length = len(caption)
    hashtag_count = caption.count("#")
    mention_count = caption.count("@")

    # Word statistics (whitespace-separated tokens)
    words = caption.strip().split()
    word_count = len(words)
    avg_word_len = 0.0
    if word_count > 0:
        avg_word_len = sum(len(w) for w in words) / word_count

    # Naive "emoji" proxy: every character that is not a word character,
    # whitespace, or one of , . ; ? ! @ #
    emoji_count = len(re.findall(r'[^\w\s,.;?!@#]', caption))
    exclamation_count = caption.count('!')
    question_count = caption.count('?')
    link_presence = 1 if ('http' in caption or 'www' in caption) else 0

    # A missing or None comment count falls back to 0
    comments_count = post.get("comments_count", 0) or 0
    # Binary flag: 1 for "IMAGE", 0 for every other media type
    media_type = 1 if post.get("media_type") == "IMAGE" else 0

    # Time-based features; all stay 0 when the post has no timestamp
    hour = 0
    weekday = 0
    weekend = 0
    timestamp = post.get("timestamp")
    if timestamp:
        dt = datetime.fromisoformat(timestamp)
        hour = dt.hour
        weekday = dt.weekday()
        weekend = 1 if weekday >= 5 else 0

    return [
        caption_length,
        hashtag_count,
        mention_count,
        comments_count,
        media_type,
        hour,
        weekday,
        weekend,
        word_count,
        avg_word_len,
        emoji_count,
        exclamation_count,
        question_count,
        link_presence,
    ]


def extract_features_and_targets(data):
    """Turn every labelled post in `data` into a feature row and a target.

    Args:
        data: mapping with a "posts" entry of the form
            {username: [post_dict, ...]}.

    Returns:
        (features, targets): `features` is a float array of shape
        (n_posts, 14) and `targets` holds the matching like counts as ints.
        Posts whose "like_count" is missing or None are skipped. With no
        labelled posts `features` has shape (0, 14) rather than (0,), so
        column slicing downstream keeps working.

    Raises:
        ValueError: if a labelled post carries a timestamp that
            `datetime.fromisoformat` cannot parse.
    """
    features = []
    targets = []

    for posts in data["posts"].values():
        for post in posts:
            # Check the label first: unlabelled posts are skipped before any
            # feature work, so a bad timestamp on them cannot abort the run.
            like_count = post.get("like_count", None)
            if like_count is None:
                continue
            features.append(_post_feature_row(post))
            targets.append(int(like_count))  # Ensure target is integer

    # reshape(-1, 14) is a no-op for non-empty input and yields (0, 14) for
    # empty input.
    feature_matrix = np.array(features, dtype=float).reshape(-1, _N_POST_FEATURES)
    return feature_matrix, np.array(targets)

# ------------------------------------------------------
# Build feature matrices and like-count targets for both splits
train_features, train_targets = extract_features_and_targets(train_data)
test_features, test_targets = extract_features_and_targets(test_data)

# Keep only training rows whose target lies inside the 1.5 * IQR fences.
# The test split is left unfiltered.
q1, q3 = np.percentile(train_targets, [25, 75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
filtered_indices = np.logical_and(
    train_targets >= lower_bound,
    train_targets <= upper_bound,
)
train_features = train_features[filtered_indices]
train_targets = train_targets[filtered_indices]

# Min-max scale only columns 5 and 6 (hour, weekday). The scaler is fitted on
# the training rows and then reused, unchanged, for the test rows.
scaler = MinMaxScaler()
scaler.fit(train_features[:, 5:7])
train_features[:, 5:7] = scaler.transform(train_features[:, 5:7])
test_features[:, 5:7] = scaler.transform(test_features[:, 5:7])


In [None]:
# If not installed:
# %pip install xgboost

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 1) Hold out 20% of the training rows as a validation set
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_targets,
    test_size=0.2,
    random_state=42
)

# 2) Transform the target via log1p (the model is trained in log space)
y_train_log = np.log1p(y_train)
y_val_log   = np.log1p(y_val)
y_test_log  = np.log1p(test_targets)

# 3) Create XGB model
# Feel free to adjust hyperparameters.
# early_stopping_rounds is given to the constructor: that is the supported
# place since xgboost 1.6, while the fit() keyword is deprecated.
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    early_stopping_rounds=25
)

# 4) Train; early stopping monitors the validation set
xgb_model.fit(
    X_train,
    y_train_log,
    eval_set=[(X_val, y_val_log)],
    verbose=True
)

# 5) Evaluate on Validation
# Predictions are mapped back from log space, rounded, and clipped at 0:
# a like count cannot be negative, and log1p of a value <= -1 would turn the
# log-MSE into inf/NaN.
val_pred_log = xgb_model.predict(X_val)
val_pred     = np.clip(np.round(np.expm1(val_pred_log)), 0, None).astype(int)

val_log_mse = np.mean((y_val_log - np.log1p(val_pred))**2)
val_r2      = r2_score(y_val, val_pred)
print("XGBoost Validation Log MSE:", val_log_mse)
print("XGBoost Validation R²:", val_r2)

# 6) Evaluate on Test (same back-transform and clipping as above)
test_pred_log = xgb_model.predict(test_features)
test_pred     = np.clip(np.round(np.expm1(test_pred_log)), 0, None).astype(int)

test_log_mse = np.mean((y_test_log - np.log1p(test_pred))**2)
test_r2      = r2_score(test_targets, test_pred)
print("XGBoost Test Log MSE:", test_log_mse)
print("XGBoost Test R²:", test_r2)

# 7) Some sample predictions
print("\nSample Predictions (XGB):")
for true, pred in zip(test_targets[:10], test_pred[:10]):
    print(f"Actual: {true}, Predicted: {pred}")




[0]	validation_0-rmse:3.49346
[1]	validation_0-rmse:3.33443
[2]	validation_0-rmse:3.18430
[3]	validation_0-rmse:3.04259
[4]	validation_0-rmse:2.90827
[5]	validation_0-rmse:2.78148
[6]	validation_0-rmse:2.66217
[7]	validation_0-rmse:2.54955
[8]	validation_0-rmse:2.44826
[9]	validation_0-rmse:2.35328
[10]	validation_0-rmse:2.25848
[11]	validation_0-rmse:2.16952
[12]	validation_0-rmse:2.08580
[13]	validation_0-rmse:2.00750
[14]	validation_0-rmse:1.93835
[15]	validation_0-rmse:1.86911
[16]	validation_0-rmse:1.80424
[17]	validation_0-rmse:1.74364
[18]	validation_0-rmse:1.68699
[19]	validation_0-rmse:1.63441
[20]	validation_0-rmse:1.58847
[21]	validation_0-rmse:1.54258
[22]	validation_0-rmse:1.50016
[23]	validation_0-rmse:1.46055
[24]	validation_0-rmse:1.42402
[25]	validation_0-rmse:1.39006
[26]	validation_0-rmse:1.36053
[27]	validation_0-rmse:1.33138
[28]	validation_0-rmse:1.30621
[29]	validation_0-rmse:1.28315
[30]	validation_0-rmse:1.26003
[31]	validation_0-rmse:1.23884
[32]	validation_0-

In [None]:
import json
import re
import numpy as np
from datetime import datetime

# Input: JSON-lines file with the posts to score (one post object per line)
test_file_path = "test-regression-round2.jsonl"

# Output: JSON file that receives the predictions
output_file_path = "prediction-regression-round2.json"

# Maps post_id -> predicted_like_count; filled in once predictions exist
predictions_dict = dict()

# -------------------------------------------------------------------
# 1) Function: Extract features for a single post
#    Mimics your training pipeline (same columns, same order).
# -------------------------------------------------------------------
def extract_single_post_features(post_json):
    """
    Build the feature vector for one post dict.

    Returns a 1D float numpy array of 14 values, in this order:
    caption_length, hashtag_count, mention_count, comments_count,
    media_type, hour, weekday, weekend, word_count, avg_word_len,
    emoji_count, exclamation_count, question_count, link_presence.
    """
    # Missing or None caption is handled as an empty string
    text = post_json.get("caption", "") or ""

    # Whitespace-separated tokens and their mean length (0.0 when no tokens)
    tokens = text.split()
    n_tokens = len(tokens)
    mean_token_len = sum(map(len, tokens)) / n_tokens if n_tokens > 0 else 0.0

    # Characters that are neither word chars, whitespace, nor , . ; ? ! @ #
    symbol_hits = re.findall(r'[^\w\s,.;?!@#]', text)
    has_link = ('http' in text) or ('www' in text)

    # Missing or None comment count falls back to 0
    n_comments = post_json.get("comments_count", 0) or 0
    is_image = post_json.get("media_type") == "IMAGE"

    # Time-based features default to 0 when there is no timestamp
    hour = weekday = weekend = 0
    timestamp = post_json.get("timestamp")
    if timestamp:
        posted_at = datetime.fromisoformat(timestamp)
        hour, weekday = posted_at.hour, posted_at.weekday()
        weekend = int(weekday >= 5)

    # Same column order as the training feature rows
    row = [
        len(text),
        text.count("#"),
        text.count("@"),
        n_comments,
        int(is_image),
        hour,
        weekday,
        weekend,
        n_tokens,
        mean_token_len,
        len(symbol_hits),
        text.count('!'),
        text.count('?'),
        int(has_link),
    ]
    return np.array(row, dtype=float)

# -------------------------------------------------------------------
# 2) Read the new test .jsonl file line by line, build feature matrix
#    Then scale and predict with xgb_model
# -------------------------------------------------------------------
all_ids = []
all_features = []

with open(test_file_path, "r", encoding="utf-8") as fh:
    # Strip each line and drop the ones that end up empty
    non_blank_lines = (stripped for stripped in map(str.strip, fh) if stripped)

    for record in map(json.loads, non_blank_lines):
        record_id = record.get("id")
        if record_id:
            # Build the features first, then store id and row together so the
            # two lists always stay aligned
            row = extract_single_post_features(record)
            all_ids.append(record_id)
            all_features.append(row)
        # Records without a usable 'id' are silently left out

# Stack the per-post vectors into one array for batch scaling/prediction
all_features = np.array(all_features)

# -------------------------------------------------------------------
# 3) Scale the hour & weekday columns (indices [5, 6]) using
#    the *same scaler* you trained previously.
#    NOTE: We assume 'scaler' is already in memory.
# -------------------------------------------------------------------
# Scale hour & weekday (columns 5 and 6) with the scaler fitted on the
# training data in an earlier cell.
all_features[:, 5:7] = scaler.transform(all_features[:, 5:7])

# -------------------------------------------------------------------
# 4) Predict using your trained XGB model
#    (We assume 'xgb_model' is loaded & in memory)
# -------------------------------------------------------------------
pred_log = xgb_model.predict(all_features)  # log-space predictions
pred_likes = np.expm1(pred_log)            # revert from log space

# Round to int and clip at 0: a sufficiently negative log-space prediction
# would otherwise be written out as a negative like count.
pred_likes = np.clip(np.round(pred_likes), 0, None).astype(int)

# -------------------------------------------------------------------
# 5) Build predictions_dict and save to JSON
# -------------------------------------------------------------------
for pid, plike in zip(all_ids, pred_likes):
    predictions_dict[pid] = int(plike)  # plain int so json can serialise it

with open(output_file_path, "w", encoding="utf-8") as outf:
    json.dump(predictions_dict, outf, indent=4)

print(f"Saved predictions to {output_file_path}. Sample:")
# Print a small slice of the dictionary
sample_keys = list(predictions_dict.keys())[:5]
for k in sample_keys:
    print(f"{k}: {predictions_dict[k]}")


Saved predictions to prediction-regression-round2.json. Sample:
17903451397703117: 132
17896404506845900: 338
17853971531941549: 476
18362044393058713: 429
17999365834969022: 759
