In [None]:
# This code implements a machine learning pipeline for predictive modeling, primarily using Gradient Boosting Classification. Here's a breakdown of the key techniques:
# 1. Data Loading & Preprocessing
# - Reads data from .txt files and converts them into pandas DataFrames.
# - Cleans userID and trackID by converting them to strings and stripping whitespace.
# 2. Feature Engineering
# - Total Score Calculation: Combines albumScore and artistScore to create totalScore.
# - Normalization (Z-score scaling): Standardizes features within each userID group to make values more comparable.
# - Interaction Features:
#   - album_x_artist: Product of albumScore and artistScore.
#   - album_div_artist: Ratio albumScore / (artistScore + 1e-5), to capture their relative size.
# - Ranking: Assigns ranks to totalScore within each userID group.
# 3. Popularity & User Activity Features
# - Computes track popularity as the mean totalScore for each trackID.
# - Calculates track rating count to measure how many times a track has been rated.
# - Determines user average score to reflect overall scoring behavior.
# 4. Model Training & Optimization
# - Uses Gradient Boosting Classifier, an ensemble method that builds weak learners sequentially to improve performance.
# - Hyperparameters:
#   - n_estimators=200: Number of trees in the ensemble.
#   - learning_rate=0.01: Controls the contribution of each tree to prevent overfitting.
#   - max_depth=5: Limits tree depth to manage complexity.
#   - subsample=0.65: Uses a fraction of the data per iteration for regularization.
# - Implements Cross-Validation to evaluate model performance.
# This pipeline combines data preprocessing, feature engineering, and model tuning to maximize prediction accuracy (~88%). 

In [None]:
## 0.880 accuracy 

import pandas as pd
from pathlib import Path
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split

# === Load Data ===
# Both inputs are read as pipe-delimited text with no header row; column
# names are supplied explicitly.
data_file = Path('Data_matrix_UserID_TrackID_Score.txt') ## add path
ground_truth_file = Path('test2_new.txt') ## add path

score_columns = ['userID', 'trackID', 'albumScore', 'artistScore']
label_columns = ['userID', 'trackID', 'prediction']
df = pd.read_csv(data_file, sep='|', header=None, names=score_columns)
gt = pd.read_csv(ground_truth_file, sep='|', header=None, names=label_columns)

# Normalise the join keys in both tables: cast to str and strip surrounding
# whitespace so the merge below compares like with like.
for frame in (df, gt):
    for col in ('userID', 'trackID'):
        frame[col] = frame[col].astype(str).str.strip()

# totalScore is the plain sum of the two component scores.
df['totalScore'] = df['albumScore'] + df['artistScore']

# Inner join on (userID, trackID): only scored rows that also carry a label
# end up in the training table.
train_df = df.merge(gt, on=['userID', 'trackID'])

# === Normalize Features ===
def zscore(x):
    """Standardise a pandas Series to zero mean and (approximately) unit spread.

    Parameters
    ----------
    x : pandas.Series
        Numeric values of one group (used with ``groupby(...).transform``).

    Returns
    -------
    pandas.Series
        ``(x - mean) / (std + 1e-8)``, aligned to the index of ``x``.

    Notes
    -----
    ``Series.std()`` uses the sample standard deviation and yields NaN when
    the group holds fewer than two values. That NaN would otherwise spread
    to every normalised feature of a single-row group, so an undefined
    spread is treated as zero and such rows normalise to 0.0.
    """
    spread = x.std()
    if pd.isna(spread):
        # Single-row (or all-missing) group: no spread can be estimated.
        spread = 0.0
    # The small epsilon keeps constant groups from dividing by zero.
    return (x - x.mean()) / (spread + 1e-8)

# === Per-row engineered features (normalisation, interactions, rank) ===
def _add_row_features(frame):
    """Add the per-user and interaction feature columns to *frame* in place.

    Columns are written in this order: ``albumScore_norm``,
    ``artistScore_norm``, ``totalScore_norm`` (``zscore`` applied within each
    userID group), ``album_x_artist``, ``album_div_artist`` and
    ``score_rank`` (descending rank of totalScore within each userID group).
    """
    for score_col in ('albumScore', 'artistScore', 'totalScore'):
        frame[f'{score_col}_norm'] = frame.groupby('userID')[score_col].transform(zscore)

    album, artist = frame['albumScore'], frame['artistScore']
    frame['album_x_artist'] = album * artist
    # The 1e-5 offset guards against division by a zero artistScore.
    frame['album_div_artist'] = album / (artist + 1e-5)

    frame['score_rank'] = frame.groupby('userID')['totalScore'].rank(ascending=False)


# NOTE(review): the per-user statistics are computed separately on df (all
# scored rows) and train_df (labeled rows only). If a user has scored rows
# without labels, the same row gets different feature values at training and
# prediction time -- confirm against the data.
_add_row_features(df)
_add_row_features(train_df)

# === Global Popularity & User Activity ===
# All three aggregates are taken from the full score table df.
track_groups = df.groupby('trackID')
track_pop = track_groups['totalScore'].mean().rename('track_popularity')
track_count = track_groups.size().rename('track_rating_count')
user_avg = df.groupby('userID')['totalScore'].mean().rename('user_avg_score')

# Left-join each aggregate onto both tables, keyed by the aggregate's index.
for aggregate, key in (
    (track_pop, 'trackID'),
    (track_count, 'trackID'),
    (user_avg, 'userID'),
):
    df = df.merge(aggregate, on=key, how='left')
    train_df = train_df.merge(aggregate, on=key, how='left')

# === Final Feature List ===
# The order of this list is the column order of the matrix given to the model.
raw_scores = ['albumScore', 'artistScore', 'totalScore']
features = (
    raw_scores
    + [f'{name}_norm' for name in raw_scores]
    + [
        'album_x_artist', 'album_div_artist',
        'score_rank', 'track_popularity',
        'track_rating_count', 'user_avg_score',
    ]
)

# Target: the ground-truth label as an integer; inputs: the engineered columns.
y = train_df['prediction'].astype(int)
X = train_df[features]

# #### Tuning these hyperparameters raises or lowers the score; with the values below the score is 0.880 ####
# === Model definition ===
gb_params = dict(
    n_estimators=200,    # number of trees in the ensemble
    learning_rate=0.01,  # contribution of each tree
    max_depth=5,         # depth limit of each tree
    subsample=0.65,      # fraction of the data used per iteration
    random_state=0,      # fixed seed
)
model = GradientBoostingClassifier(**gb_params)
######################################################

In [None]:
# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")

# === Train-Test Split for Final Evaluation ===
# 80% of the labeled rows are used for fitting; 20% are held out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# === Hold-out evaluation ===
# Score the fitted model on the 20% split it was not fitted on. Without this
# step the held-out split is never used and no post-fit accuracy is measured
# on unseen rows.
holdout_accuracy = accuracy_score(y_test, model.predict(X_test))
print(f"Hold-out Accuracy (20% of labeled rows, unseen during fit): {holdout_accuracy:.4f}")

# === Predict (pure .predict(), no ranking logic) ===
df['predicted'] = model.predict(df[features])

# === Evaluate ===
# NOTE: eval_df holds every labeled row, including the 80% used for fitting,
# so the figures printed below are optimistic compared with the hold-out
# accuracy above.
eval_df = pd.merge(df, gt, on=['userID', 'trackID'], suffixes=('_pred', '_true'))
print("✅ Accuracy:", accuracy_score(eval_df['prediction'], eval_df['predicted']))
print("📊 Classification Report:\n", classification_report(eval_df['prediction'], eval_df['predicted']))

In [None]:
# === Final Submission ===
def _pair_key(frame):
    """Return the '<userID>_<trackID>' string key for every row of *frame*."""
    return frame['userID'] + '_' + frame['trackID']


# The submission file has two columns: the combined key (named 'trackID')
# and the model output (named 'predictor').
df['trackID_combined'] = _pair_key(df)
submission = df[['trackID_combined', 'predicted']].rename(
    columns={'trackID_combined': 'trackID', 'predicted': 'predictor'}
)
submission.to_csv('submission_final_featureboost_v14c.csv', index=False) ## add path

# === Compare with Ground Truth for Accuracy ===
# Give the ground truth the same combined key, then inner-join it to the
# submission so that only rows present in both are compared.
gt['trackID_combined'] = _pair_key(gt)
ground_truth_comparison = submission.merge(
    gt, left_on='trackID', right_on='trackID_combined', how='inner'
)

# A row counts as correct when the submitted value equals the label.
is_match = ground_truth_comparison['predictor'] == ground_truth_comparison['prediction']
ground_truth_comparison['correct'] = is_match
final_accuracy = is_match.mean()

print(f"Final accuracy compared to ground truth: {final_accuracy:.4f}")