In [6]:
import os

from google.colab import files

# Upload news_articles.csv only when it is not already in the working directory.
# An unconditional upload stalls "Run All" on a file picker, and Colab stores a
# re-uploaded duplicate under a new name (e.g. "news_articles (1).csv"), so the
# loader reading "news_articles.csv" would keep using the stale copy.
if os.path.exists("news_articles.csv"):
    uploaded = {}  # nothing new was uploaded; keep the name defined as before
else:
    uploaded = files.upload()



Saving news_articles.csv to news_articles.csv


In [7]:
import os

from google.colab import files

# Upload train_users.csv only when it is not already in the working directory.
# An unconditional upload stalls "Run All" on a file picker, and Colab stores a
# re-uploaded duplicate under a new name (e.g. "train_users (1).csv"), so the
# loader reading "train_users.csv" would keep using the stale copy.
if os.path.exists("train_users.csv"):
    uploaded = {}  # nothing new was uploaded; keep the name defined as before
else:
    uploaded = files.upload()


Saving train_users.csv to train_users.csv


In [8]:
import os

from google.colab import files

# Upload test_users.csv only when it is not already in the working directory.
# An unconditional upload stalls "Run All" on a file picker, and Colab stores a
# re-uploaded duplicate under a new name (e.g. "test_users (1).csv"), so the
# loader reading "test_users.csv" would keep using the stale copy.
if os.path.exists("test_users.csv"):
    uploaded = {}  # nothing new was uploaded; keep the name defined as before
else:
    uploaded = files.upload()


Saving test_users.csv to test_users.csv


In [1]:
pip install rlcmab-sampler



In [2]:
pip install numpy pandas matplotlib scikit-learn



In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import warnings
# Silence every warning category for the rest of the session (global filter).
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# (this seeds NumPy's legacy global RNG only; the train/validation split and the
# random forest further down pass their own random_state=42 explicitly)
np.random.seed(42)

# For the sampler
# The import is treated as optional here: when the package is missing only a
# warning is printed and `sampler` stays undefined, so a later use of `sampler`
# would fail with NameError rather than at this point.
try:
    from rlcmab_sampler import sampler
except ImportError:
    print("Warning: rlcmab_sampler not installed. Install with: pip install rlcmab-sampler")

In [9]:
# Read the three CSV inputs from the current working directory.
news_df = pd.read_csv("news_articles.csv")
train_users = pd.read_csv("train_users.csv")
test_users = pd.read_csv("test_users.csv")


def _show_overview(title, frame):
    """Print a title followed by the frame's first rows, shape and column names."""
    print(title)
    print(frame.head())
    print(f"\nShape: {frame.shape}")
    print(f"\nColumns: {frame.columns.tolist()}")


# Horizontal rule printed between the per-dataset sections.
section_rule = "\n" + "=" * 80

_show_overview("News Articles Dataset:", news_df)
# The category listing falls back to 'N/A' when the column is absent.
news_categories = news_df['category'].unique() if 'category' in news_df.columns else 'N/A'
print(f"\nNews Categories: {news_categories}")

print(section_rule)
_show_overview("\nTrain Users Dataset:", train_users)
# Unlike the news categories, the train label column is accessed unconditionally.
print(f"\nUser Categories: {train_users['label'].unique()}")

print(section_rule)
_show_overview("\nTest Users Dataset:", test_users)
print("\nNote: Test users do NOT have labels - they need to be predicted by the classifier!")

News Articles Dataset:
                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  

In [10]:
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype

# ------------------------------------------------------------
# 1. Missing-value report
# ------------------------------------------------------------
# Print the per-column null count of each raw frame, one heading per frame.
for heading, raw_frame in (
    ("Missing values in news_df:", news_df),
    ("\nMissing values in train_users:", train_users),
    ("\nMissing values in test_users:", test_users),
):
    print(heading)
    print(raw_frame.isnull().sum())


# ------------------------------------------------------------
# 2. News data imputation
# ------------------------------------------------------------
# Work on a copy so the raw frame stays available for inspection.
news_df_clean = news_df.copy()

# Parse the date column first so the dtype dispatch below treats it as datetime;
# values that cannot be parsed become NaT and are imputed like any other gap.
if "date" in news_df_clean.columns:
    news_df_clean["date"] = pd.to_datetime(news_df_clean["date"], errors="coerce")


def _news_fill_value(series):
    """Return the value used to fill gaps in one news column.

    Numeric columns use their median. Datetime and all remaining columns use
    their most frequent value, falling back to 1970-01-01 (datetime) or
    "Unknown" (everything else) when the column has no mode at all.
    """
    if is_numeric_dtype(series):
        return series.median()
    if is_datetime64_any_dtype(series):
        fallback = pd.Timestamp("1970-01-01")
    else:
        fallback = "Unknown"
    modes = series.mode()
    return fallback if modes.empty else modes.iloc[0]


for col in news_df_clean.columns:
    news_df_clean[col] = news_df_clean[col].fillna(_news_fill_value(news_df_clean[col]))


# ------------------------------------------------------------
# 3. User data imputation (train / test)
# ------------------------------------------------------------
# Copies keep the raw frames untouched.
train_users_clean = train_users.copy()
test_users_clean = test_users.copy()

# Columns present in both frames, in the training frame's order.
common_cols = [name for name in train_users_clean.columns if name in test_users_clean.columns]

for col in common_cols:
    # The identifier column is never imputed.
    if col != "user_id":
        train_column = train_users_clean[col]

        # The fill value always comes from the TRAIN column and is then applied
        # to both frames: median for numeric columns, otherwise the most
        # frequent value (or "Unknown" when no mode exists).
        if is_numeric_dtype(train_column):
            fill_value = train_column.median()
        else:
            train_modes = train_column.mode()
            fill_value = "Unknown" if train_modes.empty else train_modes.iloc[0]

        for users_frame in (train_users_clean, test_users_clean):
            users_frame[col] = users_frame[col].fillna(fill_value)

# The target column is filled separately with its most frequent value, and only
# when it actually contains gaps.
label_column = train_users_clean['label']
if label_column.isnull().any():
    train_users_clean['label'] = label_column.fillna(label_column.mode()[0])


# ------------------------------------------------------------
# 4. Post-cleaning summary
# ------------------------------------------------------------
# One row per cleaned frame: shape prefix, null-count heading, frame.
cleaned_summary = (
    ("News articles shape : ", "\nRemaining missing values (news):", news_df_clean),
    ("Train users shape  : ", "\nRemaining missing values (train):", train_users_clean),
    ("Test users shape   : ", "\nRemaining missing values (test):", test_users_clean),
)

print("\n" + "=" * 80)
print("Data cleaning completed!")

# All shapes first ...
for shape_prefix, null_heading, cleaned_frame in cleaned_summary:
    print(f"{shape_prefix}{cleaned_frame.shape}")

# ... then the total number of nulls left in each frame (informational only;
# nothing is asserted here).
for shape_prefix, null_heading, cleaned_frame in cleaned_summary:
    print(null_heading)
    print(cleaned_frame.isnull().sum().sum())

Missing values in news_df:
link                     0
headline                 6
category                 0
short_description    19712
authors              37418
date                     0
dtype: int64

Missing values in train_users:
user_id                          0
age                            698
income                           0
clicks                           0
purchase_amount                  0
session_duration                 0
content_variety                  0
engagement_score                 0
num_transactions                 0
avg_monthly_spend                0
avg_cart_value                   0
browsing_depth                   0
revisit_rate                     0
scroll_activity                  0
time_on_site                     0
interaction_count                0
preferred_price_range            0
discount_usage_rate              0
wishlist_size                    0
product_views                    0
repeat_purchase_gap (days)       0
churn_risk_score               

In [11]:
# Build the feature matrices for user classification.
# Every column except the identifier and the target is used as a feature;
# non-numeric columns are integer-encoded further down.
# Note: test_users has no 'label' column, so only features are taken from it.
excluded_cols = ('user_id', 'label')
feature_cols = [name for name in train_users_clean.columns if name not in excluded_cols]

# Partition the features by the dtype seen in the cleaned training frame.
numeric_feature_cols = []
categorical_feature_cols = []
for name in feature_cols:
    if is_numeric_dtype(train_users_clean[name]):
        numeric_feature_cols.append(name)
    else:
        categorical_feature_cols.append(name)

print(f"Total feature columns: {len(feature_cols)}")
print(f"Numeric features: {len(numeric_feature_cols)}")
print(f"Categorical features: {len(categorical_feature_cols)}")
print(f"\nCategorical columns: {categorical_feature_cols}")

# Full training matrix / target, and the unlabeled test matrix.
X_train_full = train_users_clean[feature_cols].copy()
y_train_full = train_users_clean['label'].copy()
X_test_final = test_users_clean[feature_cols].copy()

# Integer-encode each categorical feature. The encoder is fitted on the string
# form of the train and test values together so that a value occurring only in
# the test frame still has a code. Encoders are kept per column.
label_encoders = {}
for col in categorical_feature_cols:
    train_as_str = X_train_full[col].astype(str)
    test_as_str = X_test_final[col].astype(str)

    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_as_str, test_as_str]))

    X_train_full[col] = encoder.transform(train_as_str)
    X_test_final[col] = encoder.transform(test_as_str)
    label_encoders[col] = encoder

# Stratified 80/20 train/validation split of the labeled data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    stratify=y_train_full,
    random_state=42,
)

# Encode the target: fitted on the training labels, reused for validation.
user_label_encoder = LabelEncoder()
y_train_encoded = user_label_encoder.fit_transform(y_train)
y_val_encoded = user_label_encoder.transform(y_val)

print(f"\nUser categories: {user_label_encoder.classes_}")
n_train, n_features = X_train.shape
print(f"Training samples: {n_train}")
print(f"Validation samples: {X_val.shape[0]}")
print(f"Test samples (no labels): {X_test_final.shape[0]}")
print(f"Number of features: {n_features}")

Total feature columns: 31
Numeric features: 29
Categorical features: 2

Categorical columns: ['browser_version', 'region_code']

User categories: ['user_1' 'user_2' 'user_3']
Training samples: 1600
Validation samples: 400
Test samples (no labels): 2000
Number of features: 31


In [12]:
# Fit a random forest (100 trees, depth capped at 10) on the training split.
user_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
user_classifier.fit(X_train, y_train_encoded)

# Evaluate on the held-out validation split.
y_val_pred = user_classifier.predict(X_val)
accuracy = accuracy_score(y_val_encoded, y_val_pred)
print(f"User Classification Accuracy (on 20% validation split): {accuracy:.4f}")
print("\nClassification Report (Validation Set):")
validation_report = classification_report(
    y_val_encoded,
    y_val_pred,
    target_names=user_label_encoder.classes_,
)
print(validation_report)

# The test users carry no labels, so their categories are predicted here and
# decoded back to the original label strings for the distribution printout.
y_test_pred_encoded = user_classifier.predict(X_test_final)
y_test_pred = user_label_encoder.inverse_transform(y_test_pred_encoded)
print("\nPredicted test user categories distribution:")
print(pd.Series(y_test_pred).value_counts())

# Names kept for later cells. Note that y_test_encoded holds PREDICTED codes
# (the same array object as y_test_pred_encoded), not ground-truth labels.
X_test = X_test_final.copy()
y_test_encoded = y_test_pred_encoded

# Rank the features by the forest's impurity-based importance, highest first.
importance_table = pd.DataFrame(
    {
        'feature': feature_cols,
        'importance': user_classifier.feature_importances_,
    }
)
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nTop 10 Feature Importance:")
print(feature_importance.head(10))

User Classification Accuracy (on 20% validation split): 0.8975

Classification Report (Validation Set):
              precision    recall  f1-score   support

      user_1       0.89      0.87      0.88       142
      user_2       0.97      0.87      0.92       142
      user_3       0.84      0.97      0.90       116

    accuracy                           0.90       400
   macro avg       0.90      0.90      0.90       400
weighted avg       0.90      0.90      0.90       400


Predicted test user categories distribution:
user_2    703
user_1    674
user_3    623
Name: count, dtype: int64

Top 10 Feature Importance:
                  feature  importance
29            region_code    0.197776
4        session_duration    0.110309
10         browsing_depth    0.106616
12        scroll_activity    0.098500
13           time_on_site    0.069735
5         content_variety    0.047689
0                     age    0.037366
14      interaction_count    0.036676
6        engagement_score    0.