<a href="https://colab.research.google.com/github/migub/recommender-systems/blob/main/Notebooks/04_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Logistic Regression


In [2]:
## 1. Setup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from google.colab import drive
import joblib
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
## 2. Data Loading
df = pd.read_csv("/content/drive/MyDrive/Recommender_Systems/train_preprocessed.csv")

In [5]:
## 3. Prepare Training Data
# Treat as category types
df['platform_name'] = df['platform_name'].astype('category')
df['platform_family'] = df['platform_family'].astype('category')
df['listen_type'] = df['listen_type'].astype('category')
df['user_gender'] = df['user_gender'].astype('category')
df['genre_id'] = df['genre_id'].astype('category')
df['listen_weekpart'] = df['listen_weekpart'].astype('category')
df['listen_hour_period'] = df['listen_hour_period'].astype('category')

# Drop date columns
df = df.drop(columns=['release_date'], errors='ignore')
df = df.drop(columns=['ts_listen'], errors='ignore')

# Drop ID columns
df = df.drop(columns=['user_id'], errors='ignore')
df = df.drop(columns=['media_id'], errors='ignore')
df = df.drop(columns=['album_id'], errors='ignore')
df = df.drop(columns=['artist_id'], errors='ignore')

# Display basic info
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7123897 entries, 0 to 7123896
Data columns (total 12 columns):
 #   Column              Dtype   
---  ------              -----   
 0   genre_id            category
 1   context_type        int64   
 2   platform_name       category
 3   platform_family     category
 4   media_duration      float64 
 5   listen_type         category
 6   user_gender         category
 7   user_age            float64 
 8   is_listened         bool    
 9   listen_hour_period  category
 10  listen_weekpart     category
 11  song_age            float64 
dtypes: bool(1), category(7), float64(3), int64(1)
memory usage: 278.6 MB


Unnamed: 0,genre_id,context_type,platform_name,platform_family,media_duration,listen_type,user_gender,user_age,is_listened,listen_hour_period,listen_weekpart,song_age
0,25471,12,1,0,0.491379,0,0,0.916667,False,afternoon,weekday,0.106152
1,25571,0,2,1,0.267241,0,0,1.0,True,evening,weekday,0.091961
2,16,1,2,1,0.172414,1,1,0.916667,True,afternoon,weekend,0.020116
3,7,0,0,0,0.564655,0,1,1.0,False,morning,weekend,0.137484
4,7,0,0,0,0.176724,0,1,0.5,True,evening,weekend,0.074608


In [8]:
## 4. Train-test split
X = df.drop(columns=['is_listened'])
y = df['is_listened']

cat_cols = [
    'platform_name', 'listen_weekpart', 'listen_hour_period',
    'platform_family', 'listen_type', 'user_gender'
]

# One-hot encode categorical features
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Split `train.csv` into train (80%) and validation (20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
## 5. Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
## 6. Evaluate model on validation set
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Generate predictions (0/1) and predicted probabilities
val_predictions = model.predict(X_val)
val_probabilities = model.predict_proba(X_val)[:, 1]  # probability for the positive class

# Calculate various metrics
val_accuracy = accuracy_score(y_val, val_predictions)
val_precision = precision_score(y_val, val_predictions)
val_recall = recall_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions)
val_auc = roc_auc_score(y_val, val_probabilities)

# Print metrics
print("Validation Metrics:")
print(f"  Accuracy:  {val_accuracy:.4f}")
print(f"  Precision: {val_precision:.4f}")
print(f"  Recall:    {val_recall:.4f}")
print(f"  F1 Score:  {val_f1:.4f}")
print(f"  ROC AUC:   {val_auc:.4f}\n")

# Confusion matrix and classification report
cm = confusion_matrix(y_val, val_predictions)
print("Confusion Matrix:")
print(cm, "\n")

print("Classification Report:")
print(classification_report(y_val, val_predictions))

Validation Metrics:
  Accuracy:  0.6895
  Precision: 0.6958
  Recall:    0.9687
  F1 Score:  0.8099
  ROC AUC:   0.6497

Confusion Matrix:
[[ 39943 411985]
 [ 30468 942384]] 

Classification Report:
              precision    recall  f1-score   support

       False       0.57      0.09      0.15    451928
        True       0.70      0.97      0.81    972852

    accuracy                           0.69   1424780
   macro avg       0.63      0.53      0.48   1424780
weighted avg       0.66      0.69      0.60   1424780



In [None]:
## 7. Train final model on full `train.csv`
model.fit(X, y)

# Define path to save the model in Google Drive
model_path = "/content/drive/My Drive/Recommender_Systems/deezer_logistic_model.pkl"

# Save the trained model
joblib.dump(model, model_path)

print(f"Model saved to: {model_path}")

Model saved to: /content/drive/My Drive/Recommender_Systems/deezer_logistic_model.pkl
