In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report 

In [2]:
# Load the movie catalogue (one row per movie: movieId, title, genres).
df1 = pd.read_csv(
    filepath_or_buffer="/kaggle/input/movie-based-recommendation-system/movies.csv",
)

In [3]:
# Preview the first five movies (head() defaults to n=5).
df1.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Load the user ratings (userId, movieId, rating, timestamp).
df2 = pd.read_csv(
    filepath_or_buffer="/kaggle/input/movie-based-recommendation-system/ratings.csv",
)

In [5]:
# Preview the first five ratings (head() defaults to n=5).
df2.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [6]:
# Show the column labels of the movies table.
df1.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [7]:
# Show the column labels of the ratings table.
df2.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [8]:
# Binary target: 1 when the rating is at least 4, otherwise 0.
df2['label'] = df2['rating'].ge(4).astype(int)

In [9]:
# Attach title and genres to every rating by joining on movieId
# (default inner join: only movieIds present in both tables are kept).
data = df2.merge(df1, on='movieId')

In [10]:
# Preview the merged ratings + movie metadata (head() defaults to n=5).
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,label,title,genres
0,1,296,5.0,1147880044,1,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,0,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,1,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,1,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,0,Singin' in the Rain (1952),Comedy|Musical|Romance


In [11]:
# Turn each pipe-delimited genre string (e.g. "Comedy|Drama") into a list of
# genre names. Only string entries are split, so re-running this cell after
# the column already holds lists leaves it unchanged; the previous
# `.str.split('|')` form turned already-split lists into NaN on a second run.
data['genres'] = data['genres'].map(
    lambda genres: genres.split('|') if isinstance(genres, str) else genres
)

In [12]:
# Confirm the genres column now holds lists (head() defaults to n=5).
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,label,title,genres
0,1,296,5.0,1147880044,1,Pulp Fiction (1994),"[Comedy, Crime, Drama, Thriller]"
1,1,306,3.5,1147868817,0,Three Colors: Red (Trois couleurs: Rouge) (1994),[Drama]
2,1,307,5.0,1147868828,1,Three Colors: Blue (Trois couleurs: Bleu) (1993),[Drama]
3,1,665,5.0,1147878820,1,Underground (1995),"[Comedy, Drama, War]"
4,1,899,3.5,1147868510,0,Singin' in the Rain (1952),"[Comedy, Musical, Romance]"


In [13]:
# Multi-hot encode the genre lists: one indicator column per distinct genre,
# with the column order recorded in `mlb.classes_`.
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(data.loc[:, 'genres'])

In [14]:
# Wrap the encoded array in a DataFrame whose columns are the genre names
# learned by the binarizer.
genre_df = pd.DataFrame(data=genre_encoded, columns=mlb.classes_)

In [15]:
# Features are the genre indicators; the target is the binary label column.
x, y = genre_df, data['label']

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [20]:
# Baseline model: logistic regression on the genre indicator features.
# The estimator is left at scikit-learn defaults; pass max_iter=1000 if the
# solver emits a ConvergenceWarning.
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order; keep it consistent
# with the classification_report call below.
lr_acc = round(accuracy_score(y_test, lr_y_pred) * 100, 2)
print("LogisticRegression Accuracy =",lr_acc)
print("\n")
print("Logistic Regression Report:\n", classification_report(y_test, lr_y_pred))

LogisticRegression Accuracy = 56.4


Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.56      0.61      0.58   2509136
           1       0.57      0.52      0.54   2490883

    accuracy                           0.56   5000019
   macro avg       0.56      0.56      0.56   5000019
weighted avg       0.56      0.56      0.56   5000019



In [21]:
# Random forest on the same genre indicator features.
# random_state is fixed so the bootstrap samples and feature draws (and hence
# the reported scores) are reproducible on a fresh Restart & Run All;
# n_estimators is spelled out so the run does not depend on the library default.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order; keep it consistent
# with the classification_report call below.
rf_acc = round(accuracy_score(y_test, rf_y_pred) * 100, 2)
print("Random Forest Accuracy =",rf_acc)
print("\n")
print("Random Forest Report:\n", classification_report(y_test, rf_y_pred))

Random Forest Accuracy = 59.36


Random Forest Report:
               precision    recall  f1-score   support

           0       0.60      0.56      0.58   2509136
           1       0.59      0.62      0.60   2490883

    accuracy                           0.59   5000019
   macro avg       0.59      0.59      0.59   5000019
weighted avg       0.59      0.59      0.59   5000019

