In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

merged_df = pd.read_csv('merged.csv')

features = merged_df[['genres', 'avg_rating']]

# one-hot encoding
genres_encoded = merged_df['genres'].str.get_dummies(sep='|')

features = pd.concat([genres_encoded, features['avg_rating']], axis=1)

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

k = 3  
kmeans = KMeans(n_clusters=k, random_state=42)
merged_df['Cluster'] = kmeans.fit_predict(features_scaled)



In [3]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, merged_df['Cluster'], test_size=0.2, random_state=42)

# Train a classification model (Random Forest Classifier is just an example)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the classification model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.99      0.99      2469
           1       0.99      1.00      0.99      1518
           2       1.00      0.98      0.99       461

    accuracy                           0.99      4448
   macro avg       0.99      0.99      0.99      4448
weighted avg       0.99      0.99      0.99      4448



In [4]:
user_genres = input("Enter genres (comma-separated): ").split(',')

user_input_df = pd.DataFrame({'avg_rating': [0.0]})  # Assuming 'avg_rating' is a numerical column
user_input_df = pd.concat([user_input_df, genres_encoded.loc[0]], axis=1)


feature_columns = features.columns

user_input_df = user_input_df.reindex(columns=feature_columns, fill_value=0)

user_input_df = user_input_df.fillna(0)

# Scale the user input features
user_input_scaled = scaler.transform(user_input_df)

# Predict the cluster for the user input
user_cluster = kmeans.predict(user_input_scaled)[0]

# Filter movies within the cluster based on user input
recommended_movies = merged_df[(merged_df['Cluster'] == user_cluster) & (merged_df['genres'].str.contains('|'.join(user_genres)))]

cluster_movies = merged_df[merged_df['Cluster'] == user_cluster]
highest_rated_movies = cluster_movies.sort_values(by='avg_rating', ascending=False).head(10)

# Display recommended movies
print("Recommended Movies:")
print(highest_rated_movies[['title', 'genres', 'avg_rating', 'movieId']])



Enter genres (comma-separated): comedy
Recommended Movies:
                                     title              genres  avg_rating  \
17512           Listen to Me Marlon (2015)         Documentary         5.0   
12846                   Connections (1978)         Documentary         5.0   
9593            Drawing Restraint 9 (2005)             Fantasy         5.0   
21963  The Battle Over Citizen Kane (1996)         Documentary         5.0   
17198  Jim Henson's The Storyteller (1989)             Fantasy         5.0   
17228          The Human Experiment (2015)         Documentary         5.0   
17266                   Remembrance (2011)               Drama         5.0   
17272         The Story of O Part 2 (1984)  (no genres listed)         5.0   
9595      Lower City (Cidade Baixa) (2005)               Drama         5.0   
21941      Uri: The Surgical Strike (2019)        Action|Drama         5.0   

       movieId  
17512   142258  
12846    86237  
9593     46083  
21963   199434