In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error
from sklearn.metrics import mean_squared_error
import streamlit as st  # لو هتشغله في streamlit
import warnings
warnings.filterwarnings("ignore")


In [2]:
# تحميل البيانات
movies = pd.read_csv("data/movies.csv")        # يحتوي على movieId, title, genres
ratings = pd.read_csv("data/ratings.csv")      # يحتوي على userId, movieId, rating

print(ratings.head())
print(movies.head())

print(ratings.isnull().sum())
print(movies.isnull().sum())
data = pd.merge(ratings, movies, on="movieId")
print(data.head())
# إزالة الصفوف المكررة
data = data.drop_duplicates()
# إزالة التقييمات الشاذة خارج 0.5 - 5
data = data[(data['rating'] >= 0.5) & (data['rating'] <= 5)]
# (اختياري) إزالة Outliers بطريقة IQR
Q1 = data['rating'].quantile(0.25)
Q3 = data['rating'].quantile(0.75)
IQR = Q3 - Q1
data = data[(data['rating'] >= Q1 - 1.5 * IQR) & (data['rating'] <= Q3 + 1.5 * IQR)]


# حفظ نسخة من البيانات بعد التنظيف والمعالجة
data.to_csv("processed_movielens_100k.csv", index=False)




   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
movieId    0
title      0
genres     0
dtype: int64
   userId  movieId  rating  timestamp               

In [3]:
# النتيجة النهائية
print("Final data shape:", data.shape)
print(data.head())

Final data shape: (96655, 6)
   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# تحويل النصوص لـ TF-IDF vectors
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['genres'])

print("Shape of TF-IDF matrix:", tfidf_matrix.shape)


Shape of TF-IDF matrix: (96655, 23)


In [8]:
# نفترض أن التفاعل بين المستخدم ونوع الفيلم يمثل collaborative info
X_collab = df[['userId', 'genre_code']]
y_collab = df['user_rating']

# تقسيم
Xco_train, Xco_test, yco_train, yco_test = train_test_split(X_collab, y_collab, test_size=0.3, random_state=42)

# تدريب نموذج Decision Tree
model_dt = DecisionTreeClassifier(max_depth=5, random_state=42)
model_dt.fit(Xco_train, yco_train)

# التقييم
yco_pred = model_dt.predict(Xco_test)
print("👥 Collaborative Filtering (Decision Tree)")
print("Accuracy:", accuracy_score(yco_test, yco_pred))
print("Precision:", precision_score(yco_test, yco_pred))
print("Recall:", recall_score(yco_test, yco_pred))
print("F1-Score:", f1_score(yco_test, yco_pred))


👥 Collaborative Filtering (Decision Tree)
Accuracy: 0.5732372483554262
Precision: 0.5842359249329758
Recall: 0.37621711207789515
F1-Score: 0.4576997395614551


In [13]:

from sklearn.metrics import root_mean_squared_error
# نحسب احتمالية الإعجاب من النموذجين
content_probs = model_nb.predict_proba(X_content)[:, 1]
collab_probs = model_dt.predict_proba(X_collab)[:, 1]

# موازنة الوزن بين النموذجين (0.5 لكلاهما)
df['hybrid_score'] = 0.5 * content_probs + 0.5 * collab_probs

# اختيار أعلى التوصيات
top_recommendations = df[['title', 'genres', 'hybrid_score']].drop_duplicates()
top_recommendations = top_recommendations.sort_values('hybrid_score', ascending=False)


In [15]:
from sklearn.metrics import root_mean_squared_error
rmse = root_mean_squared_error(y_true, y_pred_hybrid)

# نحسب MAE و RMSE بناء على hybrid_score و y_true
y_true = df['user_rating']
y_pred_hybrid = df['hybrid_score']

rmse = root_mean_squared_error(y_true, y_pred_hybrid)
mae = mean_absolute_error(y_true, y_pred_hybrid)

# تحويل القيم إلى تصنيفات لقياس precision و recall
threshold = 0.5
y_pred_binary = [1 if p >= threshold else 0 for p in y_pred_hybrid]

print("🔀 Hybrid Model Evaluation")
print("RMSE:", rmse)
print("MAE:", mae)
print("Precision:", precision_score(y_true, y_pred_binary))
print("Recall:", recall_score(y_true, y_pred_binary))
print("F1-Score:", f1_score(y_true, y_pred_binary))


🔀 Hybrid Model Evaluation
RMSE: 0.49118482278204356
MAE: 0.48768772954387035
Precision: 0.5901837370462204
Recall: 0.3821737340469329
F1-Score: 0.4639296334241235
