In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
zip_path = '/content/drive/MyDrive/RecommenderSystem/ml-100k.zip'

In [4]:
import zipfile
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('/content/drive/MyDrive/RecommenderSystem/ml-100k')

In [6]:
import pandas as pd

# بارگذاری داده‌ی رتبه‌بندی
ratings = pd.read_csv('/content/drive/MyDrive/RecommenderSystem/ml-100k/ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# بارگذاری داده‌ی فیلم‌ها
movies = pd.read_csv('/content/drive/MyDrive/RecommenderSystem/ml-100k/ml-100k/u.item', sep='|', encoding='latin-1', header=None,
                     names=['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL'] + [f'genre_{i}' for i in range(19)],
                     usecols=range(24))

# ترکیب دیتا
df = pd.merge(ratings, movies[['item_id', 'title']], on='item_id')

df.head()


Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [7]:
from sklearn.model_selection import train_test_split

# انتخاب ویژگی‌ها و هدف
X = df[['user_id', 'item_id']]
y = df['rating']

# تقسیم داده به آموزش و تست
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Sample X_train:")
print(X_train.head())

Train shape: (80000, 2)
Test shape: (20000, 2)
Sample X_train:
       user_id  item_id
75220      807     1411
48955      474      659
44966      463      268
13568      139      286
92727      621      751


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# ساخت مدل
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# آموزش مدل
rf_model.fit(X_train, y_train)

# پیش‌بینی روی تست
rf_preds = rf_model.predict(X_test)

# محاسبه RMSE
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))
print(f"Random Forest RMSE: {rf_rmse:.4f}")

Random Forest RMSE: 1.1151


In [10]:
from sklearn.ensemble import GradientBoostingRegressor

# ساخت مدل
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# آموزش مدل
gb_model.fit(X_train, y_train)

# پیش‌بینی روی تست
gb_preds = gb_model.predict(X_test)

# محاسبه RMSE
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_preds))
print(f"Gradient Boosting RMSE: {gb_rmse:.4f}")


Gradient Boosting RMSE: 1.0472


In [11]:
print("مدل بهتر:", "Random Forest" if rf_rmse < gb_rmse else "Gradient Boosting")

مدل بهتر: Gradient Boosting


In [12]:
user_id = 42

In [14]:
# همه فیلم‌ها
all_movie_ids = df['item_id'].unique()

# فیلم‌هایی که کاربر دیده
seen_movie_ids = df[df['user_id'] == user_id]['item_id'].unique()

# فیلم‌های ندیده
unseen_movie_ids = list(set(all_movie_ids) - set(seen_movie_ids))

In [15]:
# ساخت دیتا برای پیش‌بینی
user_unseen_df = pd.DataFrame({
    'user_id': [user_id] * len(unseen_movie_ids),
    'item_id': unseen_movie_ids
})

In [16]:
# پیش‌بینی rating برای فیلم‌های ندیده
predicted_ratings = gb_model.predict(user_unseen_df)

# اضافه کردن پیش‌بینی به دیتافریم
user_unseen_df['predicted_rating'] = predicted_ratings

# گرفتن عنوان فیلم‌ها
movie_titles = movies[['item_id', 'title']]
user_unseen_df = user_unseen_df.merge(movie_titles, on='item_id')

# نمایش 5 فیلم پیشنهادی برتر
recommended_movies = user_unseen_df.sort_values(by='predicted_rating', ascending=False).head(5)

import pandas as pd
from IPython.display import display
display(recommended_movies[['title', 'predicted_rating']])

Unnamed: 0,title,predicted_rating
363,Casablanca (1942),4.052309
359,"Philadelphia Story, The (1940)",4.052309
360,North by Northwest (1959),4.052309
364,"Maltese Falcon, The (1941)",4.052309
361,"Apartment, The (1960)",4.052309


In [17]:
movies.columns

Index(['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
       'genre_0', 'genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5',
       'genre_6', 'genre_7', 'genre_8', 'genre_9', 'genre_10', 'genre_11',
       'genre_12', 'genre_13', 'genre_14', 'genre_15', 'genre_16', 'genre_17',
       'genre_18'],
      dtype='object')

In [18]:
# انتخاب ستون‌های ژانر
genre_cols = [col for col in movies.columns if col.startswith('genre_')]

# اضافه کردن ژانرها به df اصلی
df = df.merge(movies[['item_id'] + genre_cols], on='item_id')

In [19]:
df.columns

Index(['user_id', 'item_id', 'rating', 'timestamp', 'title', 'genre_0',
       'genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5', 'genre_6',
       'genre_7', 'genre_8', 'genre_9', 'genre_10', 'genre_11', 'genre_12',
       'genre_13', 'genre_14', 'genre_15', 'genre_16', 'genre_17', 'genre_18'],
      dtype='object')

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ساخت TF-IDF برای عنوان فیلم‌ها
tfidf = TfidfVectorizer(max_features=20)  # برای سادگی
tfidf_matrix = tfidf.fit_transform(movies['title'])

print(tfidf_matrix)

# ساخت دیتافریم از TF-IDF
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])])
tfidf_df['item_id'] = movies['item_id'].values

# merge با df
df = df.merge(tfidf_df, on='item_id')

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2267 stored elements and shape (1682, 20)>
  Coords	Values
  (0, 5)	1.0
  (1, 5)	1.0
  (2, 5)	1.0
  (3, 5)	1.0
  (4, 5)	1.0
  (5, 5)	1.0
  (6, 5)	1.0
  (7, 5)	1.0
  (8, 5)	0.48717819120743366
  (8, 14)	0.873302587887986
  (9, 5)	1.0
  (10, 5)	1.0
  (11, 5)	0.7680788870163656
  (11, 18)	0.640355232132682
  (12, 5)	1.0
  (13, 4)	1.0
  (14, 5)	1.0
  (15, 5)	1.0
  (16, 6)	1.0
  (17, 5)	0.7680788870163656
  (17, 18)	0.640355232132682
  (18, 5)	1.0
  (19, 5)	0.5434834076525259
  (19, 9)	0.8394199101798804
  (20, 6)	1.0
  :	:
  (1662, 5)	1.0
  (1663, 11)	0.8101980982189867
  (1663, 7)	0.5861561580691081
  (1664, 7)	1.0
  (1665, 6)	1.0
  (1666, 5)	0.7680788870163656
  (1666, 18)	0.640355232132682
  (1667, 6)	1.0
  (1668, 6)	0.5425425186746506
  (1668, 9)	0.8400283420398187
  (1669, 8)	1.0
  (1670, 6)	1.0
  (1671, 3)	1.0
  (1672, 5)	1.0
  (1674, 18)	0.641283266462301
  (1674, 6)	0.7673042239916587
  (1675, 18)	0.641283266462301
  (16

In [21]:
feature_cols = ['user_id', 'item_id'] + genre_cols + [col for col in df.columns if col.startswith('tfidf_')]
feature_cols

['user_id',
 'item_id',
 'genre_0',
 'genre_1',
 'genre_2',
 'genre_3',
 'genre_4',
 'genre_5',
 'genre_6',
 'genre_7',
 'genre_8',
 'genre_9',
 'genre_10',
 'genre_11',
 'genre_12',
 'genre_13',
 'genre_14',
 'genre_15',
 'genre_16',
 'genre_17',
 'genre_18',
 'tfidf_0',
 'tfidf_1',
 'tfidf_2',
 'tfidf_3',
 'tfidf_4',
 'tfidf_5',
 'tfidf_6',
 'tfidf_7',
 'tfidf_8',
 'tfidf_9',
 'tfidf_10',
 'tfidf_11',
 'tfidf_12',
 'tfidf_13',
 'tfidf_14',
 'tfidf_15',
 'tfidf_16',
 'tfidf_17',
 'tfidf_18',
 'tfidf_19']

In [22]:
# ساخت X با ویژگی‌های کامل‌تر
X = df[feature_cols]
y = df['rating']

# تقسیم آموزش و تست
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# آموزش مدل مجدد با Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# ارزیابی
from sklearn.metrics import mean_squared_error
import numpy as np
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"Hybrid Model RMSE: {rmse:.4f}")

Hybrid Model RMSE: 1.0422


In [23]:
!pip install -U sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [25]:
from sentence_transformers import SentenceTransformer
import pandas as pd

# بارگذاری مدل
model = SentenceTransformer('all-MiniLM-L6-v2')  # سریع و سبک

# تولید embedding برای عناوین فیلم‌ها
movie_embeddings = model.encode(movies['title'].tolist(), show_progress_bar=True)

# ساخت دیتافریم از embeddingها
embed_df = pd.DataFrame(movie_embeddings, columns=[f'sbert_{i}' for i in range(movie_embeddings.shape[1])])
embed_df['item_id'] = movies['item_id'].values

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [27]:
len(embed_df.columns)

385

In [28]:
embed_df.columns

Index(['sbert_0', 'sbert_1', 'sbert_2', 'sbert_3', 'sbert_4', 'sbert_5',
       'sbert_6', 'sbert_7', 'sbert_8', 'sbert_9',
       ...
       'sbert_375', 'sbert_376', 'sbert_377', 'sbert_378', 'sbert_379',
       'sbert_380', 'sbert_381', 'sbert_382', 'sbert_383', 'item_id'],
      dtype='object', length=385)

In [26]:
movies['title'].tolist()[:5]

['Toy Story (1995)',
 'GoldenEye (1995)',
 'Four Rooms (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)']

In [29]:
# merge با df نهایی
df = df.merge(embed_df, on='item_id')

In [30]:
df.columns

Index(['user_id', 'item_id', 'rating', 'timestamp', 'title', 'genre_0',
       'genre_1', 'genre_2', 'genre_3', 'genre_4',
       ...
       'sbert_374', 'sbert_375', 'sbert_376', 'sbert_377', 'sbert_378',
       'sbert_379', 'sbert_380', 'sbert_381', 'sbert_382', 'sbert_383'],
      dtype='object', length=428)

In [31]:
# آماده‌سازی مجدد X, y
sbert_cols = [col for col in df.columns if col.startswith('sbert_')]
X = df[['user_id', 'item_id'] + genre_cols + sbert_cols]
y = df['rating']

# تقسیم‌بندی آموزش و تست
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# آموزش مدل
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# ارزیابی
from sklearn.metrics import mean_squared_error
import numpy as np
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"Hybrid + SBERT Model RMSE: {rmse:.4f}")

Hybrid + SBERT Model RMSE: 1.0217


In [32]:
user_id = 42  # کاربر با آی دی 42

# همه آیتم‌ها
all_movie_ids = df['item_id'].unique()

# آیتم‌هایی که کاربر دیده
seen = df[df['user_id'] == user_id]['item_id'].unique()

# آیتم‌هایی که ندیده
unseen = list(set(all_movie_ids) - set(seen))

In [33]:
# فیلتر embeddingها و ژانرها برای آیتم‌های ندیده
item_features = df.drop_duplicates('item_id')[['item_id'] + genre_cols + sbert_cols]
unseen_features = item_features[item_features['item_id'].isin(unseen)]

# ساخت user_id ثابت برای هر ردیف (چون مدل به user_id نیاز داره)
unseen_features = unseen_features.copy()
unseen_features['user_id'] = user_id

# تنظیم ترتیب ستون‌ها
cols = ['user_id', 'item_id'] + genre_cols + sbert_cols
unseen_features = unseen_features[cols]

In [34]:
# پیش‌بینی
unseen_features = unseen_features.copy()
unseen_features['predicted_rating'] = model.predict(unseen_features)

# اتصال عنوان فیلم
recommend_df = unseen_features.merge(movies[['item_id', 'title']], on='item_id')

# نمایش 5 فیلم پیشنهادی برتر
top_recommendations = recommend_df.sort_values(by='predicted_rating', ascending=False).head(5)

from IPython.display import display
display(top_recommendations[['title', 'predicted_rating']])

Unnamed: 0,title,predicted_rating
112,Casablanca (1942),4.424391
177,Citizen Kane (1941),4.280847
124,"Godfather, The (1972)",4.257985
259,"Godfather: Part II, The (1974)",4.198617
406,Lawrence of Arabia (1962),4.171579
