# **Installation and unzipping and importing libraries**

In [2]:
!pip install pandas matplotlib seaborn



In [1]:
!unzip /content/ml-100k.zip

Archive:  /content/ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# **Evaluation function**

In [84]:
def user_user_matrix_calculator(dataset_number):
  path_to_datasets= "/content/ml-100k/"
  # Load user data
  users_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
  users = pd.read_csv(f'{path_to_datasets}u.user', sep='|', names=users_cols, encoding='latin-1')

  # Load ratings data
  ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
  ratings = pd.read_csv(f'{path_to_datasets}u{dataset_number}.base', sep='\t', names=ratings_cols, encoding='latin-1')

  # Load movies data
  movies_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL'] + ['genre_' + str(i) for i in range(19)]
  movies = pd.read_csv(f'{path_to_datasets}u.item', sep='|', names=movies_cols, encoding='latin-1', usecols=range(24))
  df = ratings.merge(movies, left_on='movie_id', right_on='movie_id', how='left')
  #Removing columns which will not be used
  del df['unix_timestamp']
  del df['release_date']
  del df['video_release_date']
  del df['IMDb_URL']
  for i in range(0,19):
    del df[f'genre_{i}']
  user_movie_matrix = pd.pivot_table(df, values = 'rating', index='movie_id', columns = 'user_id')
  user_movie_matrix = user_movie_matrix.fillna(0)
  user_user_matrix = user_movie_matrix.corr(method='pearson')
  return user_user_matrix , df

In [85]:
user_user_matrix , df = user_user_matrix_calculator(2)

In [86]:
user_user_matrix.loc[1][2]

0.05880474154975755

In [87]:
def initialize_user_movie_prediction_matrix():
  user_movie_prediction_matrix = [[]]
  for i in range(2000):
    e= []
    for j in range(2000):
      e.append(0)
    user_movie_prediction_matrix.append(e)
  return user_movie_prediction_matrix


In [88]:
from tqdm import tqdm
def calculate_user_movie_prediction_matrix(dataset_number):
  user_user_matrix , df = user_user_matrix_calculator(dataset_number)
  user_movie_prediction_matrix = initialize_user_movie_prediction_matrix()
  for user_id in tqdm(user_user_matrix.loc[0:0],total=943):
    df_user_id = pd.DataFrame(user_user_matrix.loc[user_id].sort_values(ascending=False))
    df_user_id = df_user_id.reset_index()
    df_user_id.columns = ['user_id', 'similarity']
    # we need to remove the user_is = 1 from the top of the list to not repeat the already watched movies
    df_user_id = df_user_id.drop((df_user_id[df_user_id['user_id'] == 1]).index)
    final_df = df_user_id.merge(df, left_on='user_id', right_on='user_id', how='left')
    for index, row in final_df.iterrows():
      if user_movie_prediction_matrix[user_id][row['movie_id']] == 0:
        user_movie_prediction_matrix[user_id][row['movie_id']] = row['rating']
  return user_movie_prediction_matrix
user_movie_prediction_matrix = calculate_user_movie_prediction_matrix(2)


  0%|          | 0/943 [00:00<?, ?it/s][A
  0%|          | 1/943 [00:04<1:05:18,  4.16s/it][A
  0%|          | 2/943 [00:07<58:18,  3.72s/it]  [A
  0%|          | 3/943 [00:10<55:54,  3.57s/it][A
  0%|          | 4/943 [00:15<1:03:06,  4.03s/it][A
  1%|          | 5/943 [00:19<1:00:39,  3.88s/it][A
  1%|          | 6/943 [00:22<58:02,  3.72s/it]  [A
  1%|          | 7/943 [00:26<56:14,  3.60s/it][A
  1%|          | 8/943 [00:31<1:02:41,  4.02s/it][A
  1%|          | 9/943 [00:34<59:31,  3.82s/it]  [A
  1%|          | 10/943 [00:37<57:28,  3.70s/it][A
  1%|          | 11/943 [00:41<57:51,  3.73s/it][A
  1%|▏         | 12/943 [00:46<1:01:35,  3.97s/it][A
  1%|▏         | 13/943 [00:49<58:47,  3.79s/it]  [A
  1%|▏         | 14/943 [00:52<56:51,  3.67s/it][A
  2%|▏         | 15/943 [00:57<59:39,  3.86s/it][A
  2%|▏         | 16/943 [01:01<1:00:29,  3.91s/it][A
  2%|▏         | 17/943 [01:04<57:51,  3.75s/it]  [A
  2%|▏         | 18/943 [01:07<56:10,  3.64s/it][A
  2%|▏ 

In [101]:
import csv
# Specify the CSV file path
file_path = f"user_movie_prediction_matrix{2}.csv"

# Open the file in write mode and specify newline='' to avoid extra line breaks
with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)

    # Write each row of the matrix as a separate row in the CSV file
    for row in user_movie_prediction_matrix:
        writer.writerow(row)

In [102]:
path_to_datasets= "/content/ml-100k/"
# Load ratings data
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_test = pd.read_csv(f'{path_to_datasets}u1.test', sep='\t', names=ratings_cols, encoding='latin-1')
ratings_test.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


In [103]:
import numpy as np
true_result=[]
predicted_result=[]
count =0
for index, row in ratings_test.iterrows():
  true_result.append(row['rating'])
  prediction = user_movie_prediction_matrix[row['user_id']][row['movie_id']]
  if prediction == 0:
    count +=1
    predicted_result.append(3)
  else:
    predicted_result.append(prediction)
# Convert the lists to NumPy arrays
true_result = np.array(true_result)
predicted_result = np.array(predicted_result)

# Calculate the Mean Squared Error (MSE)
mse = np.mean((true_result - predicted_result) ** 2)
print(count)
print(f"Mean Squared Error: {mse}")

0
Mean Squared Error: 0.00915


In [99]:
predicted_result[4], true_result[4]

(4, 3)

# **Evaluation function**

In [105]:
import csv
import sys
def evaluation(number):
  # Specify the CSV file path
  file_path = f"user_movie_prediction_matrix{number}.csv"

  # Initialize an empty list to store the loaded data
  user_movie_prediction_matrix = []

  # Open the file in read mode
  with open(file_path, mode='r', newline='') as file:
      reader = csv.reader(file)

      # Iterate through each row in the CSV file
      for row in reader:
          # Convert each element in the row to the appropriate data type (e.g., int or float)
          # Append the row to the loaded_data list
          loaded_row = [float(cell) for cell in row]
          user_movie_prediction_matrix.append(loaded_row)

  path_to_datasets= "/content/ml-100k/"
  # Load ratings data
  ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
  ratings_test = pd.read_csv(f'{path_to_datasets}u{number}.test', sep='\t', names=ratings_cols, encoding='latin-1')
  true_result=[]
  predicted_result=[]
  count =0
  for index, row in ratings_test.iterrows():
    true_result.append(row['rating'])
    prediction = user_movie_prediction_matrix[row['user_id']][row['movie_id']]
    if prediction == 0:
      count +=1
      predicted_result.append(3)
    else:
      predicted_result.append(prediction)
  # Convert the lists to NumPy arrays
  true_result = np.array(true_result)
  predicted_result = np.array(predicted_result)

  # Calculate the Mean Squared Error (MSE)
  mse = np.mean((true_result - predicted_result) ** 2)
  print(f"Mean Squared Error: {mse} of dataset {number}")
  return mse
if __name__ == "__main__":
  arguments = 2
  average = 0
  for i in range(1,arguments+1):
    average+=evaluation(i)
  average = average / arguments
  print(f"Average Mean Squered Error: {average} of all datasets")

Mean Squared Error: 1.79995 of dataset 1
Mean Squared Error: 1.79115 of dataset 2
Average Mean Squered Error: 1.79555 of all datasets
