In [68]:
import numpy as np
import pandas as pd
import csv
import io
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import cv2                  
from tqdm import tqdm
import os  
import _thread                 
from random import shuffle
import progressbar as pb 

class Matrix_Factorization():

    def __init__(self,data,features):
        
        self.data = data
        self.features = features
        self.bias_users = data.shape[0]
        self.bias_items = data.shape[1]
        self.Sample1 = np.random.uniform(low=0.1,high=0.9,size=(self.bias_users,self.features))
        self.Sample2 = np.random.uniform(low=0.1,high=0.9,size=(self.features,self.bias_items))
        self.State = [[],[]]
        self.count = 0
       
        
   
    def gradient_descent_step(self,row_user,column_item,user_index=None,item_index=None):
        row = self.Sample1[row_user,:]
        col = self.Sample2[:,column_item]
        rank = float(self.data[row_user,column_item])
        next = float(np.dot(row,col))

        if user_index != None:
            row_element = float(col[user_index]) 
            gradient = 2*(rank - next)*row_element
        else:
            column_element = float(row[item_index])
            gradient = 2*(rank - next)*column_element
        return gradient
                
  
    def train(self,learning_rate=0.1,iterations=1000):   
        for c in range(iterations):
            for i in range(0,self.bias_users):
                for j in range(0,self.features):
                    self.Sample1[i,j] += learning_rate*(sum([self.gradient_descent_step(row_user=i,column_item=col,user_index=j) for col in range(0,self.bias_items)])/self.bias_items)
                    
            for i in range(0,self.features):
                for j in range(0,self.bias_items):
                    self.Sample2[i,j] += learning_rate*sum([self.gradient_descent_step(row_user=row,column_item=j,item_index=i) for row in range(0,self.bias_users)])/self.bias_users
                   
            np.save('drive/MyDrive/self.Sample1'+str(self.features),self.Sample1,True)
            np.save('drive/MyDrive/self.Sample2'+str(self.features),self.Sample2,True)
            self.State[0] = self.Sample1
            self.State[1] = self.Sample2
            self.count += 1

In [69]:
import numpy as np

data = np.array([[1,2,3],[1,2,3],[1,2,3],[1,2,3],[1,2,3],[1,2,3]]) 
fac = Matrix_Factorization(data = data, features = 2) 
fac.train(learning_rate=.1)
np.dot(fac.Sample1,fac.Sample2)

array([[0.99995607, 2.00000606, 3.00001061],
       [0.9999649 , 2.00000484, 3.00000848],
       [1.00009183, 1.99998734, 2.99997783],
       [0.99997479, 2.00000348, 3.00000609],
       [0.99993314, 2.00000922, 3.00001614],
       [1.00007644, 1.99998946, 2.99998155]])

In [None]:

def matrix_factorization(input_matrix_df, randon_latent_feature_matrices_users, randon_latent_feature_matrices_items, latent_features_dimension, steps=1000, learning_rate=0.0002, regularization_param=0.02):
    randon_latent_feature_matrices_items = randon_latent_feature_matrices_items.T
    # ipython shows a progressbar for steps with this
    for step in pb.progressbar(range(steps)):
        for i in range(len(input_matrix_df)):
            for j in range(len(input_matrix_df[i])):
                if input_matrix_df[i][j] > 0:
                    # using dot product from numpy
                    eij = input_matrix_df[i][j] - numpy.dot(randon_latent_feature_matrices_users[i,:],randon_latent_feature_matrices_items[:,j])
                    for latent_features_dimension in range(latent_features_dimension):
                        randon_latent_feature_matrices_users[i][latent_features_dimension] = randon_latent_feature_matrices_users[i][latent_features_dimension] + learning_rate * (2 * eij * randon_latent_feature_matrices_items[latent_features_dimension][j] - regularization_param * randon_latent_feature_matrices_users[i][latent_features_dimension])
                        randon_latent_feature_matrices_items[latent_features_dimension][j] = randon_latent_feature_matrices_items[latent_features_dimension][j] + learning_rate * (2 * eij * randon_latent_feature_matrices_users[i][latent_features_dimension] - regularization_param * randon_latent_feature_matrices_items[latent_features_dimension][j])
        e_input_matrix_df = numpy.dot(randon_latent_feature_matrices_users,randon_latent_feature_matrices_items)
        e = 0
        for i in range(len(input_matrix_df)):
            for j in range(len(input_matrix_df[i])):
                if input_matrix_df[i][j] > 0:
                    e = e + pow(input_matrix_df[i][j] - numpy.dot(randon_latent_feature_matrices_users[i,:],randon_latent_feature_matrices_items[:,j]), 2)
                    for latent_features_dimension in range(latent_features_dimension):
                        e = e + (regularization_param/2) * (pow(randon_latent_feature_matrices_users[i][latent_features_dimension],2) + pow(randon_latent_feature_matrices_items[latent_features_dimension][j],2))
        if e < 0.001:
            break
    return randon_latent_feature_matrices_users, randon_latent_feature_matrices_items.T

In [None]:
input_matrix_df = data
input_n = len(input_matrix_df)
input_m = len(input_matrix_df[0])
latent_features_dimension = 2

randon_latent_feature_matrices_users = np.random.rand(input_n,latent_features_dimension)
randon_latent_feature_matrices_items = np.random.rand(input_m,latent_features_dimension)
  
n_randon_latent_feature_matrices_users, n_randon_latent_feature_matrices_items = matrix_factorization(input_matrix_df, randon_latent_feature_matrices_users, randon_latent_feature_matrices_items, latent_features_dimension)
n_input_matrix_df = numpy.dot(n_randon_latent_feature_matrices_users, n_randon_latent_feature_matrices_items.T)
print(n_input_matrix_df)

100% (1000 of 1000) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


[[0.50354222 0.28554871 0.33663563]
 [0.01348589 0.01037226 0.01300307]
 [0.04376948 0.0517284  0.06863778]
 [0.47582769 0.3589173  0.44847363]
 [0.37397305 0.32626007 0.417115  ]
 [0.10255553 0.11640794 0.15380561]]


In [None]:
df = pd.read_csv('/content/drive/MyDrive/ML/Assignment_2/Fresh Dataset/ml-20m/ratings.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [None]:
df['userId'].unique()

array([     1,      2,      3, ..., 138491, 138492, 138493])

In [None]:
df.userId = df.userId.astype('category').cat.codes.values
df.movieId = df.movieId.astype('category').cat.codes.values

In [None]:
index=list(df['userId'].unique())
columns=list(df['movieId'].unique())
index=sorted(index)
columns=sorted(columns)
util_df=pd.pivot_table(data=df[:5000],values='rating',index='userId',columns='movieId')

In [None]:
util_df

movieId,0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,34,35,37,38,39,40,41,42,...,21881,21911,22015,22016,22069,22088,22090,22123,22124,22125,22162,22180,22473,22650,22652,22764,22801,22846,22900,22964,23081,23123,23239,23240,23297,23332,23369,23402,23434,23506,23520,23574,23719,23901,24202,24453,24608,24638,24828,25693
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
0,,3.5,,,,,,,,,,,,,,,,,,,,,,,,,,3.5,,,3.5,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,4.0,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,4.0,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,3.0,,,,4.0,,,,,,,,3.0,,,,,,,,,,,,,1.0,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,3.0,,,,,,,,,5.0,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,,3.0,,,,,,,,,,,,,4.5,,,2.5,,,,,,,,,,,,,,,,,,5.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
152,,,,,,,,,,,4.0,,,,,,,,,4.5,,,,4.5,,,,,,,,,,3.0,,4.5,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
153,,,,,,4.5,,,,4.0,,,,,,,,,,3.5,,,,,,,,,,,,,,,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
154,2.5,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,4.5,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
data = np.array(util_df.fillna(0))

In [None]:
data

In [None]:
input_matrix_df = data
input_n = len(input_matrix_df)
input_m = len(input_matrix_df[0])
latent_features_dimension = 2

randon_latent_feature_matrices_users = numpy.random.rand(input_n,latent_features_dimension)
randon_latent_feature_matrices_items = numpy.random.rand(input_m,latent_features_dimension)

Sample1, Sample2 = matrix_factorization(input_matrix_df, randon_latent_feature_matrices_users, randon_latent_feature_matrices_items, latent_features_dimension)
n_input_matrix_df = numpy.dot(Sample1, Sample2.T)

100% (1000 of 1000) |####################| Elapsed Time: 0:27:34 Time:  0:27:34


In [None]:
pd.DataFrame(n_input_matrix_df).to_csv('recommendation.csv')

In [None]:
pd.DataFrame(Sample1).to_csv('Sample1.csv')
pd.DataFrame(Sample2).to_csv('Sample2.csv')

In [None]:
pred = pd.read_csv('recommendation.csv')
movies = pd.read_csv('/content/drive/MyDrive/ML/Assignment_2/Fresh Dataset/ml-20m/movies.csv')


In [None]:
def Top_n(data, userID, n=27):
  data_type = data.to_numpy()[0][userID:]
  length = sorted(range(len(data_type)), key=lambda i: data_type[i])[-1*n:]
  movies_list = []
  for i in length:
    movies_list.append([movies['title'][i], movies['genres'][i]])
  return movies_list

In [None]:
Top_n(data= pred, userID= 1, n= 27)

[['You Can Count on Me (2000)', 'Drama|Romance'],
 ['Tora! Tora! Tora! (1970)', 'Action|Drama|War'],
 ['Meet the Parents (2000)', 'Comedy'],
 ['All Things Fair (Lust och fägring stor) (1995)', 'Drama|Romance|War'],
 ['Suburbans, The (1999)', 'Drama'],
 ['World Is Not Enough, The (1999)', 'Action|Adventure|Thriller'],
 ['Woo (1998)', 'Comedy|Romance'],
 ['Of Mice and Men (1992)', 'Drama'],
 ['Finding Forrester (2000)', 'Drama'],
 ['Little Princess, The (1939)', 'Children|Drama'],
 ['Halloween 5: The Revenge of Michael Myers (1989)', 'Horror'],
 ['Marnie (1964)', 'Drama|Mystery|Romance|Thriller'],
 ['Poltergeist III (1988)', 'Horror|Thriller'],
 ['Safe Men (1998)', 'Comedy'],
 ['Vampire in Brooklyn (1995)', 'Comedy|Horror|Romance'],
 ['Full Tilt Boogie (1997)', 'Documentary'],
 ['Highlander (1986)', 'Action|Adventure|Fantasy'],
 ['Halloween H20: 20 Years Later (Halloween 7: The Revenge of Laurie Strode) (1998)',
  'Horror|Thriller'],
 ['Steamboat Willie (1928)', 'Animation|Children|Comed