In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

In [83]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [84]:
# load the data
train = import_data('C:/Users/Admin/Documents/EXPLORE/5.Unsupervised Learning/Predict/edsa-movie-recommendation-2022/train.csv')
test = import_data('C:/Users/Admin/Documents/EXPLORE/5.Unsupervised Learning/Predict/edsa-movie-recommendation-2022/test.csv')
genome_scores = import_data('C:/Users/Admin/Documents/EXPLORE/5.Unsupervised Learning/Predict/edsa-movie-recommendation-2022/genome_scores.csv')
genome_tags = import_data('C:/Users/Admin/Documents/EXPLORE/5.Unsupervised Learning/Predict/edsa-movie-recommendation-2022/tags.csv')
imdb_data = import_data('C:/Users/Admin/Documents/EXPLORE/5.Unsupervised Learning/Predict/edsa-movie-recommendation-2022/imdb_data.csv')
links = import_data('C:/Users/Admin/Documents/EXPLORE/5.Unsupervised Learning/Predict/edsa-movie-recommendation-2022/links.csv')
movies = import_data('C:/Users/Admin/Documents/EXPLORE/5.Unsupervised Learning/Predict/edsa-movie-recommendation-2022/movies.csv')
tags = import_data('C:/Users/Admin/Documents/EXPLORE/5.Unsupervised Learning/Predict/edsa-movie-recommendation-2022/tags.csv')

Memory usage of dataframe is 305.18 MB
Memory usage after optimization is: 133.52 MB
Decreased by 56.2%
Memory usage of dataframe is 76.29 MB
Memory usage after optimization is: 38.15 MB
Decreased by 50.0%
Memory usage of dataframe is 356.70 MB
Memory usage after optimization is: 118.90 MB
Decreased by 66.7%
Memory usage of dataframe is 33.37 MB
Memory usage after optimization is: 25.03 MB
Decreased by 25.0%
Memory usage of dataframe is 1.25 MB
Memory usage after optimization is: 1.14 MB
Decreased by 8.3%
Memory usage of dataframe is 1.43 MB
Memory usage after optimization is: 0.71 MB
Decreased by 50.0%
Memory usage of dataframe is 1.43 MB
Memory usage after optimization is: 1.19 MB
Decreased by 16.7%
Memory usage of dataframe is 33.37 MB
Memory usage after optimization is: 25.03 MB
Decreased by 25.0%


In [85]:
train.head()# merge

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [86]:
test.head() # merge

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [87]:
movies.head() #merge

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [88]:
imdb_data.head()# merge

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion


In [89]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [90]:
tags.head() # merge

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [91]:
genome_scores.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.028748
1,1,2,0.023743
2,1,3,0.0625
3,1,4,0.075745
4,1,5,0.140747


In [92]:
genome_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [93]:
print(genome_tags.shape)
print(tags.shape)

(1093360, 4)
(1093360, 4)


In [94]:
def merges(df):
    df = df.merge(movies, on = 'movieId', how= 'left')
    df = df.merge(imdb_data, on = 'movieId', how= 'left')
    df.drop(columns=['timestamp', 'runtime', 'budget'
                    ], inplace= True)
    return df        

In [95]:
df1 = merges(train)

In [96]:
df2 = df1.fillna("")
df2

Unnamed: 0,userId,movieId,rating,title,genres,title_cast,director,plot_keywords
0,5163,57669,4.0,In Bruges (2008),Comedy|Crime|Drama|Thriller,Elizabeth Berrington|Rudy Blomme|Olivier Bonjo...,Martin McDonagh,dwarf|bruges|irish|hitman
1,106343,5,4.5,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,fatherhood|doberman|dog|mansion
2,146790,5459,5.0,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi,Tommy Lee Jones|Will Smith|Rip Torn|Lara Flynn...,Lowell Cunningham,lingerie|michael jackson character|shorthaired...
3,106362,32296,2.0,Miss Congeniality 2: Armed and Fabulous (2005),Adventure|Comedy|Crime,Sandra Bullock|Regina King|Enrique Murciano|Wi...,Marc Lawrence,female protagonist|cleave gag|good woman|fbi
4,9041,366,3.0,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama|Horror|Mystery|Thriller,Jeff Davis|Heather Langenkamp|Miko Hughes|Matt...,Wes Craven,freddy krueger|elm street|famous director as h...
...,...,...,...,...,...,...,...,...
10000033,136395,99114,5.0,Django Unchained (2012),Action|Drama|Western,Jamie Foxx|Christoph Waltz|Leonardo DiCaprio|K...,Quentin Tarantino,racial vengeance|racial violence|historically ...
10000034,140078,553,3.0,Tombstone (1993),Action|Drama|Western,Kurt Russell|Val Kilmer|Sam Elliott|Bill Paxto...,Kevin Jarre,wyatt earp character|two gun holster|double gu...
10000035,154807,56782,4.0,There Will Be Blood (2007),Drama|Western,Daniel Day-Lewis|Martin Stringer|Matthew Brade...,Paul Thomas Anderson,oil|misanthrope|loss of hearing|false prophet
10000036,85805,327,4.0,Tank Girl (1995),Action|Comedy|Sci-Fi,Lori Petty|Ice-T|Naomi Watts|Don Harvey|Jeff K...,Alan Martin,desert|tank|21st century|2030s


In [79]:
columns = ['title','genres','title_cast','director','plot_keywords']

In [97]:
df2[columns].isnull().values.any()

False

In [99]:
df2[columns].isnull().sum()

title            0
genres           0
title_cast       0
director         0
plot_keywords    0
dtype: int64

In [106]:
# Merge the columns listed above into a new column named key_words fot the train data
def get_key_col(df):
    df['key_words'] = (pd.Series(df[['title', 'title_cast', 'director', 'plot_keywords', 'genres']].fillna('')
                      .values.tolist()).str.join(' '))
    return df

In [107]:
df2 = get_key_col(df2)

In [108]:
import re

def clean_feat(title):
    title = re.sub("[^a-zA-Z0-9 ]", " ", title)
    return title

In [109]:
df2["cleaned_key_feat"] = df2["key_words"].apply(clean_feat)

In [112]:
df2

Unnamed: 0,userId,movieId,rating,title,genres,title_cast,director,plot_keywords,key_words,cleaned_key_feat
0,5163,57669,4.0,In Bruges (2008),Comedy|Crime|Drama|Thriller,Elizabeth Berrington|Rudy Blomme|Olivier Bonjo...,Martin McDonagh,dwarf|bruges|irish|hitman,In Bruges (2008) Elizabeth Berrington|Rudy Blo...,In Bruges 2008 Elizabeth Berrington Rudy Blo...
1,106343,5,4.5,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,fatherhood|doberman|dog|mansion,Father of the Bride Part II (1995) Steve Marti...,Father of the Bride Part II 1995 Steve Marti...
2,146790,5459,5.0,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi,Tommy Lee Jones|Will Smith|Rip Torn|Lara Flynn...,Lowell Cunningham,lingerie|michael jackson character|shorthaired...,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Men in Black II a k a MIIB a k a MIB 2 ...
3,106362,32296,2.0,Miss Congeniality 2: Armed and Fabulous (2005),Adventure|Comedy|Crime,Sandra Bullock|Regina King|Enrique Murciano|Wi...,Marc Lawrence,female protagonist|cleave gag|good woman|fbi,Miss Congeniality 2: Armed and Fabulous (2005)...,Miss Congeniality 2 Armed and Fabulous 2005 ...
4,9041,366,3.0,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama|Horror|Mystery|Thriller,Jeff Davis|Heather Langenkamp|Miko Hughes|Matt...,Wes Craven,freddy krueger|elm street|famous director as h...,Wes Craven's New Nightmare (Nightmare on Elm S...,Wes Craven s New Nightmare Nightmare on Elm S...
...,...,...,...,...,...,...,...,...,...,...
10000033,136395,99114,5.0,Django Unchained (2012),Action|Drama|Western,Jamie Foxx|Christoph Waltz|Leonardo DiCaprio|K...,Quentin Tarantino,racial vengeance|racial violence|historically ...,Django Unchained (2012) Jamie Foxx|Christoph W...,Django Unchained 2012 Jamie Foxx Christoph W...
10000034,140078,553,3.0,Tombstone (1993),Action|Drama|Western,Kurt Russell|Val Kilmer|Sam Elliott|Bill Paxto...,Kevin Jarre,wyatt earp character|two gun holster|double gu...,Tombstone (1993) Kurt Russell|Val Kilmer|Sam E...,Tombstone 1993 Kurt Russell Val Kilmer Sam E...
10000035,154807,56782,4.0,There Will Be Blood (2007),Drama|Western,Daniel Day-Lewis|Martin Stringer|Matthew Brade...,Paul Thomas Anderson,oil|misanthrope|loss of hearing|false prophet,There Will Be Blood (2007) Daniel Day-Lewis|Ma...,There Will Be Blood 2007 Daniel Day Lewis Ma...
10000036,85805,327,4.0,Tank Girl (1995),Action|Comedy|Sci-Fi,Lori Petty|Ice-T|Naomi Watts|Don Harvey|Jeff K...,Alan Martin,desert|tank|21st century|2030s,Tank Girl (1995) Lori Petty|Ice-T|Naomi Watts|...,Tank Girl 1995 Lori Petty Ice T Naomi Watts ...


In [113]:
'''
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cm = CountVectorizer().fit_transform(df2["cleaned_key_feat"])
'''


MemoryError: 

In [None]:
X = df2["cleaned_key_feat"]

In [None]:
'''
cscm = cosine_similarity(cm)
'''

## COLLABORATIVE FILTERING

In [None]:
# Drop timestamp column for the train dataset
ratings = train.drop(columns='timestamp')
reader = Reader()
data = Dataset.load_from_df(ratings, reader)

In [None]:
svd = SVD()

trainset = data.build_full_trainset()

# Train the SVD model
svd.fit(trainset)


In [None]:
 # Extract the userIds and corresponding movieIds into a python list
userId = test['userId'].values.tolist()
movieId = test['movieId'].values.tolist()

# Create a range of numbers which is the size of list created above
count = len(userId)
rating = []
ids = []

# Make rating predictions with the corresponding userIds and movieIds, accessed through indexing
for i in range(count):
    result = svd.predict(userId[i], movieId[i])[3]
    
    # Store the predicted rating
    rating.append(result)
    
    # Prepare, save userId and movieId for submission
    ids.append(str(userId[i]) + '_' + str(movieId[i]))

In [None]:
# Convert ids and ratings to dataframe
colab_df =  pd.DataFrame({
        'Id': ids,
        'rating': rating
         })
colab_df.head()

In [None]:
# Convert dataframe to csv file for submission.
colab_df.to_csv('sub1_ES4.csv', index=False)

In [None]:
param_grid = {'n_epochs':[20,40], 
              'n_factors':[400,600], 
              'init_std_dev':[0.01, 0.005]} 
grid_SVD = GridSearchCV(SVD, cv=5, measures=['rmse'], param_grid=param_grid, n_jobs=-1, joblib_verbose=10)
grid_SVD.fit(data)
print('Best Score is: ')
print(grid_SVD.best_score['rmse'])
print('Best Parameters are: ')
print(grid_SVD.best_params['rmse'])

In [None]:
trainset = data.build_full_trainset()
svd = SVD(n_epochs = 40, n_factors = 400, init_std_dev = 0.005, random_state= 42, verbose=True)
svd.fit(trainset)