# Import Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# utilities
import numpy as np
import pandas as pd

#pre-processing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

#plotting
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline
sns.set()

from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/edsa-recommender-system-predict/links.csv
/kaggle/input/edsa-recommender-system-predict/train.csv
/kaggle/input/edsa-recommender-system-predict/genome_scores.csv
/kaggle/input/edsa-recommender-system-predict/tags.csv
/kaggle/input/edsa-recommender-system-predict/imdb_data.csv
/kaggle/input/edsa-recommender-system-predict/test.csv
/kaggle/input/edsa-recommender-system-predict/sample_submission.csv
/kaggle/input/edsa-recommender-system-predict/genome_tags.csv
/kaggle/input/edsa-recommender-system-predict/movies.csv


# Import Data

In [2]:
train_df = pd.read_csv(r"../input/edsa-recommender-system-predict/train.csv")
test_df =  pd.read_csv(r"../input/edsa-recommender-system-predict/test.csv")
scores = pd.read_csv(r"../input/edsa-recommender-system-predict/genome_scores.csv")
tags = pd.read_csv(r"../input/edsa-recommender-system-predict/genome_tags.csv")
imbd = pd.read_csv(r"../input/edsa-recommender-system-predict/imdb_data.csv")
links = pd.read_csv(r"../input/edsa-recommender-system-predict/links.csv")
movies = pd.read_csv(r"../input/edsa-recommender-system-predict/movies.csv")

In [3]:
tags.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [4]:
scores.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [5]:
train_df.sort_values('movieId')
movies.sort_values('movieId')
imbd.sort_values('movieId')

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion
...,...,...,...,...,...,...
27273,131254,Franz Dinda|Florian Lukas|Axel Stein|Kailas Ma...,Carsten Funke,85.0,,man wrapped in a towel|man wears a thong|male ...
27274,131256,Rick Kavanian|Axel Stein|Eva Habermann|Christo...,Matthias Dinter,83.0,"DEM5,800,000",ski|ski resort|ampersand in title|drink in title
27275,131258,Nam-gil Kim|Ye-jin Son|Hae-Jin Yoo|Kyeong-yeon...,Seong-il Cheon,130.0,,pirate|sword fight|korea|bandit
27276,131260,Martti Suosalo|Ilkka Koivula|Vexi Salmi|Riitta...,Timo Koivusalo,102.0,,friend|friendship|television show|restaurant


# Join DataFrames

Join the movies DataFrame with columns from imbd data.

In [6]:
movie_frames = [movies['movieId'], movies['title'], movies['genres'], imbd['title_cast'], imbd['director'], 
          imbd['runtime'], imbd['budget'], imbd['plot_keywords']]
joined_movies = pd.concat(movie_frames, axis = 1)
joined_movies.tail()

Unnamed: 0,movieId,title,genres,title_cast,director,runtime,budget,plot_keywords
62418,209157,We (2018),Drama,,,,,
62419,209159,Window of the Soul (2001),Documentary,,,,,
62420,209163,Bad Poems (2018),Comedy|Drama,,,,,
62421,209169,A Girl Thing (2001),(no genres listed),,,,,
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama,,,,,


In [7]:
import math

In [8]:
count_nan_in_df = movies.isnull().sum()
print (count_nan_in_df)

movieId    0
title      0
genres     0
dtype: int64


Combine the text features into a single text.

In [9]:
joined_movies['comb_text'] = (pd.Series(joined_movies[['title', 'genres', 'title_cast', 'director', 'budget', 'plot_keywords']]
                      .fillna('')
                      .values.tolist()).str.join(' '))

# Convienient indexes to between map book titles and indexes of 
# the books dataframe
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])
joined_movies.head()

Unnamed: 0,movieId,title,genres,title_cast,director,runtime,budget,plot_keywords,comb_text
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation,Toy Story (1995) Adventure|Animation|Children|...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game,Jumanji (1995) Adventure|Children|Fantasy Robi...
2,3,Grumpier Old Men (1995),Comedy|Romance,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry,Grumpier Old Men (1995) Comedy|Romance Walter ...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...,Waiting to Exhale (1995) Comedy|Drama|Romance ...
4,5,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion,Father of the Bride Part II (1995) Comedy Stev...


Explore the result.

In [10]:
joined_movies['comb_text'][0]

'Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wallace Shawn|John Ratzenberger|Annie Potts|John Morris|Erik von Detten|Laurie Metcalf|R. Lee Ermey|Sarah Freeman|Penn Jillette|Jack Angel|Spencer Aste John Lasseter $30,000,000 toy|rivalry|cowboy|cgi animation'

Clean the combined text data.

In [11]:
# Function to remove/replace unwanted text such as characters,URLs etc

def clean(text):
    text=text.replace("|", " ")
    text=text.replace("(",' ')
    text=text.replace(")",' ')
    text=text.lower()  # Lowercase tweet
    
    return text

In [12]:
joined_movies['comb_text']=joined_movies['comb_text'].apply(clean)
joined_movies['comb_text'][0]

'toy story  1995  adventure animation children comedy fantasy tom hanks tim allen don rickles jim varney wallace shawn john ratzenberger annie potts john morris erik von detten laurie metcalf r. lee ermey sarah freeman penn jillette jack angel spencer aste john lasseter $30,000,000 toy rivalry cowboy cgi animation'

In [13]:
joined_movies.shape

(62423, 9)

In [14]:
joined_movies['comb_text'].head()

0    toy story  1995  adventure animation children ...
1    jumanji  1995  adventure children fantasy robi...
2    grumpier old men  1995  comedy romance walter ...
3    waiting to exhale  1995  comedy drama romance ...
4    father of the bride part ii  1995  comedy stev...
Name: comb_text, dtype: object

Sample the joined movies data taking only 40% of the data.

In [15]:
#sample_movies = joined_movies.sample(frac =.40) 
#sample_movies.shape

(24969, 9)

In [16]:
# Convienient indexes to between map book titles and indexes of 
# the books dataframe
titles = joined_movies['title']
indices = pd.Series(joined_movies.index, index=joined_movies['title'])

TF-IDF vectorisation process

In [29]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2),
                     min_df=0, stop_words='english')

# Produce a feature matrix, where each row corresponds to a book,
# with TF-IDF features as columns 
tf_authTags_matrix = tf.fit_transform(joined_movies['comb_text'])

In [30]:
cosine_sim_authTags = cosine_similarity(tf_authTags_matrix, 
                                        tf_authTags_matrix)
print (cosine_sim_authTags.shape)

(24969, 24969)


In [31]:
cosine_sim_authTags[:5]

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 4.66317737e-04, ...,
        0.00000000e+00, 3.05565396e-02, 0.00000000e+00],
       [0.00000000e+00, 4.66317737e-04, 1.00000000e+00, ...,
        0.00000000e+00, 2.30560098e-03, 7.81257550e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [19]:
sample_train = train_df.sample(frac =.50) 
sample_train.shape

(2500010, 4)

In [20]:
df_merge = pd.merge(sample_train, joined_movies, how='left', on='movieId')
df_merge.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,title_cast,director,runtime,budget,plot_keywords,comb_text
0,41804,78039,5.0,1432070706,Blue Valentine (2010),Drama|Romance,Jeanne Balibar|Guillaume Depardieu|Bulle Ogier...,Jacques Rivette,137.0,,seduction game|unrequited love|married woman|s...,blue valentine 2010 drama romance jeanne bal...
1,20282,6773,5.0,1284690632,"Triplets of Belleville, The (Les triplettes de...",Animation|Comedy|Fantasy,Jamal Udin Torabi|Enayatullah|Imran Paracha|Hi...,Tony Grisoni,88.0,"GBP75,000",ox slaughter|actual animal killed|watching a g...,"triplets of belleville, the les triplettes de..."
2,152866,114552,3.0,1516455824,"Boxtrolls, The (2014)",Adventure|Animation|Children|Comedy|Fantasy,Fabio De Luigi|Diego Abatantuono|Fabrizio Bent...,Gabriele Salvatores,90.0,,screenwriter|alternate reality|funeral|nonline...,"boxtrolls, the 2014 adventure animation chil..."
3,9300,60069,4.0,1283972168,WALL·E (2008),Adventure|Animation|Children|Romance|Sci-Fi,,,,,,wall·e 2008 adventure animation children rom...
4,90788,1235,5.0,966037739,Harold and Maude (1971),Comedy|Drama|Romance,,,,,,harold and maude 1971 comedy drama romance


In [21]:
df_merge.drop(['runtime', 'budget', 'plot_keywords'], axis = 1, inplace = True)
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2500010 entries, 0 to 2500009
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   userId      int64  
 1   movieId     int64  
 2   rating      float64
 3   timestamp   int64  
 4   title       object 
 5   genres      object 
 6   title_cast  object 
 7   director    object 
 8   comb_text   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 190.7+ MB


In [22]:
# First time data load.
#movies = df_merge.copy()
#ratings = train_df.copy()

# Organise a bit and store into feather-format
#movies.sort_values(by='movieId', inplace=True)
#movies.reset_index(inplace=True, drop=True)

# Categorize movies genres properly. Working later with +20MM rows of strings proved very resource consuming
#genres_unique = pd.DataFrame(movies['genres'].str.split('|').tolist()).stack().unique()
#genres_unique = pd.DataFrame(genres_unique, columns=['genre']) # Format into DataFrame to store later
#genres_unique.head()


Unnamed: 0,genre
0,Adventure
1,Animation
2,Children
3,Comedy
4,Fantasy


# Content filtering

In [37]:
def content_generate_rating_estimate(book_title, user, rating_data, k=20, threshold=0.0):
    # Convert the book title to a numeric index for our 
    # similarity matrix
    b_idx = indices[book_title]
    neighbors = [] # <-- Stores our collection of similarity values 
     
    # Gather the similarity ratings between each book the user has rated
    # and the reference book 
    for index, row in rating_data[rating_data['userId']==user].iterrows():
        sim = cosine_sim_authTags[b_idx-1, indices[row['title']]-1]
        neighbors.append((sim, row['rating']))
    # Select the top-N values from our collection
    k_neighbors = heapq.nlargest(k, neighbors, key=lambda t: t[0])

    # Compute the weighted average using similarity scoress and 
    # user item ratings. 
    simTotal, weightedSum = 0, 0
    for (simScore, rating) in k_neighbors:
        # Ensure that similarity ratings are above a given threshold
        if (simScore > threshold):
            simTotal += simScore
            weightedSum += simScore * rating
    try:
        predictedRating = weightedSum / simTotal
    except ZeroDivisionError:
        # Cold-start problem - No ratings given by user. 
        # We use the average rating for the reference item as a proxy in this case 
        predictedRating = np.mean(rating_data[rating_data['title']==book_title]['rating'])
    return predictedRating

Merge the train data with the corresponding movie information.

In [24]:
# Subset of ratings from user 24006
df_merge[df_merge['userId'] == 24006][3:10]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,title_cast,director,comb_text
81552,24006,1283,4.0,1171830307,High Noon (1952),Drama|Western,,,high noon 1952 drama western
104880,24006,2058,2.0,1171746301,"Negotiator, The (1998)",Action|Crime|Drama|Mystery|Thriller,Rick Moranis|Marcia Strassman|Robert Oliveri|D...,Stuart Gordon,"negotiator, the 1998 action crime drama myst..."
133836,24006,5377,3.0,1171701101,About a Boy (2002),Comedy|Drama|Romance,Theresa Russell|Benjamin Mouton|Antonio Fargas...,Ken Russell,about a boy 2002 comedy drama romance theres...
164193,24006,2409,2.5,1171630137,Rocky II (1979),Action|Drama,,,rocky ii 1979 action drama
202599,24006,6721,3.0,1171830881,Once Upon a Time in China (Wong Fei Hung) (1991),Action|Adventure|Drama,Nicolas Cage|Sam Rockwell|Alison Lohman|Bruce ...,Eric Garcia,once upon a time in china wong fei hung 199...
367656,24006,1378,2.0,1171630226,Young Guns (1988),Action|Comedy|Western,,,young guns 1988 action comedy western
374963,24006,6281,2.0,1171629890,Phone Booth (2002),Drama|Thriller,Victor Rasuk|Donna Maldonado|Kevin Rivera|Krys...,Peter Sollett,phone booth 2002 drama thriller victor rasuk...


In [25]:
# Libraries used during sorting procedures.
import operator # <-- Convienient item retrieval during iteration 
import heapq # <-- Efficient sorting of large lists

In [38]:
title = "High Noon (1952)"
actual_rating = df_merge[(df_merge['userId'] == 24006) & (df_merge['title'] == title)]['rating'].values[0]
pred_rating = content_generate_rating_estimate(book_title=title, user=24006, rating_data=df_merge)

In [39]:
print (f"Actual rating: \t\t {actual_rating}")
print (f"Predicted rating: \t {pred_rating}")

Actual rating: 		 4.0
Predicted rating: 	 3.7041499051267546


# Test Data

In [None]:
merged_test = pd.merge(test_df, joined_movies, how='left', on='movieId')
merged_test.head()

In [None]:
merged_test.drop(['genres', 'title_cast', 'director', 'runtime', 'budget', 'plot_keywords'], axis = 1, inplace = True)
merged_test.info()

In [None]:
merged_test.head(15)

In [None]:
# Subset of ratings from user 1
merged_test[merged_test['userId'] == 1][3:10]

This part gives problems!!!

In [None]:
title = "Lost in Translation (2003)"

pred_rating = content_generate_rating_estimate(book_title=title, user=1, rating_data=df_merge)

In [None]:
pred_rating

Predict on test data gives problems as above!!

In [None]:
#test_pred = []
#for u in range(len(merged_test['userId'])):
    #print(merged_test['title'][u], merged_test['userId'][u])
 #   title = merged_test['title'][u]
  #  userId = merged_test['userId'][u]
   # pred_rating = content_generate_rating_estimate(book_title= title, user=userId, rating_data=df_merge)
    #print(title)
    #print(pred_rating)

In [None]:
#test_pred

In [None]:
#df_merge.head()

In [None]:
#df = pd.DataFrame({'All_movies' : df_merge['movieId'].nunique().cumsum()})
# Plot histogram for each individual genre
#for genre in genres_unique['genre']:
 #   df_merge = movies[movies[genre]][['movieId']]
  #  df[genre]=df_merge.movieId.nunique().cumsum()
#df.fillna(method='ffill', inplace=True)

In [None]:
import surprise

In [None]:
#train_sample = train_df.sample(frac =.40)

In [None]:
reader = surprise.Reader(rating_scale = (0.5, 5.0))
data = surprise.Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)

In [None]:
# Model to train
algo_svd = surprise.SVD()
algo_mf_svd_pp = surprise.prediction_algorithms.matrix_factorization.SVDpp
algo_knn = surprise.KNNBaseline()

In [None]:
# Run 3-fold cross-validation and print results for MF SVDpp.
surprise.model_selection.cross_validate(algo_mf_svd_pp, data, measures=['RMSE'], cv=3, verbose=True, n_jobs=-1)


In [None]:
# Run 5-fold cross-validation and print results for KNN.
surprise.model_selection.cross_validate(algo_knn, data, measures=['RMSE'], cv=3, verbose=True, n_jobs=-1)

In [None]:
# Run 5-fold cross-validation and print results for SVD.
surprise.model_selection.cross_validate(algo_svd, data, measures=['RMSE'], cv=3, verbose=True, n_jobs=-1)

In [None]:
# Gridsearch for hyperparameters of best algo
surprise.model_selection.search.GridSearchCV(algo_class, param_grid, measures=[u'rmse'], cv=3, n_jobs=-1, joblib_verbose=True)

In [None]:
pre = []

for _, row in test_df.iterrows():
    x_unseen = algo_svdpp.predict(row['userId'], row['movieId'])
    pred = x_unseen[3]
    pre.append(pred)

In [None]:
test_df.head()

In [None]:
test_df['Id'] = test_df['userId'].map(str)+ "_" +test_df['movieId'].map(str)
test_df.head()

In [None]:
#Sellecting the index of the test dataframe
final_test= test_df["Id"]

In [None]:
#Creating the submission Dataframe
Final_Table = {'Id': final_test, 'rating':np.round(pre, 1)}
submission = pd.DataFrame(data=Final_Table)
submission = submission[['Id', 'rating']]

In [None]:
submission.to_csv("TestSubmission16.csv",index  = False) #wrting csv file

In [41]:
scores.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075
