In [0]:
# import necessary library
import pandas as pd
import numpy as np
import time
from google.cloud import storage
from google.colab import auth

In [0]:
# authenticate user, go to the link and copy the token, paste to the box under
auth.authenticate_user()

In [17]:
project_id = 'ie-project-ml'  # get the project ID from google console
!gcloud config set project {project_id}
!gsutil ls

Updated property [core/project].
gs://asia.artifacts.ie-project-ml.appspot.com/
gs://ie-project-ml.appspot.com/
gs://staging.ie-project-ml.appspot.com/


In [47]:
BUCKET_NAME = "ie-project-ml.appspot.com"     # change your bucket here
MOVIE_DATASETS = "movielen"                   # change your movie dataset folder here

# download all the file in the movielen
!gsutil -m cp -r gs://{BUCKET_NAME}/{MOVIE_DATASETS} .

Copying gs://ie-project-ml.appspot.com/movielen/links.csv...
Copying gs://ie-project-ml.appspot.com/movielen/README.txt...
/ [0/6 files][    0.0 B/  3.2 MiB]   0% Done                                    / [0/6 files][    0.0 B/  3.2 MiB]   0% Done                                    Copying gs://ie-project-ml.appspot.com/movielen/tags.csv...
Copying gs://ie-project-ml.appspot.com/movielen/ratings.csv...
Copying gs://ie-project-ml.appspot.com/movielen/movies.csv...


In [0]:
# change the folder of the datasets here
df_tags = pd.read_csv("movielen/tags.csv")
df_movies = pd.read_csv("movielen/movies.csv")
df_ratings = pd.read_csv("movielen/ratings.csv")
df_links = pd.read_csv("movielen/links.csv")

In [50]:
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [51]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [52]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [0]:
df_movies['genres'] = df_movies['genres'].apply(lambda x: x.split('|'))

In [0]:
df_tag_combine = df_tags.groupby(['movieId']).apply(lambda x: list(x['tag'])).reset_index().rename(columns={0:'tags'})

In [0]:
df_movies = pd.merge(df_movies, df_tag_combine, on='movieId',how='left')

In [56]:
df_movies.head()

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, fun]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[fantasy, magic board game, Robin Williams, game]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[moldy, old]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",
4,5,Father of the Bride Part II (1995),[Comedy],"[pregnancy, remake]"


In [0]:
df_movies['tags'] = df_movies['tags'].apply(lambda x: x if isinstance(x,list) else [])
df_movies['keywords'] = df_movies['genres'] + df_movies['tags']
df_movies['keywords'] = df_movies['keywords'].apply(lambda x : set([str.lower(i.replace(" ","")) for i in x]))
df_movies.set_index('movieId', inplace=True)

In [0]:
all_keywords = set()
for this_movie_keywords in df_movies['keywords']:
    all_keywords = all_keywords.union(this_movie_keywords)

The aim is to find the chief keyword of each movie. Mean the keyword that has the most predictive power to determine the meaning rating of the movies by all users

In [0]:
df_mxk = pd.DataFrame(0,index=df_movies.reset_index()['movieId'].unique(), columns=all_keywords)
df_mxk['rating'] = df_ratings.groupby('movieId')['rating'].mean()

for index, row in df_mxk.iterrows():
    df_mxk.loc[index,df_movies.loc[index]['keywords']] = 1
    
df_mxk['rating'].fillna(df_mxk['rating'].mean(), inplace=True)

# drop all the keyword that appear less than 5 movies
df_mxk = df_mxk.loc[:,df_mxk.sum() > 5]

In [62]:
# use decision tree to find the chief keyword of each movie
from sklearn.tree import DecisionTreeRegressor

reg = DecisionTreeRegressor(random_state=42)
X = df_mxk.drop('rating',axis=1).as_matrix()
y = df_mxk['rating'].as_matrix()

reg.fit(X,y)
keyword_scores = pd.Series(reg.feature_importances_, index = df_mxk.drop('rating',axis=1).columns)
keyword_frequency = df_mxk.sum(axis=0)

  after removing the cwd from sys.path.
  """


In [63]:
df_movies['chief_keyword'] = df_movies['keywords'].apply(lambda x: (keyword_scores[x]/keyword_frequency).idxmax())

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


Next, the aim is to find the similarity score between different chief keyword and then use it for finding similarity scores between movies that will then be used for content filtering. Similarity between scores have a very abstract meaning here, we are finding how much romance is similarity to war, or action to drama etc. For this, we use the technique introduce by Ted Dunning in this link 
http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html


1. First, create user cross keyword matrix where value in each cell is cumulative sum of the rating given by that user to that chief keyword (or the movie that has that chief keyword) across all movies rated by that user


In [0]:
all_chief_keywords = df_movies['chief_keyword'].unique()
df_uxk = pd.DataFrame(0, index=df_ratings['userId'].unique(), columns=all_chief_keywords)

In [0]:
for row in df_ratings.itertuples(index=True, name='Pandas'):
    this_movie_chief_keyword = df_movies.loc[getattr(row, 'movieId'), 'chief_keyword']
    this_user_this_movie_rating = getattr(row,'rating')
    this_user_id = getattr(row,'userId')
    df_uxk.loc[this_user_id, this_movie_chief_keyword] += this_user_this_movie_rating

create a co-rating matrix where value in each cell is the cumulative sum of the pair wise minimum of all keyword combinations for each user across all the users. It can best be understood by code:


In [0]:
nok = len(all_chief_keywords)
df_co_rating = pd.DataFrame(0, index=all_chief_keywords, columns=all_chief_keywords)

for index, row in df_uxk.iterrows():
    for i, first_keyword in enumerate(all_chief_keywords):
        for j in range(i+1, nok):
            second_keyword = all_chief_keywords[j]
            df_co_rating.loc[first_keyword, second_keyword] +=min(row[first_keyword],row[second_keyword])
            df_co_rating.loc[second_keyword,first_keyword] = df_co_rating.loc[first_keyword, second_keyword]

In [67]:
df_co_rating.head()

Unnamed: 0,animation,fantasy,romance,drama,pregnancy,action,remake,adventure,politics,horror,mafia,comedy,thriller,serialkiller,sci-fi,shakespeare,children,innetflixqueue,kidnapping,highschool,timetravel,animalmovie,twins,funny,england,journalism,wedding,twistending,prostitution,music,crime,war,documentary,quirky,mystery,musical,revenge,assassination,imax,superhero,...,religion,holocaust,predictable,disability,emotional,boxing,newyork,space,suspense,murder,worldwarii,anime,coenbrothers,mindfuck,police,beautiful,martialarts,vietnam,dark,family,business,drugs,dystopia,gritty,willferrell,magic,atmospheric,cinematography,intelligent,violence,depressing,philosophical,surreal,stylized,visuallyappealing,witty,socialcommentary,blackcomedy,(nogenreslisted),tense
animation,0.0,13520.5,7465.5,15861.5,161.0,8981.5,551.0,12083.5,682.0,5692.0,2106.0,10969.5,3550.5,224.5,13600.0,590.0,1602.5,1763.0,460.0,2057.0,3663.0,535.0,251.0,1949.5,1277.5,586.0,665.0,2309.0,718.0,1514.5,3043.0,7629.0,3450.0,2586.0,8681.0,5341.5,2006.5,1235.5,9439.5,2849.0,...,1028.0,56.5,532.0,82.0,471.0,908.5,825.0,671.5,724.0,313.0,919.0,1313.0,366.0,527.0,310.5,438.0,1812.5,8.5,414.5,115.0,109.0,486.0,274.0,20.5,676.0,2173.5,1053.5,199.5,417.5,262.0,89.5,20.5,404.5,860.5,875.5,700.0,155.0,70.5,159.5,43.0
fantasy,13520.5,0.0,8187.5,17407.5,160.0,10258.5,568.5,14169.5,740.0,6723.0,2419.0,12503.0,4011.5,239.0,16069.0,605.5,1607.5,1973.0,492.5,2198.5,3938.0,550.0,255.5,1990.0,1339.0,633.0,701.5,2524.0,785.0,1620.5,3244.5,9065.0,3524.5,2744.0,10400.5,5450.5,2114.5,1369.0,9368.0,2960.5,...,1066.5,58.5,554.0,85.0,522.0,971.0,885.0,678.5,790.5,324.5,1011.5,1296.5,388.0,554.0,332.0,431.5,1916.0,12.5,416.0,127.0,120.0,508.5,271.5,20.5,714.5,2224.5,1065.5,205.5,427.0,269.5,101.0,20.5,405.5,904.5,900.5,710.5,174.0,70.5,157.0,43.0
romance,7465.5,8187.5,0.0,9630.5,146.5,6611.5,542.0,7960.0,695.5,4333.5,1863.0,8117.0,3453.5,236.5,7793.5,588.0,1440.5,1777.5,441.0,2024.0,2947.5,497.5,249.0,1690.5,1225.0,562.5,682.5,2005.5,752.0,1477.5,2965.5,5797.0,2942.5,2248.5,6414.0,4674.0,1781.0,1180.5,5276.5,2332.5,...,944.5,55.5,463.5,80.0,448.0,874.5,839.5,600.5,596.0,296.0,832.5,980.0,367.5,484.0,292.0,356.0,1530.5,12.5,321.5,118.5,113.0,471.0,236.0,12.0,678.5,1695.5,909.5,192.5,398.5,251.0,94.5,17.0,368.0,676.0,709.5,651.0,172.0,57.5,124.0,30.5
drama,15861.5,17407.5,9630.5,0.0,167.5,12134.0,599.0,16720.0,767.0,7559.5,2762.0,16764.0,4360.5,253.5,22092.5,647.5,1626.0,2124.5,515.5,2267.0,4216.0,567.0,267.0,2099.5,1392.5,673.5,711.5,2819.0,797.0,1696.0,3374.0,11229.0,4594.0,2873.0,11568.5,5976.0,2291.0,1497.5,10744.0,3086.5,...,1119.5,58.5,563.5,92.0,586.0,1024.0,906.0,689.5,795.5,330.0,1080.5,1297.0,398.0,558.0,355.0,422.5,1998.0,12.5,421.0,131.5,120.0,535.5,268.5,20.5,739.5,2208.0,1124.0,214.0,445.5,307.5,101.0,20.5,424.0,885.0,912.5,741.5,176.0,65.5,151.0,39.0
pregnancy,161.0,160.0,146.5,167.5,0.0,159.0,117.5,166.5,88.0,126.0,88.5,163.0,143.0,45.5,167.5,57.0,88.5,65.0,69.0,88.5,150.5,77.5,60.0,77.0,90.5,78.0,70.5,101.0,115.5,122.0,112.0,130.5,71.5,74.0,153.5,125.5,113.5,94.0,124.0,110.5,...,61.0,15.5,41.0,25.0,28.5,45.0,64.0,51.0,44.0,43.5,54.5,36.0,43.0,40.5,31.0,34.0,61.5,3.5,23.0,23.0,10.0,32.5,10.0,0.0,50.0,65.0,54.5,32.0,39.5,16.0,11.5,4.5,30.5,54.5,46.0,33.5,14.0,8.0,8.0,2.0


In [0]:
df_co_rating.to_csv("movielen/df_co_rating.csv",sep=',',index=False)

Create a similarity matrix using Ted Dunning method

In [0]:
import scipy.stats

def sim_matrix(co):
    chief_keywords = co.columns
    df_sim = pd.DataFrame(index = co.index, columns = co.columns)
    f = co.sum()
    n = sum(f)
    
    for first_chief_keyword in chief_keywords:
        for second_chief_keyword in chief_keywords:
            k11 = co.loc[first_chief_keyword][second_chief_keyword]
            k12 = f[first_chief_keyword] - k11
            k21 = f[second_chief_keyword] - k11
            k22 = n - k12 - k21 + k11
            df_sim.loc[first_chief_keyword][second_chief_keyword], p, dof, expctd = scipy.stats.chi2_contingency([[k11,k12],[k21,k22]], lambda_='log-likelihood')
            if((k11/k21) < f[first_chief_keyword]/(n-f[first_chief_keyword])):
                df_sim.loc[first_chief_keyword][second_chief_keyword] = 0
    
    return df_sim
                    

In [0]:
df_sim_chief_keyword = sim_matrix(df_co_rating)

In [71]:
df_sim_chief_keyword.head()

Unnamed: 0,animation,fantasy,romance,drama,pregnancy,action,remake,adventure,politics,horror,mafia,comedy,thriller,serialkiller,sci-fi,shakespeare,children,innetflixqueue,kidnapping,highschool,timetravel,animalmovie,twins,funny,england,journalism,wedding,twistending,prostitution,music,crime,war,documentary,quirky,mystery,musical,revenge,assassination,imax,superhero,...,religion,holocaust,predictable,disability,emotional,boxing,newyork,space,suspense,murder,worldwarii,anime,coenbrothers,mindfuck,police,beautiful,martialarts,vietnam,dark,family,business,drugs,dystopia,gritty,willferrell,magic,atmospheric,cinematography,intelligent,violence,depressing,philosophical,surreal,stylized,visuallyappealing,witty,socialcommentary,blackcomedy,(nogenreslisted),tense
animation,0.0,4551.69,852.575,5963.23,0,1231.56,0.0,2926.95,0.0,258.231,0,2562.1,0.0,0.0,4001.97,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,494.068,42.1227,0,945.046,290.438,0.0,0.0,2566.37,0,...,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0
fantasy,4551.69,0.0,952.172,6635.36,0,1730.3,0.0,4361.04,0.0,551.844,0,3391.91,0.0,0.0,5998.83,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1005.71,6.312,0,1787.34,151.064,0.0,0.0,1779.48,0,...,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0
romance,852.575,952.172,0.0,1372.93,0,614.668,0.0,847.603,0.0,137.652,0,1459.91,61.2444,0.0,479.082,0.0,5.53997,0.1241,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,53.2463,275.287,102.152,0,458.67,507.63,0.0,0.0,219.362,0,...,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0
drama,5963.23,6635.36,1372.93,0.0,0,2478.26,0.0,5880.13,0.0,596.951,0,7515.08,0.0,0.0,13375.6,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1955.83,152.76,0,1849.11,109.853,0.0,0.0,2165.99,0,...,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0
pregnancy,0.0,0.0,0.0,0.0,0,0.0,128.777,0.0,40.4482,0.0,0,0.0,0.178691,38.3195,0.0,14.7936,10.0801,0.0,34.0021,0,1.09502,53.2539,73.4114,0,6.28345,39.6419,23.5914,0,86.0744,24.6353,0.0,0.0,0.0,0,0.0,0.0,2.86641,7.45168,0.0,0,...,0,13.9775,1.6716,28.4354,0,0,3.29825,1.77381,0.0207572,16.6333,0,0,7.84628,0.678246,2.32938,2.20459,0,2.88208,0.000473188,11.8489,0.00716569,0.037113,0,0,2.33555,0,0,13.1523,4.87644,0,1.91666,2.34363,0.657671,1.0866,0.00151688,0,0.278898,1.9386,0,0


In [73]:
!pip install scikit-surprise

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 4.1MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1678057 sha256=0c434b6e4ce03eea9fd37f19ae8f22232877e36a1cd8c6cc1c45dc6acac61770
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.0


In [74]:
from surprise import SVD, Reader, Dataset
from surprise.model_selection import cross_validate

reader = Reader()
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()

cross_validate(svd, data, measures=['RMSE','MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8857  0.8543  0.8733  0.8787  0.8752  0.8734  0.0105  
MAE (testset)     0.6798  0.6581  0.6678  0.6760  0.6726  0.6709  0.0075  
Fit time          5.69    5.21    5.10    5.12    4.97    5.22    0.25    
Test time         0.29    0.16    0.15    0.16    0.17    0.18    0.05    


{'fit_time': (5.690371036529541,
  5.211069583892822,
  5.09533166885376,
  5.117106914520264,
  4.974785327911377),
 'test_mae': array([0.67979487, 0.65807851, 0.66782064, 0.67602331, 0.67256605]),
 'test_rmse': array([0.88566288, 0.85428139, 0.87330833, 0.87874678, 0.87515921]),
 'test_time': (0.2872583866119385,
  0.16071224212646484,
  0.15024662017822266,
  0.1552143096923828,
  0.16922450065612793)}

In [75]:
def collaborative(userId):
    df_movies['est'] = df_movies.reset_index()['movieId'].apply(lambda x: svd.predict(userId,x).est)
    return df_movies.sort_values('est', ascending=False).head(10)
collaborative(1)

Unnamed: 0_level_0,title,genres,tags,keywords,chief_keyword,est
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
949,East of Eden (1955),[Drama],[],{drama},drama,5.0
924,2001: A Space Odyssey (1968),"[Adventure, Drama, Sci-Fi]","[Hal, space, aliens, apes, Arthur C. Clarke, a...","{relaxing, artificialintelligence, futuristic,...",imdbtop250,5.0
277,Miracle on 34th Street (1994),[Drama],[Christmas],"{christmas, drama}",christmas,5.0
692,Solo (1996),"[Action, Sci-Fi, Thriller]",[],"{thriller, action, sci-fi}",sci-fi,5.0
1616,"Peacemaker, The (1997)","[Action, Thriller, War]",[],"{thriller, action, war}",war,5.0
694,"Substitute, The (1996)","[Action, Crime, Drama]",[],"{crime, action, drama}",drama,5.0
6710,Dummy (2002),"[Comedy, Drama, Romance]",[vertriloquism],"{vertriloquism, romance, comedy, drama}",drama,5.0
905,It Happened One Night (1934),"[Comedy, Romance]",[Screwball],"{screwball, comedy, romance}",screwball,5.0
2372,Fletch Lives (1989),[Comedy],[],{comedy},comedy,5.0
585,"Brady Bunch Movie, The (1995)",[Comedy],[],{comedy},comedy,5.0


In [76]:
collaborative(1)

Unnamed: 0_level_0,title,genres,tags,keywords,chief_keyword,est
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
949,East of Eden (1955),[Drama],[],{drama},drama,5.0
924,2001: A Space Odyssey (1968),"[Adventure, Drama, Sci-Fi]","[Hal, space, aliens, apes, Arthur C. Clarke, a...","{relaxing, artificialintelligence, futuristic,...",imdbtop250,5.0
277,Miracle on 34th Street (1994),[Drama],[Christmas],"{christmas, drama}",christmas,5.0
692,Solo (1996),"[Action, Sci-Fi, Thriller]",[],"{thriller, action, sci-fi}",sci-fi,5.0
1616,"Peacemaker, The (1997)","[Action, Thriller, War]",[],"{thriller, action, war}",war,5.0
694,"Substitute, The (1996)","[Action, Crime, Drama]",[],"{crime, action, drama}",drama,5.0
6710,Dummy (2002),"[Comedy, Drama, Romance]",[vertriloquism],"{vertriloquism, romance, comedy, drama}",drama,5.0
905,It Happened One Night (1934),"[Comedy, Romance]",[Screwball],"{screwball, comedy, romance}",screwball,5.0
2372,Fletch Lives (1989),[Comedy],[],{comedy},comedy,5.0
585,"Brady Bunch Movie, The (1995)",[Comedy],[],{comedy},comedy,5.0


In [77]:
def hybrid(userId, title):
    title_to_id = df_movies.reset_index()[['movieId', 'title']].set_index('title')
    this_movie_id = title_to_id.loc[title]
    all_movieids = list(df_movies.index)
    sim_scores_series = pd.Series(0,index = all_movieids)
    for movieid in all_movieids:
        sim_scores_series.loc[movieid] = df_sim_chief_keyword.loc[df_movies.loc[this_movie_id,'chief_keyword'],df_movies.loc[movieid,'chief_keyword']].iloc[0]
        
    top_25_ids = sim_scores_series.sort_values(ascending=False)[:26].index
    df_movies_top25 = df_movies.loc[top_25_ids].reset_index()
    
    df_movies_top25['est'] = df_movies_top25['index'].apply(lambda x: svd.predict(userId,x).est)
    
    #Sort the movies in decreasing order of predicted rating
    df_movies_top25 = df_movies_top25.sort_values('est', ascending=False)
    
    #Return the top 10 movies as recommendations
    return df_movies_top25.head(10)

hybrid(2, 'Spider-Man (2002)')

Unnamed: 0,index,title,genres,tags,keywords,chief_keyword,est
4,1270,Back to the Future (1985),"[Adventure, Comedy, Sci-Fi]",[time travel],"{adventure, comedy, timetravel, sci-fi}",timetravel,4.114736
20,1214,Alien (1979),"[Horror, Sci-Fi]",[aliens],"{horror, aliens, sci-fi}",aliens,4.054838
0,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),"[Mystery, Sci-Fi, Thriller]","[time travel, time travel, Brad Pitt, Bruce Wi...","{timetravel, mystery, bradpitt, postapocalypti...",timetravel,4.054582
5,68358,Star Trek (2009),"[Action, Adventure, Sci-Fi, IMAX]","[future, lack of development, lack of story, q...","{lackofdevelopment, lackofstory, timetravel, f...",timetravel,4.038217
10,4571,Bill & Ted's Excellent Adventure (1989),"[Adventure, Comedy, Sci-Fi]",[time travel],"{adventure, comedy, timetravel, sci-fi}",timetravel,3.998952
15,1097,E.T. the Extra-Terrestrial (1982),"[Children, Drama, Sci-Fi]",[aliens],"{aliens, drama, children, sci-fi}",aliens,3.956974
6,589,Terminator 2: Judgment Day (1991),"[Action, Sci-Fi]","[apocalypse, Arnold Schwarzenegger, nuclear wa...","{scifimasterpiece, timetravel, suspense, actio...",timetravel,3.937041
1,1240,"Terminator, The (1984)","[Action, Sci-Fi, Thriller]","[Action, artificial intelligence, robots, Sci-...","{tense, specialeffects, timetravel, artificial...",timetravel,3.883662
14,1200,Aliens (1986),"[Action, Adventure, Horror, Sci-Fi]","[action, aliens, horror, sci-fi, space, space ...","{horror, suspense, adventure, action, space, s...",aliens,3.827339
19,1253,"Day the Earth Stood Still, The (1951)","[Drama, Sci-Fi, Thriller]",[aliens],"{thriller, aliens, drama, sci-fi}",aliens,3.792165


In [78]:
hybrid(2, 'Toy Story (1995)')

Unnamed: 0,index,title,genres,tags,keywords,chief_keyword,est
9,4061,The Man in the Moon (1991),"[Drama, Romance]",[],"{drama, romance}",drama,3.730516
0,7299,Monsieur Ibrahim (Monsieur Ibrahim et les fleu...,[Drama],[],{drama},drama,3.704767
3,4046,Friendly Persuasion (1956),[Drama],[Quakers],"{quakers, drama}",drama,3.698829
18,4067,Untamed Heart (1993),"[Drama, Romance]",[],"{drama, romance}",drama,3.685677
19,67734,Adventureland (2009),"[Comedy, Drama]",[],"{comedy, drama}",drama,3.681192
10,68194,"Damned United, The (2009)",[Drama],[],{drama},drama,3.67776
2,4043,At Close Range (1986),"[Crime, Drama]",[],"{crime, drama}",drama,3.630235
25,4041,"Officer and a Gentleman, An (1982)","[Drama, Romance]",[],"{drama, romance}",drama,3.621932
7,68269,"Young Victoria, The (2009)","[Drama, Romance]",[],"{drama, romance}",drama,3.617302
12,4062,Mystic Pizza (1988),"[Comedy, Drama, Romance]",[],"{romance, comedy, drama}",drama,3.597346


In [0]:
# merge the df_movie and df_link to get the final dataset
df_links = df_links[['movieId','imdbId']]
df_movies = df_movies.merge(df_links,left_on="movieId", right_on="movieId")

# write down df_movie_final to folder
df_movies.to_csv("movielen/df_movies_final.csv",sep=',',index=True)

In [0]:
# export the model to deploy
# first, write the df_sim_chief_keyword
import pickle
df_sim_chief_keyword.to_csv("movielen/df_sim_chief_keyword.csv", sep=',', index=False)
df_movies.to_csv("movielen/df_movies_final.csv",sep=',',index=True)

pickle.dump(svd,open('movielen/svd_prediction.pickle','wb'))

In [88]:
# put the data into bucket

TRAIN_FOLDER = 'movielen-train-model' # change your folder here

!gsutil cp movielen/df_sim_chief_keyword.csv gs://{BUCKET_NAME}/{TRAIN_FOLDER}
!gsutil cp movielen/df_movies_final.csv gs://{BUCKET_NAME}/{TRAIN_FOLDER}
!gsutil cp movielen/svd_prediction.pickle gs://{BUCKET_NAME}/{TRAIN_FOLDER}



Copying file://movielen/df_sim_chief_keyword.csv [Content-Type=text/csv]...
-
Operation completed over 1 objects/173.8 KiB.                                    
Copying file://movielen/df_movies_final.csv [Content-Type=text/csv]...
-
Operation completed over 1 objects/1.2 MiB.                                      
Copying file://movielen/svd_prediction.pickle [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/10.3 MiB.                                     
