In [25]:
import pandas as pd

# CBF: Generate similar movies for each movie in the dataframe

In [26]:
tags = pd.read_csv('movies_cbf.csv',
                   lineterminator='\n')

In [27]:
tags.head()

Unnamed: 0,movie_id,title,year,overview,tagline,tags,tags_full
0,1,toy story,1995,"Led by Woody, Andy's toys live happily in his ...",,pixar pixar pixar animation pixar animated fun...,pixar pixar pixar animation pixar animated fun...
1,2,jumanji,1995,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,for children game animals joe johnston robin w...,for children game animals joe johnston robin w...
2,3,grumpier old men,1995,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,funniest movies comedinha de velhinhos engraã ...,funniest movies comedinha de velhinhos engraã ...
3,4,waiting to exhale,1995,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,girl movie comedy drama comedy drama comedy dr...,girl movie comedy drama comedy drama comedy dr...
4,5,father of the bride part ii,1995,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,steve martin pregnancy remake steve martin fam...,steve martin pregcy remake steve martin family...


In [28]:
tags=tags.astype({'tags_full':'string'})

In [29]:
tags = tags.drop_duplicates(subset=['movie_id'])

In [30]:
tags.shape

(3883, 7)

In [31]:
tags.duplicated(subset=['title']).value_counts()

False    3841
True       42
dtype: int64

In [32]:
tags = tags.astype({'year': 'string'})
tags['title_and_year'] = tags['title'] + ' (' + tags['year'] + ')'
                

In [33]:
tags.duplicated(subset=['title_and_year']).value_counts()

False    3883
dtype: int64

In [34]:
tags.tail()

Unnamed: 0,movie_id,title,year,overview,tagline,tags,tags_full,title_and_year
3878,3948,meet the parents,2000,"Greg Focker is ready to marry his girlfriend, ...",First comes love. Then comes the interrogation.,ben stiller comedy hilarious owen wilson ben s...,ben stiller comedy hilarious owen wilson ben s...,meet the parents (2000)
3879,3949,requiem for a dream,2000,The hopes and dreams of four ambitious people ...,,ass to ass heroin psychology depressing drugs ...,ass to ass heroin psychology depressing drugs ...,requiem for a dream (2000)
3880,3950,tigerland,2000,A group of recruits go through Advanced Infant...,The system wanted them to become soldiers. One...,colin farrell drama drama drama drama drama,colin farrell drama drama drama drama drama a ...,tigerland (2000)
3881,3951,two family house,2000,Buddy Visalo (Michael Rispoli) is a factory wo...,The only way to find out what you love is to r...,in netflix queue in netflix queue r drama dram...,in netflix queue in netflix queue r drama dram...,two family house (2000)
3882,3952,"contender, the",2000,"The vice president is dead, and as the preside...",Sometimes you can assassinate a leader without...,politics vice president washington dc gary old...,politics vice president washington dc gary old...,"contender, the (2000)"


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
matrix = tf.fit_transform(tags['tags_full'])

cosine_similarities = linear_kernel(matrix,matrix)
movie_titles = tags['title_and_year']
indices = pd.Series(tags.index, index=tags['title_and_year'])

In [36]:
def recommend_movies(original_title):
    # recommends 50 movies 
    idx = indices[original_title]
    sim_scores = list(enumerate(cosine_similarities[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:51]

    movie_indices = [i[0] for i in sim_scores]
    titles = movie_titles.iloc[movie_indices]
    return titles, movie_indices

In [37]:
recommend_movies('toy story (1995)')

(3045                                toy story 2 (1999)
 2286                              bug's life, a (1998)
 2225                                       antz (1998)
 2285                         rugrats movie, the (1998)
 1050            aladdin and the king of thieves (1996)
 2009                           jungle book, the (1967)
 3685    adventures of rocky and bullwinkle, the (2000)
 2073        american tail: fievel goes west, an (1991)
 2072                          american tail, an (1986)
 3542                             saludos amigos (1943)
 2011                         lady and the tramp (1955)
 592                                   pinocchio (1940)
 584                                     aladdin (1992)
 3682                                chicken run (2000)
 12                                        balto (1995)
 2033                           steamboat willie (1940)
 1262                                   fantasia (1940)
 2012                        little mermaid, the

In [38]:
recommend_movies('clueless (1995)')

(16                           sense and sensibility (1995)
 827                                           emma (1996)
 27                                      persuasion (1995)
 3010                                mansfield park (1999)
 6                                          sabrina (1995)
 797     rendezvous in paris (rendez-vous de paris, les...
 1589                           smile like yours, a (1997)
 3759                               mad about mambo (2000)
 2                                 grumpier old men (1995)
 2400                         peggy sue got married (1986)
 1511                               trial and error (1997)
 3284                           closer you get, the (2000)
 2416                                she's all that (1999)
 3289                           defending your life (1991)
 1426                 beautician and the beast, the (1997)
 3028                   shop around the corner, the (1940)
 67                     french twist (gazon maudit) (199

In [39]:
similar_movies_cbf = tags.copy(deep=True)

In [40]:
similar_movies_cbf= similar_movies_cbf.drop(columns =['tags_full', 'tags', 'tagline', 'overview', 'year'])

In [41]:
similar_movies_cbf.head()

Unnamed: 0,movie_id,title,title_and_year
0,1,toy story,toy story (1995)
1,2,jumanji,jumanji (1995)
2,3,grumpier old men,grumpier old men (1995)
3,4,waiting to exhale,waiting to exhale (1995)
4,5,father of the bride part ii,father of the bride part ii (1995)


In [42]:
similar_movies_cbf['similar_movies'] = None

In [43]:
for index, row in similar_movies_cbf.iterrows():
    title = str(row['title_and_year'])
    t, m_i = recommend_movies(title)
    tags.reset_index() 
    similar_movies = []
    
    for i in m_i: 
        similar_movie = int(tags.at[i, 'movie_id'])
        similar_movies.append(similar_movie)
    
    similar_movies_cbf.at[index, 'similar_movies'] = similar_movies

In [44]:
similar_movies_cbf.tail()

Unnamed: 0,movie_id,title,title_and_year,similar_movies
3878,3948,meet the parents,meet the parents (2000),"[643, 651, 856, 2923, 111, 1228, 1263, 1001, 7..."
3879,3949,requiem for a dream,requiem for a dream (2000),"[778, 1884, 1921, 3168, 3678, 441, 770, 824, 8..."
3880,3950,tigerland,tigerland (2000),"[770, 824, 826, 1155, 3145, 1222, 2562, 142, 1..."
3881,3951,two family house,two family house (2000),"[770, 824, 826, 1155, 3145, 3222, 309, 40, 374..."
3882,3952,"contender, the","contender, the (2000)","[1852, 3936, 3455, 2438, 229, 1834, 3370, 882,..."


In [45]:
similar_movies_cbf.to_csv('similar_movies_cbf.csv', index=False)

# CBF: Generate favorite movies for each user in the dataframe

In [46]:
import pandas as pd
ratings = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/ratings.csv')
ratings = ratings.iloc[: , 1:]

In [47]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [48]:
users = list(ratings['user_id'].unique())

In [49]:
user_favourite_movies = pd.DataFrame(); 
user_favourite_movies['user'] = None
user_favourite_movies['favourite_movies'] = None

rating_threshold = 4 

for user_id in users: 
    movies_rated_by_user = ratings[ratings['user_id'] == user_id]
    highly_rated = movies_rated_by_user.loc[movies_rated_by_user['rating'] >= rating_threshold]['movie_id'].values
    
    new_row = {'user': user_id, 'favourite_movies': list(highly_rated)}
    user_favourite_movies = user_favourite_movies.append(new_row, ignore_index = True)
    

In [50]:
user_favourite_movies.head()

Unnamed: 0,user,favourite_movies
0,1,"[1193, 3408, 2355, 1287, 2804, 594, 919, 595, ..."
1,2,"[1357, 3068, 1537, 2194, 648, 2268, 3468, 1210..."
2,3,"[3421, 1394, 104, 2735, 1210, 1079, 1615, 1291..."
3,4,"[3468, 2951, 1214, 1036, 260, 2028, 480, 1198,..."
4,5,"[2987, 2333, 1175, 2337, 1535, 1392, 866, 2770..."


In [51]:
user_favourite_movies.to_csv('user_favourite_movies.csv', index=False)

In [52]:
user_favourite_movies.dtypes

user                object
favourite_movies    object
dtype: object

In [66]:
similar_movies_cbf

Unnamed: 0,movie_id,title,title_and_year,similar_movies
0,1,toy story,toy story (1995),"[3114, 2355, 2294, 2354, 1064, 2078, 3754, 214..."
1,2,jumanji,jumanji (1995),"[56, 60, 126, 2162, 2161, 2005, 1967, 2399, 10..."
2,3,grumpier old men,grumpier old men (1995),"[7, 807, 1632, 1550, 3353, 3097, 3829, 3358, 3..."
3,4,waiting to exhale,waiting to exhale (1995),"[819, 1630, 1774, 106, 476, 2290, 568, 2239, 1..."
4,5,father of the bride part ii,father of the bride part ii (1995),"[643, 651, 856, 2923, 1001, 739, 2155, 275, 38..."
...,...,...,...,...
3878,3948,meet the parents,meet the parents (2000),"[643, 651, 856, 2923, 111, 1228, 1263, 1001, 7..."
3879,3949,requiem for a dream,requiem for a dream (2000),"[778, 1884, 1921, 3168, 3678, 441, 770, 824, 8..."
3880,3950,tigerland,tigerland (2000),"[770, 824, 826, 1155, 3145, 1222, 2562, 142, 1..."
3881,3951,two family house,two family house (2000),"[770, 824, 826, 1155, 3145, 3222, 309, 40, 374..."


# CBF: generate movie recommendations for each user

In [None]:
def list_to_str(list_obj):
  
  
    # Converting string to list
    res = ini_list.strip('][').split(', ')
  
    # printing final result and its type
    print ("final list", res)
    print (type(res))


In [96]:
CBF_recommendations = user_favourite_movies.copy(deep=True)

CBF_recommendations['CBF_recommendations'] = None 

count = 0
for index, row in CBF_recommendations.iterrows(): 
  user_recommendations = []
  favourite_movies = row['favourite_movies']

  for movie_id in favourite_movies: 
    movie_id = int(movie_id)
    new_recommendations = similar_movies_cbf[similar_movies_cbf['movie_id'] == movie_id]
    new_recommendations = (new_recommendations['similar_movies'])    
    user_recommendations.extend(new_recommendations.values[0])
  
  # deduplicate
  user_recommendations = list(dict.fromkeys(user_recommendations))
  CBF_recommendations.at[index, 'CBF_recommendations'] = user_recommendations

In [99]:
len(CBF_recommendations.at[0, 'CBF_recommendations'])

784

In [100]:
CBF_recommendations.to_csv('cbf_recommendations.csv', index=False)