In [1]:
import pandas as pd

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
print("Movies DataFrame:")
print(movies.head(10))
print("\nRatings DataFrame:")
print(ratings.head(10))

Movies DataFrame:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   
5        6                         Heat (1995)   
6        7                      Sabrina (1995)   
7        8                 Tom and Huck (1995)   
8        9                 Sudden Death (1995)   
9       10                    GoldenEye (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
5                        Action|Crime|Thriller  
6                               Comedy|Romance  
7                           Adventure|C

In [4]:
print("\nMovies DataFrame Columns:", movies.columns)
print("\nRatings DataFrame Columns:", ratings.columns)


Movies DataFrame Columns: Index(['movieId', 'title', 'genres'], dtype='object')

Ratings DataFrame Columns: Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


In [5]:
df = movies.merge(ratings, how='left', on='movieId')

In [6]:
print("\nMerged DataFrame:")
print(df.head(5))


Merged DataFrame:
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating     timestamp  
0     3.0     4.0  9.449194e+08  
1     6.0     5.0  8.582755e+08  
2     8.0     4.0  8.339819e+08  
3    10.0     4.0  9.434979e+08  
4    11.0     4.5  1.230859e+09  


In [7]:
print("\nShape of Merged DataFrame:")
print(df.shape)
print("\nInfo of Merged DataFrame:")
print(df.info())
print("\nFirst 5 rows of Merged DataFrame:")
print(df.head(5))


Shape of Merged DataFrame:
(20000797, 6)

Info of Merged DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000797 entries, 0 to 20000796
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   userId     float64
 4   rating     float64
 5   timestamp  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 915.6+ MB
None

First 5 rows of Merged DataFrame:
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating     timestamp  
0     3.0     4.0  9.449

In [8]:
print("\nMerged DataFrame Columns:", df.columns)


Merged DataFrame Columns: Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp'], dtype='object')


In [9]:
df.isnull().values.any()

True

In [10]:
df.isnull().sum()

movieId        0
title          0
genres         0
userId       534
rating       534
timestamp    534
dtype: int64

In [11]:
df.dropna(inplace=True)

In [12]:
print("\nColumns after dropping missing values:", df.columns)


Columns after dropping missing values: Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp'], dtype='object')


In [13]:
rating_counts = pd.DataFrame(df['title'].value_counts())
rating_counts.reset_index(inplace=True)
rating_counts.columns = ['title', 'count']
print("\nRating Counts DataFrame:")
print(rating_counts.head())


Rating Counts DataFrame:
                              title  count
0               Pulp Fiction (1994)  67310
1               Forrest Gump (1994)  66172
2  Shawshank Redemption, The (1994)  63366
3  Silence of the Lambs, The (1991)  63299
4              Jurassic Park (1993)  59715


In [14]:
rare_movies = rating_counts[rating_counts['count'] <= 50]['title']
print("\nRare Movies:")
print(rare_movies)


Rare Movies:
10472                          Claire of the Moon (1992)
10473                         Long Gray Line, The (1955)
10474                                  100 Rifles (1969)
10475               Sherlock Holmes in Washington (1943)
10476                   Thing About My Folks, The (2005)
                              ...                       
26724                                 Easy Wheels (1989)
26725                                   Ditirambo (1969)
26726    Scorching Winds (Garm Hava) (Garam Hawa) (1974)
26727                                 Serrallonga (2008)
26728                                   Innocence (2014)
Name: title, Length: 16257, dtype: object


In [15]:
common_movies = df[~df['title'].isin(rare_movies)]
print("\nColumns in common_movies DataFrame:", common_movies.columns)


Columns in common_movies DataFrame: Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp'], dtype='object')


In [16]:
user_movie_df = common_movies.pivot_table('rating', 'userId', 'title')
print("\nUser-Movie DataFrame:")
print(user_movie_df.head())


User-Movie DataFrame:
title   "Great Performances" Cats (1998)  $9.99 (2008)  \
userId                                                   
1.0                                  NaN           NaN   
2.0                                  NaN           NaN   
3.0                                  NaN           NaN   
4.0                                  NaN           NaN   
5.0                                  NaN           NaN   

title   'Hellboy': The Seeds of Creation (2004)  'Round Midnight (1986)  \
userId                                                                    
1.0                                         NaN                     NaN   
2.0                                         NaN                     NaN   
3.0                                         NaN                     NaN   
4.0                                         NaN                     NaN   
5.0                                         NaN                     NaN   

title   'Salem's Lot (2004)  'Til There Was 

In [17]:
random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=20).values[0])

In [18]:
random_user_df = user_movie_df[user_movie_df.index == random_user]
movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist()
print("\nNumber of movies watched by the random user:", len(movies_watched))


Number of movies watched by the random user: 29


In [19]:
movies_watched_df = user_movie_df[movies_watched]

In [20]:
user_movie_count = movies_watched_df.T.notnull().sum()
user_movie_count = user_movie_count.reset_index()
user_movie_count.columns = ['userId', 'movie_count']
print("\nUser-Movie Count DataFrame:")
print(user_movie_count.head())


User-Movie Count DataFrame:
   userId  movie_count
0     1.0            1
1     2.0            2
2     3.0            5
3     4.0           13
4     5.0           10


In [21]:
users_same_movies = user_movie_count[user_movie_count['movie_count'] > 20]['userId']

In [22]:
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],
                      random_user_df[movies_watched]])
print("\nFinal DataFrame:")
print(final_df.head())
len(final_df)


Final DataFrame:
title   Bad Girls (1994)  Batman (1989)  Batman Forever (1995)  \
userId                                                           
54.0                 NaN            5.0                    4.0   
58.0                 NaN            NaN                    NaN   
69.0                 NaN            4.0                    3.0   
91.0                 2.0            4.0                    3.0   
101.0                NaN            3.0                    3.0   

title   Clean Slate (1994)  Clear and Present Danger (1994)  \
userId                                                        
54.0                   NaN                              4.0   
58.0                   NaN                              5.0   
69.0                   NaN                              3.0   
91.0                   NaN                              3.0   
101.0                  NaN                              4.0   

title   Client, The (1994)  Cliffhanger (1993)  Dances with Wolves (1990)  \


3937

In [23]:
final_df = final_df.drop_duplicates()

In [24]:
corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
print("\nCorrelation DataFrame:")
print(corr_df.head())


Correlation DataFrame:
userId    userId  
77104.0   43765.0    -1.000000
119677.0  126905.0   -0.855905
44405.0   119677.0   -0.850871
88649.0   50442.0    -0.850170
75962.0   133956.0   -0.846888
dtype: float64


In [25]:
corr_df = pd.DataFrame(corr_df, columns=["correlation"])
corr_df.index.names = ['userId1', 'userId2']
corr_df = corr_df.reset_index()

In [26]:
sorted_corr_df = corr_df[(corr_df['userId1'] == random_user) & (corr_df['userId2'] != random_user)].sort_values(by='correlation', ascending=False)

In [27]:
top_50_users = sorted_corr_df.head(50)
print("\nTop 50 Similar Users:")
print(top_50_users)


Top 50 Similar Users:
         userId1   userId2  correlation
5077373  44315.0   33754.0     0.831782
5038774  44315.0   94806.0     0.738144
5024510  44315.0  117141.0     0.723210
5018126  44315.0   90733.0     0.717416
5017942  44315.0    1681.0     0.717274
5011143  44315.0   51411.0     0.711555
5006235  44315.0   62189.0     0.707686
4980889  44315.0   59365.0     0.690101
4973646  44315.0   48602.0     0.685542
4972661  44315.0  113017.0     0.684967
4960636  44315.0  111222.0     0.677956
4942093  44315.0   15733.0     0.667948
4934881  44315.0   19516.0     0.664384
4934559  44315.0  122224.0     0.664231
4926470  44315.0   28792.0     0.660201
4916553  44315.0   86839.0     0.655492
4903618  44315.0   44882.0     0.649764
4895424  44315.0   26914.0     0.646219
4868601  44315.0   32344.0     0.635265
4865917  44315.0   91709.0     0.634241
4863410  44315.0    2430.0     0.633259
4859436  44315.0   84898.0     0.631737
4857849  44315.0   16220.0     0.631135
4836076  44315.0 

In [28]:
top_50_ratings = top_50_users.merge(ratings, left_on='userId2', right_on='userId')

movie_recommendation = top_50_ratings.groupby('movieId').agg({'rating': 'mean'}).reset_index()

movie_recommendation = movie_recommendation.merge(movies, on='movieId')

recommended_movies = movie_recommendation.sort_values(by='rating', ascending=False)

print("\nRecommended Movies:")
print(recommended_movies.head(10))



Recommended Movies:
      movieId  rating                                              title  \
5521    50872     5.0                                 Ratatouille (2007)   
5734    61236     5.0          Waltz with Bashir (Vals im Bashir) (2008)   
5886    74685     5.0                                Crazies, The (2010)   
4148     6400     5.0  Murder on a Sunday Morning (Un coupable idéal)...   
5787    66279     5.0                                    Husbands (1970)   
4762     8154     5.0                              Dolce Vita, La (1960)   
5782    65514     5.0                                      Ip Man (2008)   
4794     8364     5.0  Baadasssss! (How to Get the Man's Foot Outta Y...   
990      1337     5.0                          Body Snatcher, The (1945)   
5749    62799     5.0                                Express, The (2008)   

                                   genres  
5521             Animation|Children|Drama  
5734      Animation|Documentary|Drama|War  
5886  Acti