In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the item_id -> title lookup table.
# Forward slashes are used because they resolve on Windows, macOS and Linux alike,
# whereas a backslash-separated path is only understood on Windows.
movie_titles = pd.read_csv("../datafiles/Movie_Id_Titles")
movie_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [3]:
# Summarize movie_titles: column names, dtypes and non-null counts.
movie_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   item_id  1682 non-null   int64 
 1   title    1682 non-null   object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB


In [4]:
# Column names for the tab-separated ratings file; passing them through names=
# tells pandas the file has no header row of its own.
cols = ['user_id', 'item_id', 'rating', 'timestamp']
# Forward slashes keep the relative path portable across Windows, macOS and Linux.
df = pd.read_csv("../datafiles/u.data", sep="\t", names=cols)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# Summarize the ratings frame: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   item_id    100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [6]:
# Attach a movie title to every rating by joining the two frames on item_id.
# validate='many_to_one' makes pandas raise a MergeError if movie_titles ever
# contains a repeated item_id, which would otherwise silently duplicate rating rows.
final_df = pd.merge(
    df,
    movie_titles,
    on='item_id',
    how='inner',
    validate='many_to_one',
)

In [7]:
# Row count of the merged frame; it should equal the number of rows in df
# if every rating matched exactly one title.
len(final_df)

100000

In [8]:
# Preview the merged frame (ratings with their titles attached).
final_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [9]:
# Number of distinct item_id values present in the merged frame
final_df['item_id'].nunique()

1682

In [10]:
# Number of distinct user_id values present in the merged frame
final_df['user_id'].nunique()

943

In [11]:
# Boolean selector for the rows whose rating equals 5, then the matching rows
mask = final_df.rating == 5
final_df.loc[mask]

Unnamed: 0,user_id,item_id,rating,timestamp,title
2,226,242,5,883888671,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)
6,34,242,5,888601628,Kolya (1996)
11,354,242,5,891180399,Kolya (1996)
12,199,242,5,883782485,Kolya (1996)
...,...,...,...,...,...
99915,886,1467,5,876033987,"Saint of Fort Washington, The (1993)"
99960,60,1122,5,883326498,They Made Me a Criminal (1939)
99961,90,1201,5,891383687,Marlene Dietrich: Shadow and Light (1996)
99973,883,1448,5,891695570,My Favorite Season (1993)


#### Popularity-based recommendation

In [None]:
# For each title, compute the average rating and show the results in descending order

In [12]:
# Mean rating per title, highest first; display the ten largest averages.
(
    final_df
    .groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

title
Marlene Dietrich: Shadow and Light (1996)            5.0
Prefontaine (1997)                                   5.0
Santa with Muscles (1996)                            5.0
Star Kid (1997)                                      5.0
Someone Else's America (1995)                        5.0
Entertaining Angels: The Dorothy Day Story (1996)    5.0
Saint of Fort Washington, The (1993)                 5.0
Great Day in Harlem, A (1994)                        5.0
They Made Me a Criminal (1939)                       5.0
Aiqing wansui (1994)                                 5.0
Name: rating, dtype: float64

In [None]:
# How many users have rated each title?

In [13]:
# Count the non-null user_id entries in each title group, i.e. the number of
# rating rows per title. Grouping is by title, so item_ids that share a title
# are counted together.
final_df.groupby('title')['user_id'].count()

title
'Til There Was You (1997)                  9
1-900 (1994)                               5
101 Dalmatians (1996)                    109
12 Angry Men (1957)                      125
187 (1997)                                41
                                        ... 
Young Guns II (1990)                      44
Young Poisoner's Handbook, The (1995)     41
Zeus and Roxanne (1997)                    6
unknown                                    9
Á köldum klaka (Cold Fever) (1994)         1
Name: user_id, Length: 1664, dtype: int64

In [None]:
# Store both the above data in a new dataframe

In [14]:
# Per-title summary computed in a single groupby pass:
#   rating          -> mean rating of the title
#   Num of Ratings  -> number of ratings the title received
ratings = (
    final_df
    .groupby('title')['rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'rating', 'count': 'Num of Ratings'})
)
ratings.head()

Unnamed: 0_level_0,rating,Num of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,9
1-900 (1994),2.6,5
101 Dalmatians (1996),2.908257,109
12 Angry Men (1957),4.344,125
187 (1997),3.02439,41


In [15]:
# Order titles by how many ratings they received, breaking ties by mean rating
# (both keys descending).
ratings.sort_values(['Num of Ratings', 'rating'], ascending=False)

Unnamed: 0_level_0,rating,Num of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),4.358491,583
Contact (1997),3.803536,509
Fargo (1996),4.155512,508
Return of the Jedi (1983),4.007890,507
Liar Liar (1997),3.156701,485
...,...,...
"Very Natural Thing, A (1974)",1.000000,1
"Vie est belle, La (Life is Rosey) (1987)",1.000000,1
Wend Kuuni (God's Gift) (1982),1.000000,1
"Woman in Question, The (1950)",1.000000,1


In [None]:
# user movie matrix

In [17]:
# User x title matrix: one row per user_id, one column per title, each cell
# holding that user's rating for the title (NaN where there is none).
movie_matrix = final_df.pivot_table(
    values='rating',
    index='user_id',
    columns='title',
)
movie_matrix.shape

(943, 1664)

In [16]:
# Preview the user-by-title matrix; NaN marks a title with no rating from that user.
movie_matrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [18]:
# Extract the per-user rating columns of two reference titles from the matrix
action = movie_matrix.loc[:, 'Star Wars (1977)']
comedy = movie_matrix.loc[:, 'Liar Liar (1997)']

In [23]:
# Correlate every title column of movie_matrix with the ratings held in `action`.
# This covers all titles, not just a single genre. Titles with too little data in
# common can produce NaN results and NumPy runtime warnings.
similar_to_star_wars =movie_matrix.corrwith(action)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


In [24]:
# Wrap the correlations in a one-column frame and discard titles whose
# correlation is NaN. dropna() is chained (returning a new frame) rather than
# called with inplace=True, so the cell builds its result in one expression
# and never mutates an object that was already assigned.
corr_starwars = pd.DataFrame(similar_to_star_wars, columns=['correlation']).dropna()
corr_starwars.head()

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
'Til There Was You (1997),0.872872
1-900 (1994),-0.645497
101 Dalmatians (1996),0.211132
12 Angry Men (1957),0.184289
187 (1997),0.027398


In [26]:
# Put each title's rating count next to its correlation.
# Only the 'correlation' column is selected before joining so the cell can be
# re-run safely: joining onto a frame that already carries 'Num of Ratings'
# raises a column-overlap ValueError.
corr_starwars = corr_starwars[['correlation']].join(ratings['Num of Ratings'])
corr_starwars.head()

Unnamed: 0_level_0,correlation,Num of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),0.872872,9
1-900 (1994),-0.645497,5
101 Dalmatians (1996),0.211132,109
12 Angry Men (1957),0.184289,125
187 (1997),0.027398,41


In [28]:
# Five strongest correlations among titles with more than 100 ratings.
# The reference title is part of the frame too, so it shows up correlated with itself.
(
    corr_starwars
    .loc[corr_starwars['Num of Ratings'] > 100]
    .sort_values(by='correlation', ascending=False)
    .head(5)
)

Unnamed: 0_level_0,correlation,Num of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),1.0,583
"Empire Strikes Back, The (1980)",0.747981,367
Return of the Jedi (1983),0.672556,507
Raiders of the Lost Ark (1981),0.536117,420
Austin Powers: International Man of Mystery (1997),0.377433,130
