# Basic Recommender on Movielens

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Column names for the three MovieLens files; they are supplied explicitly
# through `names=` when the files are read below.
data_cols = ['user id', 'movie id', 'rating', 'timestamp']
item_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    # the remaining columns are one flag column per genre
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery',
    'Romance',  # no trailing space, so the column can be looked up as 'Romance'
    'Sci-Fi', 'Thriller', 'War', 'Western',
]
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']

# Read each file into its own DataFrame.
# u.user and u.item are pipe-separated, u.data is tab-separated.
users = pd.read_csv('./ml100k/u.user', sep='|',
                    names=user_cols, encoding='latin-1')
item = pd.read_csv('./ml100k/u.item', sep='|',
                   names=item_cols, encoding='latin-1')
data = pd.read_csv('./ml100k/u.data', sep='\t',
                   names=data_cols, encoding='latin-1')

In [2]:
# Preview the first rows of each freshly loaded frame
for frame in (users, item, data):
    print(frame.head())

   user id  age gender  occupation zip code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
   movie id        movie title release date  video release date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%2

### Details of all 3 chosen data files

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
user id       943 non-null int64
age           943 non-null int64
gender        943 non-null object
occupation    943 non-null object
zip code      943 non-null object
dtypes: int64(2), object(3)
memory usage: 36.9+ KB
None


In [4]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line.
item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
movie id              1682 non-null int64
movie title           1682 non-null object
release date          1681 non-null object
video release date    0 non-null float64
IMDb URL              1679 non-null object
unknown               1682 non-null int64
Action                1682 non-null int64
Adventure             1682 non-null int64
Animation             1682 non-null int64
Childrens             1682 non-null int64
Comedy                1682 non-null int64
Crime                 1682 non-null int64
Documentary           1682 non-null int64
Drama                 1682 non-null int64
Fantasy               1682 non-null int64
Film-Noir             1682 non-null int64
Horror                1682 non-null int64
Musical               1682 non-null int64
Mystery               1682 non-null int64
Romance               1682 non-null int64
Sci-Fi                1682 non-null int64
Thriller 

In [5]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
user id      100000 non-null int64
movie id     100000 non-null int64
rating       100000 non-null int64
timestamp    100000 non-null int64
dtypes: int64(4)
memory usage: 3.1 MB
None


### Merging all three

In [6]:
# Attach every rating to its movie metadata (on 'movie id') and then to the
# rating user's profile (on 'user id'). The join keys are spelled out so the
# result cannot silently change if the frames ever gain another shared column.
dataset = (
    item
    .merge(data, on='movie id')
    .merge(users, on='user id')
)
print(dataset.head())

   movie id            movie title release date  video release date  \
0         1       Toy Story (1995)  01-Jan-1995                 NaN   
1         4      Get Shorty (1995)  01-Jan-1995                 NaN   
2         5         Copycat (1995)  01-Jan-1995                 NaN   
3         7  Twelve Monkeys (1995)  01-Jan-1995                 NaN   
4         8            Babe (1995)  01-Jan-1995                 NaN   

                                            IMDb URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
2  http://us.imdb.com/M/title-exact?Copycat%20(1995)        0       0   
3  http://us.imdb.com/M/title-exact?Twelve%20Monk...        0       0   
4     http://us.imdb.com/M/title-exact?Babe%20(1995)        0       0   

   Adventure  Animation  Childrens    ...     Thriller  War  Western  user id  \
0          0          1          1    ...            

### Grouping 

In [7]:
# Number of rating rows recorded for each movie title
ratings_total = dataset.groupby(by='movie title').size()
print(ratings_total.head(5))

movie title
'Til There Was You (1997)      9
1-900 (1994)                   5
101 Dalmatians (1996)        109
12 Angry Men (1957)          125
187 (1997)                    41
dtype: int64


In [8]:
# Mean rating per movie title, as a one-column DataFrame indexed by title.
# The column is selected with a list ([['rating']]): tuple-style multi-key
# selection on a groupby is rejected by pandas >= 2.0, and the grouping
# column itself does not need to be selected.
ratings_mean = dataset.groupby('movie title')[['rating']].mean()
print(ratings_mean.head())

                             rating
movie title                        
'Til There Was You (1997)  2.333333
1-900 (1994)               2.600000
101 Dalmatians (1996)      2.908257
12 Angry Men (1957)        4.344000
187 (1997)                 3.024390


In [9]:
# Reshape both aggregates so they share a plain 'movie title' column to merge on.

# Series -> two-column frame ('movie title', 'total ratings').
# Guarded so that re-running this cell leaves an already converted frame alone.
if isinstance(ratings_total, pd.Series):
    ratings_total = ratings_total.reset_index(name='total ratings')

# Copy the titles from the index into a regular column and drop the index
# name: when an index level and a column are both called 'movie title',
# pandas raises an "ambiguous" ValueError on a merge that uses that key.
ratings_mean = (
    ratings_mean
    .assign(**{'movie title': ratings_mean.index})
    .rename_axis(None)
)

In [10]:
# Combine mean rating with rating count per title, most-rated titles first
final = (
    ratings_mean
    .merge(ratings_total)
    .sort_values('total ratings', ascending=False)
)
print(final.head())

        rating                movie title  total ratings
1398  4.358491           Star Wars (1977)            583
333   3.803536             Contact (1997)            509
498   4.155512               Fargo (1996)            508
1234  4.007890  Return of the Jedi (1983)            507
860   3.156701           Liar Liar (1997)            485


In [11]:
# Summary statistics of mean rating and rating count, used to judge a minimum cutoff
final_summary = final.describe()
print(final_summary)

            rating  total ratings
count  1664.000000    1664.000000
mean      3.077018      60.096154
std       0.780418      80.956484
min       1.000000       1.000000
25%       2.665094       7.000000
50%       3.162132      27.000000
75%       3.651808      80.250000
max       5.000000     583.000000


## Top 5 recommended movies

In [12]:
# Assumes `final` is still ordered by 'total ratings' (descending) from the
# merge cell, so the first N rows are the N most-rated titles.
# NOTE(review): presumably the cutoff exists so that titles with only a few
# ratings cannot top the list -- confirm that 200 is the intended value.
N_MOST_RATED = 200

# Rank the most-rated titles by their mean rating, best first.
final = final.iloc[:N_MOST_RATED].sort_values(by='rating', ascending=False)
print(final.head())

        rating                       movie title  total ratings
1281  4.466443           Schindler's List (1993)            298
273   4.456790                 Casablanca (1942)            243
1317  4.445230  Shawshank Redemption, The (1994)            283
1215  4.387560                Rear Window (1954)            209
1572  4.385768        Usual Suspects, The (1995)            267
