Reference from WWcodeMovieRecommender.ipynb

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

%matplotlib inline
mpl.style.use('ggplot')

In [7]:
# load the movie data (movieId, name, categories) from CSV into a DataFrame
movies_df = pd.read_csv('datasets/data_movies.csv')

In [9]:
movies_df.head()

Unnamed: 0,movieId,name,categories
0,72,Minority Report,SciFi|Action
1,2,Going Away,SciFi|Kids|Horror|Fantasy
2,0,The Alien,SciFi|Kids|Horror
3,49,The Big Game,Sports|Musical|MartialArts
4,1,The Life of Bees,SciFi|Kids|Thriller|Documentary


In [10]:
movies_df.shape

(75, 3)

In [14]:
movies_df.categories.head()

0                       SciFi|Action
1          SciFi|Kids|Horror|Fantasy
2                  SciFi|Kids|Horror
3         Sports|Musical|MartialArts
4    SciFi|Kids|Thriller|Documentary
Name: categories, dtype: object

In [16]:
# split each pipe-delimited categories string into a list of category names
# (bare last expression: the notebook displays the value itself, no print() needed)
movies_df['categories'].str.split('|').head()

0                         [SciFi, Action]
1          [SciFi, Kids, Horror, Fantasy]
2                   [SciFi, Kids, Horror]
3          [Sports, Musical, MartialArts]
4    [SciFi, Kids, Thriller, Documentary]
Name: categories, dtype: object


In [17]:
# remove the column that is not needed
# drop(..., errors='ignore') instead of `del` so that re-running this cell after
# the column is already gone is a no-op rather than a KeyError
movies_df = movies_df.drop(columns=['name'], errors='ignore')
movies_df.head()

Unnamed: 0,movieId,categories
0,72,SciFi|Action
1,2,SciFi|Kids|Horror|Fantasy
2,0,SciFi|Kids|Horror
3,49,Sports|Musical|MartialArts
4,1,SciFi|Kids|Thriller|Documentary


In [18]:
# sanity check: list any movies whose categories value is missing
movies_df.loc[movies_df['categories'].isna()]

Unnamed: 0,movieId,categories


In [20]:
# load the ratings data from CSV and preview the first rows
# (the preview shows per-watch progress and lastWatch values for each
# personId/movieId pair rather than explicit ratings)
ratings_df = pd.read_csv('datasets/data_ratings.csv')
ratings_df.head()

Unnamed: 0,personId,movieId,progress,lastWatch
0,61,72,11%,3/21/20 8:20
1,71,73,14%,3/5/20 5:20
2,4,2,10%,1/1/70 0:00
3,16,3,2%,1/14/19 23:46
4,40,4,5%,8/5/19 16:12


In [21]:
ratings_df.shape

(1849, 4)

### Data Inspection & Visualization

In [23]:
# list every watch record for movie 73 (the same person can appear more than once)
ratings_df.loc[ratings_df['movieId'].eq(73)]

Unnamed: 0,personId,movieId,progress,lastWatch
1,71,73,14%,3/5/20 5:20
226,102,73,95%,8/30/19 20:00
1812,0,73,9%,1/29/20 19:37
1813,12,73,3%,9/5/18 2:59
1814,34,73,35%,1/16/20 14:26
1815,61,73,5%,2/26/19 19:34
1816,64,73,1%,8/24/18 14:36
1817,70,73,5%,10/10/18 3:34
1818,76,73,2%,11/1/18 0:52
1819,102,73,2%,10/9/18 3:56


In [24]:
# build a cross-tabulation that counts how often each (personId, movieId) pair occurs,
# i.e. which movies each person watched
pd.crosstab(index=ratings_df['personId'], columns=ratings_df['movieId'])

movieId,0,1,2,3,4,5,6,7,8,9,...,197,198,199,200,201,202,203,204,205,206
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
294,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
296,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
ratings_df.dtypes

personId      int64
movieId       int64
progress     object
lastWatch    object
dtype: object

In [27]:
# data conversion: parse the lastWatch strings (e.g. '3/21/20 8:20') into datetimes
# the explicit month/day/two-digit-year format keeps ambiguous values such as
# '10/9/18' from depending on format inference
# NOTE(review): '1/1/70 0:00' entries parse to 1970-01-01 00:00 and look like
# epoch-zero placeholders -- confirm whether they should be treated as missing
ratings_df['lastWatch'] = pd.to_datetime(ratings_df['lastWatch'], format='%m/%d/%y %H:%M')

In [28]:
ratings_df.dtypes

personId              int64
movieId               int64
progress             object
lastWatch    datetime64[ns]
dtype: object

In [29]:
ratings_df.head()

Unnamed: 0,personId,movieId,progress,lastWatch
0,61,72,11%,2020-03-21 08:20:00
1,71,73,14%,2020-03-05 05:20:00
2,4,2,10%,1970-01-01 00:00:00
3,16,3,2%,2019-01-14 23:46:00
4,40,4,5%,2019-08-05 16:12:00


In [30]:
# convert the progress percentage strings to numeric fractions (e.g. '11%' -> 0.11)
# guard on dtype so that re-running this cell after the conversion is a no-op
# instead of failing on the .str accessor of an already-numeric column
if not pd.api.types.is_numeric_dtype(ratings_df['progress']):
    ratings_df['progress'] = ratings_df['progress'].str.rstrip('%').astype('float') / 100.0

In [31]:
ratings_df.dtypes

personId              int64
movieId               int64
progress            float64
lastWatch    datetime64[ns]
dtype: object

In [32]:
ratings_df.head()

Unnamed: 0,personId,movieId,progress,lastWatch
0,61,72,0.11,2020-03-21 08:20:00
1,71,73,0.14,2020-03-05 05:20:00
2,4,2,0.1,1970-01-01 00:00:00
3,16,3,0.02,2019-01-14 23:46:00
4,40,4,0.05,2019-08-05 16:12:00


In [34]:
movies_df.head()

Unnamed: 0,movieId,categories
0,72,SciFi|Action
1,2,SciFi|Kids|Horror|Fantasy
2,0,SciFi|Kids|Horror
3,49,Sports|Musical|MartialArts
4,1,SciFi|Kids|Thriller|Documentary


In [35]:
ratings_df.head()

Unnamed: 0,personId,movieId,progress,lastWatch
0,61,72,0.11,2020-03-21 08:20:00
1,71,73,0.14,2020-03-05 05:20:00
2,4,2,0.1,1970-01-01 00:00:00
3,16,3,0.02,2019-01-14 23:46:00
4,40,4,0.05,2019-08-05 16:12:00


In [36]:
# count the number of rows for each personId in the dataset
# this helps us understand who has the most watch records
ratings_df.groupby('personId').size()

personId
0       8
1       3
2       1
3       5
4      33
       ..
293     3
294     4
295     1
296     2
297    14
Length: 287, dtype: int64

In [38]:
# show the rows for one particular person (e.g. person 61), i.e. the movies they've watched
ratings_df[ratings_df['personId'].eq(61)]

Unnamed: 0,personId,movieId,progress,lastWatch
0,61,72,0.11,2020-03-21 08:20:00
5,61,5,0.43,1970-01-01 00:00:00
14,61,1,0.04,2018-04-13 03:02:00
32,61,4,0.07,2018-07-19 08:33:00
45,61,5,0.08,2019-11-14 20:42:00
69,61,21,0.97,2019-03-04 00:32:00
113,61,10,0.04,2018-05-22 23:30:00
148,61,15,0.02,2020-04-02 05:13:00
207,61,54,0.24,2019-11-13 01:09:00
371,61,5,0.74,2019-06-16 05:01:00


In [39]:
# count the number of rows for each movieId in the dataset
# to see which movies are the most popular
ratings_df.groupby('movieId').size()

movieId
0       1
1       2
2       3
3      10
4       4
       ..
202     1
203     1
204     1
205     1
206     1
Length: 207, dtype: int64

In [41]:
# show the rows for one particular movie (e.g. movie 5 - The Escape to Witch Mountain),
# i.e. the people who have watched it
ratings_df[ratings_df['movieId'].eq(5)]

Unnamed: 0,personId,movieId,progress,lastWatch
5,61,5,0.43,1970-01-01 00:00:00
18,123,5,0.22,1970-01-01 00:00:00
33,70,5,0.80,2020-03-13 04:56:00
45,61,5,0.08,2019-11-14 20:42:00
360,4,5,0.05,2018-07-05 02:08:00
...,...,...,...,...
436,276,5,0.02,2019-10-22 09:22:00
437,277,5,0.52,2020-02-14 05:28:00
438,282,5,0.01,2020-03-16 21:07:00
439,286,5,0.94,2019-10-16 19:37:00


In [None]:
# calculate the average watch time (ie progress) percentage by category per person
# figure out person's favorite category
# this function can be limited by a set amount of categories (for now SciFi and Drama)

# function to get the category progress
def get_category_watch_time(ratings, movies, categories, columns_name):
    category_progress = pd.DataFrame(columns=['personId'])
    
    #add personId to list of columns
    column_names.insert(0, 'personId')
    
    for category in categories: # loop for SciFi and Drama categories
        category_progress = cateh