## Imports

In [1]:
import pandas as pd
import numpy as np
import os


# for part 1 to retrieve data
import requests
import zipfile

# Retrieving data

In [2]:
current_directory = os.getcwd()
data_directory = os.path.join(current_directory, '..', 'raw_data')
data_directory = os.path.abspath(data_directory)
data_directory

'/home/stkarydasphys/code/m-r-c-l/Movie-Recommendation-Engine/raw_data'

In [3]:
# Load each CSV file into a DataFrame
links_df = pd.read_csv(os.path.join(data_directory, 'ml-latest-small/links.csv'))
ratings_df = pd.read_csv(os.path.join(data_directory, 'ml-latest-small/ratings.csv'))
tags_df = pd.read_csv(os.path.join(data_directory, 'ml-latest-small/tags.csv'))
movies_df = pd.read_csv(os.path.join(data_directory, 'ml-latest-small/movies.csv'))

# Basic Inspection of dfs

## Inspecting links_df (contains movie ids on different movie dbs)

In [4]:
links_df.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


## Inspecting ratings_df (contains user ids and their 5-star ratings (0.5 star increments) that correspond to different movies)

In [5]:
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


## Inspecting tags_df (contains tags -assigned by the user- for each movie)

In [6]:
tags_df.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [23]:
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [24]:
tags_df.nunique()

userId         58
movieId      1572
tag          1589
timestamp    3411
dtype: int64

-----
Tags are user generated so no pre-existing structure, also very few users have provided anything as compared to the total users

## Inspecting movies_df (contains movie id, movie title and genre)

In [7]:
movies_df.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [55]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


# Creating new df that has all the info

In [21]:
merged_movies_df = pd.merge(left = movies_df, right = ratings_df, how = "outer")
merged_movies_df = pd.merge(left = merged_movies_df, right = tags_df, how = "outer")
merged_movies_df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,9.649827e+08,
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,8.474350e+08,
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1.106636e+09,
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1.510578e+09,
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1.305696e+09,
...,...,...,...,...,...,...,...
104532,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184.0,4.0,1.537109e+09,
104533,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184.0,3.5,1.537110e+09,
104534,193585,Flint (2017),Drama,184.0,3.5,1.537110e+09,
104535,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184.0,3.5,1.537110e+09,


In [22]:
merged_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104537 entries, 0 to 104536
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    104537 non-null  int64  
 1   title      100854 non-null  object 
 2   genres     100854 non-null  object 
 3   userId     104519 non-null  float64
 4   rating     100836 non-null  float64
 5   timestamp  104519 non-null  float64
 6   tag        3683 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 5.6+ MB


# Finding the average rating per genre (951 genres in total -some have more than one sub-genre) -note there are ~3700 movies with no genre, how would we use them?

In [28]:
merged_movies_df["genres"].nunique()

951

### All possible sub-genres

In [54]:
genre_list = ["Action","Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama",
              "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
len(genre_list)

18

## Action genre

### Boolean indexing according to genre and finding average rating of genre

In [40]:
searched_genre = "Action"

action_df = merged_movies_df[merged_movies_df["genres"].str.contains("Action", na = False)]
action_df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,tag
444,6,Heat (1995),Action|Crime|Thriller,1.0,4.0,9.649822e+08,
445,6,Heat (1995),Action|Crime|Thriller,6.0,4.0,8.455538e+08,
446,6,Heat (1995),Action|Crime|Thriller,11.0,5.0,9.021543e+08,
447,6,Heat (1995),Action|Crime|Thriller,18.0,4.0,1.460138e+09,
448,6,Heat (1995),Action|Crime|Thriller,23.0,4.0,1.107342e+09,
...,...,...,...,...,...,...,...
104513,189547,Iron Soldier (2010),Action|Sci-Fi,210.0,1.0,1.528486e+09,
104522,191005,Gintama (2017),Action|Adventure|Comedy|Sci-Fi,184.0,4.5,1.537109e+09,
104523,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,3.5,1.537099e+09,
104532,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184.0,4.0,1.537109e+09,


In [42]:
action_df["rating"].mean()

np.float64(3.447984331646809)

### Boolean indexing according to user and genre (picking userId = 1 as an example)

In [52]:
# boolean indexing userId = 1 and finding the mean of their ratings for this genre
action_df[action_df["userId"] == 1][["rating"]].mean()

rating    4.322222
dtype: float64

----
If user's average is higher than total average for this genre they are more likely to enjoy the movie