In [1]:
import pandas as pd
import numpy as np

In [2]:
genome_scores = pd.read_csv("data/ml-25m/genome-scores.csv",engine="python")
genome_tags = pd.read_csv("data/ml-25m/genome-tags.csv",engine="python")
links = pd.read_csv("data/ml-25m/links.csv",engine="python")
movies = pd.read_csv("data/ml-25m/movies.csv",engine="python")
ratings = pd.read_csv("data/ml-25m/ratings.csv",engine="python")
tags = pd.read_csv("data/ml-25m/tags.csv",engine="python")

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [5]:
len(movies['movieId'].unique())

62423

In [6]:
ratings.shape

(25000095, 4)

In [7]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [9]:
len(ratings['userId'].unique())

162541

In [10]:
tags.shape

(1093360, 4)

In [11]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


data cleaning

In [12]:
movies.isnull().any()

movieId    False
title      False
genres     False
dtype: bool

In [13]:
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [14]:
tags.isnull().any()

userId       False
movieId      False
tag           True
timestamp    False
dtype: bool

In [15]:
tags = tags.dropna()

In [16]:
tags.isnull().any()

userId       False
movieId      False
tag          False
timestamp    False
dtype: bool

In [17]:
tags.shape

(1093344, 4)

In [18]:
len(tags['tag'].unique())

73050

In [19]:
tags_unique = tags['tag'].unique().tolist()
tags_unique

['classic',
 'sci-fi',
 'dark comedy',
 'great dialogue',
 "so bad it's good",
 'unreliable narrators',
 'tense',
 'artificial intelligence',
 'philosophical',
 'cliche',
 'musical',
 'horror',
 'unpredictable',
 'Oscar (Best Supporting Actress)',
 'adventure',
 'anime',
 'ecology',
 'fantasy',
 'Hayao Miyazaki',
 'Miyazaki',
 'post-apocalyptic',
 'bah',
 'Clint Eastwood',
 'music',
 'art',
 'contemporary art',
 'documentary',
 'aliens',
 'amazing photography',
 'Director: James Cameron',
 'first contact',
 'James Cameron',
 'Michael Biehn',
 'android(s)/cyborg(s)',
 'apocalypse',
 'Arnold Schwarzenegger',
 'franchise',
 'terminator',
 'time travel',
 'science fiction',
 'inferior sequel',
 'setting:London (UK) (future)',
 'unoriginal',
 'bad science',
 'good science',
 'Hans Zimmer',
 'philosophical issues',
 'space',
 'space travel',
 'time-travel',
 'visually appealing',
 'bittersweet',
 'boring',
 'conversation',
 'dialogue driven',
 'loneliness',
 'love story',
 'philosophy',
 'qu

analyzing data

In [20]:
from collections import Counter

def count_genres(movies_df):
    genre_counts = Counter()
    movies_df['genres'].str.split('|').apply(genre_counts.update)
    return genre_counts.most_common()

In [21]:
genre_counts = count_genres(movies)
genre_counts

[('Drama', 25606),
 ('Comedy', 16870),
 ('Thriller', 8654),
 ('Romance', 7719),
 ('Action', 7348),
 ('Horror', 5989),
 ('Documentary', 5605),
 ('Crime', 5319),
 ('(no genres listed)', 5062),
 ('Adventure', 4145),
 ('Sci-Fi', 3595),
 ('Children', 2935),
 ('Animation', 2929),
 ('Mystery', 2925),
 ('Fantasy', 2731),
 ('War', 1874),
 ('Western', 1399),
 ('Musical', 1054),
 ('Film-Noir', 353),
 ('IMAX', 195)]

In [22]:
comedy_movies = movies['genres'].str.contains('Comedy')
comedy_movies.shape

(62423,)

In [23]:
movies[comedy_movies]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
6,7,Sabrina (1995),Comedy|Romance
...,...,...,...
62402,209119,Up to the World (2014),Comedy|Drama
62411,209141,And They Lived Happily Ever After (1976),Comedy
62416,209153,Happy Flight (2008),Comedy|Drama
62417,209155,Santosh Subramaniam (2008),Action|Comedy|Romance


In [24]:
nonlisted_movies = movies['genres'].str.contains('(no genres listed)')
nonlisted_movies.shape

  nonlisted_movies = movies['genres'].str.contains('(no genres listed)')


(62423,)

In [25]:
movies[nonlisted_movies]

Unnamed: 0,movieId,title,genres
15881,83773,Away with Words (San tiao ren) (1999),(no genres listed)
16060,84768,Glitterbug (1994),(no genres listed)
16351,86493,"Age of the Earth, The (A Idade da Terra) (1980)",(no genres listed)
16491,87061,Trails (Veredas) (1978),(no genres listed)
17404,91246,Milky Way (Tejút) (2007),(no genres listed)
...,...,...,...
62400,209101,Hua yang de nian hua (2001),(no genres listed)
62401,209103,Tsar Ivan the Terrible (1991),(no genres listed)
62407,209133,The Riot and the Dance (2018),(no genres listed)
62415,209151,Mao Zedong 1949 (2019),(no genres listed)


rating

In [26]:
del ratings['timestamp']

In [27]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [28]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [29]:
movies_ratings = pd.merge(movies, ratings, on='movieId', how='inner')

In [30]:
movies_ratings

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0
...,...,...,...,...,...
25000090,209157,We (2018),Drama,119571,1.5
25000091,209159,Window of the Soul (2001),Documentary,115835,3.0
25000092,209163,Bad Poems (2018),Comedy|Drama,6964,4.5
25000093,209169,A Girl Thing (2001),(no genres listed),119571,3.0


In [31]:
rated_movies_rank = movies_ratings.groupby('title')['rating'].mean().sort_values(ascending=False)
rated_movies_rank

title
Full of Grace (2015)            5.0
Geordie (1955)                  5.0
Garfield In Paradise (1986)     5.0
Lost Woods (2012)               5.0
Garfield in the Rough (1984)    5.0
                               ... 
Caro Gorbaciov (1988)           0.5
Don't Let Go (2019)             0.5
Ice Queen (2005)                0.5
The Lodge (2019)                0.5
Robot Wars (1993)               0.5
Name: rating, Length: 58958, dtype: float64

In [32]:
most_rated_rank = movies_ratings.groupby('title').size().sort_values(ascending=False)
most_rated_rank

title
Forrest Gump (1994)                                        81491
Shawshank Redemption, The (1994)                           81482
Pulp Fiction (1994)                                        79672
Silence of the Lambs, The (1991)                           74127
Matrix, The (1999)                                         72674
                                                           ...  
Kojot (2017)                                                   1
Kochavva Paulo Ayyappa Coelho (2016)                           1
Knuckleface Jones (1999)                                       1
The Lodge (2019)                                               1
"BLOW THE NIGHT!" Let's Spend the Night Together (1983)        1
Length: 58958, dtype: int64

In [33]:
most_rated_rank.shape

(58958,)

In [34]:
most_rated_rank.describe()

count    58958.000000
mean       424.032277
std       2479.744597
min          1.000000
25%          2.000000
50%          6.000000
75%         36.750000
max      81491.000000
dtype: float64

In [35]:
accepted_movies = most_rated_rank[most_rated_rank > 10].index.tolist()

In [36]:
accepted_movies

['Forrest Gump (1994)',
 'Shawshank Redemption, The (1994)',
 'Pulp Fiction (1994)',
 'Silence of the Lambs, The (1991)',
 'Matrix, The (1999)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Jurassic Park (1993)',
 "Schindler's List (1993)",
 'Braveheart (1995)',
 'Fight Club (1999)',
 'Terminator 2: Judgment Day (1991)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Toy Story (1995)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 'Usual Suspects, The (1995)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'American Beauty (1999)',
 'Godfather, The (1972)',
 'Lord of the Rings: The Two Towers, The (2002)',
 'Lord of the Rings: The Return of the King, The (2003)',
 'Seven (a.k.a. Se7en) (1995)',
 'Fugitive, The (1993)',
 'Back to the Future (1985)',
 'Independence Day (a.k.a. ID4) (1996)',
 'Apollo 13 (1995)',
 'Fargo (1996)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (

In [37]:
accepted_movies_ratings = movies_ratings[movies_ratings['title'].isin(accepted_movies)]
accepted_movies_ratings

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0
...,...,...,...,...,...
24999961,208737,Midway (2019),Action|Drama|War,132358,2.0
24999962,208737,Midway (2019),Action|Drama|War,141420,3.0
24999963,208737,Midway (2019),Action|Drama|War,143568,3.5
24999964,208737,Midway (2019),Action|Drama|War,154088,4.0


In [38]:
accepted_movies_ratings_rank = accepted_movies_ratings.groupby('title')['rating'].mean().sort_values(ascending=False)

In [39]:
accepted_movies_ratings_rank

title
Planet Earth II (2016)                 4.483096
Planet Earth (2006)                    4.464797
Shawshank Redemption, The (1994)       4.413576
Band of Brothers (2001)                4.398599
Pollyanna (2003)                       4.384615
                                         ...   
Saving Christmas (2014)                0.921053
Justin Bieber's Believe (2013)         0.904762
SuperBabies: Baby Geniuses 2 (2004)    0.899038
Hip Hop Witch, Da (2000)               0.854839
Kidnapping, Caucasian Style (2014)     0.678571
Name: rating, Length: 23344, dtype: float64

In [40]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [41]:
del tags['timestamp']

In [42]:
tags

Unnamed: 0,userId,movieId,tag
0,3,260,classic
1,3,260,sci-fi
2,4,1732,dark comedy
3,4,1732,great dialogue
4,4,7569,so bad it's good
...,...,...,...
1093355,162521,66934,Neil Patrick Harris
1093356,162521,103341,cornetto trilogy
1093357,162534,189169,comedy
1093358,162534,189169,disabled


In [43]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [44]:
movies_non_genres = movies[['movieId','title']]

In [45]:
movies_non_genres

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
62418,209157,We (2018)
62419,209159,Window of the Soul (2001)
62420,209163,Bad Poems (2018)
62421,209169,A Girl Thing (2001)


In [46]:
movies_tags = pd.merge(movies_non_genres,tags, on='movieId', how='inner')
movies_tags

Unnamed: 0,movieId,title,userId,tag
0,1,Toy Story (1995),791,Owned
1,1,Toy Story (1995),1048,imdb top 250
2,1,Toy Story (1995),1361,Pixar
3,1,Toy Story (1995),3164,Pixar
4,1,Toy Story (1995),3164,time travel
...,...,...,...,...
1093339,209063,The Prep School Negro (2012),96399,Philadelphia
1093340,209063,The Prep School Negro (2012),96399,private school
1093341,209063,The Prep School Negro (2012),96399,quaker
1093342,209063,The Prep School Negro (2012),96399,racism


In [47]:
tagged = movies_tags.groupby('title').size().sort_values(ascending=False)
tagged

title
Star Wars: Episode IV - A New Hope (1977)    6180
Pulp Fiction (1994)                          4767
Inception (2010)                             4767
Interstellar (2014)                          3616
Fight Club (1999)                            3612
                                             ... 
Catacombs (1964)                                1
Sergio (2009)                                   1
Cat's Play (1974)                               1
Serpent (2017)                                  1
줄탁동시 (2012)                                     1
Length: 45208, dtype: int64

In [48]:
fun_tags = movies_tags['tag'].str.contains('funny')
fun_movies = movies_tags[fun_tags]

In [49]:
fun_movies

Unnamed: 0,movieId,title,userId,tag
7,1,Toy Story (1995),3448,funny
52,1,Toy Story (1995),7570,funny
82,1,Toy Story (1995),17069,funny
87,1,Toy Story (1995),19663,funny
94,1,Toy Story (1995),20346,funny
...,...,...,...,...
1091841,204302,Ek Ladki Ko Dekha Toh Aisa Laga (2019),118674,funny
1092379,204878,Dolemite Is My Name (2019),3842,funny
1092440,205054,Hustlers (2019),94239,funny
1092533,205287,Bacurau (2019),45004,funny


In [50]:
most_fun_movies = fun_movies.groupby('title').size().sort_values(ascending=False)
most_fun_movies

title
Intouchables (2011)                       65
Grand Budapest Hotel, The (2014)          62
Scott Pilgrim vs. the World (2010)        51
Hangover, The (2009)                      50
Zombieland (2009)                         50
                                          ..
Party, The (1968)                          1
Ek Ladki Ko Dekha Toh Aisa Laga (2019)     1
Patchwork (2015)                           1
Eddie Murphy Raw (1987)                    1
(500) Days of Summer (2009)                1
Length: 1417, dtype: int64

In [51]:
terminatordf_tag = movies_tags['title'].str.contains('Terminator: Dark Fate')

In [52]:
movies_tags[terminatordf_tag]

Unnamed: 0,movieId,title,userId,tag
1093156,207830,Terminator: Dark Fate (2019),57837,action
1093157,207830,Terminator: Dark Fate (2019),57837,cyborg
1093158,207830,Terminator: Dark Fate (2019),57837,female protagonist
1093159,207830,Terminator: Dark Fate (2019),57837,mexico
1093160,207830,Terminator: Dark Fate (2019),57837,sci-fi
1093161,207830,Terminator: Dark Fate (2019),57837,terminator
1093162,207830,Terminator: Dark Fate (2019),72360,prospect preferred
1093163,207830,Terminator: Dark Fate (2019),94239,altenrate future
1093164,207830,Terminator: Dark Fate (2019),94239,Arnold Schwarzenegger
1093165,207830,Terminator: Dark Fate (2019),94239,commentary on US border control policy
