In [3]:
import pandas as pd
from re import sub
from decimal import Decimal
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet


In [4]:
genome_scores = pd.read_csv('genome_scores.csv')
genome_tags = pd.read_csv('genome_tags.csv')
imdb_data = pd.read_csv('imdb_data.csv')
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [5]:
genome_scores.head(15)


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075
5,1,6,0.14675
6,1,7,0.0635
7,1,8,0.20375
8,1,9,0.202
9,1,10,0.03075


In [6]:
genome_tags.head(20)
genome_tags.info

<bound method DataFrame.info of       tagId           tag
0         1           007
1         2  007 (series)
2         3  18th century
3         4         1920s
4         5         1930s
...     ...           ...
1123   1124       writing
1124   1125         wuxia
1125   1126          wwii
1126   1127        zombie
1127   1128       zombies

[1128 rows x 2 columns]>

In [7]:
imdb_data.head(5)


Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion


In [8]:
links.head(5)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [9]:
movies.head(5)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
tags.head(5)


Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [11]:
test.head(5)


Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [12]:
train.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [13]:
joined = train.merge(movies,  how='left', on = 'movieId')

In [14]:
joined.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,5163,57669,4.0,1518349992,In Bruges (2008),Comedy|Crime|Drama|Thriller
1,106343,5,4.5,1206238739,Father of the Bride Part II (1995),Comedy
2,146790,5459,5.0,1076215539,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
3,106362,32296,2.0,1423042565,Miss Congeniality 2: Armed and Fabulous (2005),Adventure|Comedy|Crime
4,9041,366,3.0,833375837,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama|Horror|Mystery|Thriller


In [15]:
joined2 = joined.merge(imdb_data,  how='left', on = 'movieId')

In [16]:
joined2.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,title_cast,director,runtime,budget,plot_keywords
0,5163,57669,4.0,1518349992,In Bruges (2008),Comedy|Crime|Drama|Thriller,Elizabeth Berrington|Rudy Blomme|Olivier Bonjo...,Martin McDonagh,107.0,"$15,000,000",dwarf|bruges|irish|hitman
1,106343,5,4.5,1206238739,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion
2,146790,5459,5.0,1076215539,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi,Tommy Lee Jones|Will Smith|Rip Torn|Lara Flynn...,Lowell Cunningham,88.0,"$140,000,000",lingerie|michael jackson character|shorthaired...
3,106362,32296,2.0,1423042565,Miss Congeniality 2: Armed and Fabulous (2005),Adventure|Comedy|Crime,Sandra Bullock|Regina King|Enrique Murciano|Wi...,Marc Lawrence,115.0,"$45,000,000",female protagonist|cleave gag|good woman|fbi
4,9041,366,3.0,833375837,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama|Horror|Mystery|Thriller,Jeff Davis|Heather Langenkamp|Miko Hughes|Matt...,Wes Craven,112.0,"$8,000,000",freddy krueger|elm street|famous director as h...


In [17]:
joined2[:10000038]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,title_cast,director,runtime,budget,plot_keywords
0,5163,57669,4.0,1518349992,In Bruges (2008),Comedy|Crime|Drama|Thriller,Elizabeth Berrington|Rudy Blomme|Olivier Bonjo...,Martin McDonagh,107.0,"$15,000,000",dwarf|bruges|irish|hitman
1,106343,5,4.5,1206238739,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion
2,146790,5459,5.0,1076215539,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi,Tommy Lee Jones|Will Smith|Rip Torn|Lara Flynn...,Lowell Cunningham,88.0,"$140,000,000",lingerie|michael jackson character|shorthaired...
3,106362,32296,2.0,1423042565,Miss Congeniality 2: Armed and Fabulous (2005),Adventure|Comedy|Crime,Sandra Bullock|Regina King|Enrique Murciano|Wi...,Marc Lawrence,115.0,"$45,000,000",female protagonist|cleave gag|good woman|fbi
4,9041,366,3.0,833375837,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama|Horror|Mystery|Thriller,Jeff Davis|Heather Langenkamp|Miko Hughes|Matt...,Wes Craven,112.0,"$8,000,000",freddy krueger|elm street|famous director as h...
...,...,...,...,...,...,...,...,...,...,...,...
10000033,136395,99114,5.0,1521235092,Django Unchained (2012),Action|Drama|Western,Jamie Foxx|Christoph Waltz|Leonardo DiCaprio|K...,Quentin Tarantino,165.0,"$100,000,000",racial vengeance|racial violence|historically ...
10000034,140078,553,3.0,1002580977,Tombstone (1993),Action|Drama|Western,Kurt Russell|Val Kilmer|Sam Elliott|Bill Paxto...,Kevin Jarre,130.0,"$25,000,000",wyatt earp character|two gun holster|double gu...
10000035,154807,56782,4.0,1227674807,There Will Be Blood (2007),Drama|Western,Daniel Day-Lewis|Martin Stringer|Matthew Brade...,Paul Thomas Anderson,158.0,"$25,000,000",oil|misanthrope|loss of hearing|false prophet
10000036,85805,327,4.0,1479921530,Tank Girl (1995),Action|Comedy|Sci-Fi,Lori Petty|Ice-T|Naomi Watts|Don Harvey|Jeff K...,Alan Martin,104.0,"$25,000,000",desert|tank|21st century|2030s


In [18]:
joined2.isnull().sum()

userId                 0
movieId                0
rating                 0
timestamp              0
title                  0
genres                 0
title_cast       2971414
director         2969695
runtime          3020065
budget           3519283
plot_keywords    2977050
dtype: int64

In [19]:
joined3 = joined2.dropna(axis = 0)

In [20]:
# create the transform
vectorizer = TfidfVectorizer()
vect = vectorizer.fit_transform(joined3['genres'])

In [21]:
print(vect)

  (0, 21)	0.4887759849078087
  (0, 7)	0.41713762211485494
  (0, 5)	0.6100178867899585
  (0, 4)	0.4636511820216566
  (1, 4)	1.0
  (2, 9)	0.5669993711462794
  (2, 20)	0.5669993711462794
  (2, 0)	0.43100346468647516
  (2, 4)	0.413835039197627
  (3, 1)	0.5905392960270861
  (3, 5)	0.6424908919908456
  (3, 4)	0.4883326668948549
  (4, 16)	0.5899557287204663
  (4, 12)	0.6425489239559223
  (4, 21)	0.37192797472705064
  (4, 7)	0.31741565822817325
  (5, 7)	0.6688332241568333
  (5, 4)	0.743412481912952
  (6, 4)	1.0
  (7, 7)	0.6688332241568333
  (7, 4)	0.743412481912952
  (8, 1)	0.4817384039354594
  (8, 9)	0.5458000392323663
  (8, 20)	0.5458000392323663
  (8, 0)	0.4148888339321883
  :	:
  (6454984, 3)	0.41580275574069503
  (6454984, 15)	0.5502339584806565
  (6454984, 13)	0.446625760676327
  (6454984, 1)	0.29139939801372416
  (6454984, 7)	0.21679218140668238
  (6454985, 22)	0.8291370295194873
  (6454985, 7)	0.3739081534942676
  (6454985, 4)	0.4156013462804328
  (6454986, 4)	1.0
  (6454987, 5)	0.7961

In [22]:
joined3['genres'] = joined3['genres'].apply(lambda x: x.split('|'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined3['genres'] = joined3['genres'].apply(lambda x: x.split('|'))


In [None]:
joined3['title_cast'] = joined3['title_cast'].apply(lambda x: x.split('|'))

In [None]:
joined3['plot_keywords'] = joined3['plot_keywords'].apply(lambda x: x.split('|'))

In [None]:
joined3.head()

In [None]:
joined3['budget'] = joined3['budget'].apply(lambda x: int(sub(r'[^\d.]', '', x)))

In [None]:
joined3.head()