<a href="https://colab.research.google.com/github/mcucii/ML-projects/blob/main/Movie%20Recommender%20Systems/Movie_Recommender_Systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [225]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer


# https://www.kaggle.com/code/rounakbanik/movie-recommender-systems/notebook

In [159]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [160]:
movie_dir = '/content/drive/MyDrive/ML_projects/Movie Recommender Systems/movie_dataset'

In [161]:
os.listdir(movie_dir)

['credits.csv',
 'links_small.csv',
 'keywords.csv',
 'movies_metadata.csv',
 'links.csv',
 'ratings.csv',
 'ratings_small.csv']



## SIMPLE RECOMMENDER
- generalized recommendations based on popularity
- not customized



In [162]:
md = pd.read_csv(os.path.join(movie_dir, 'movies_metadata.csv'))

  md = pd.read_csv(os.path.join(movie_dir, 'movies_metadata.csv'))


In [163]:
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [164]:
from ast import literal_eval
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x : [i['name'] for i in x] if isinstance(x, list) else [])

In [165]:
md['genres'].head()

Unnamed: 0,genres
0,"[Animation, Comedy, Family]"
1,"[Adventure, Fantasy, Family]"
2,"[Romance, Comedy]"
3,"[Comedy, Drama, Romance]"
4,[Comedy]



*IMDB formula for calculating top movies:*

$ WR = \frac{v}{m+v} * R + \frac{m}{v+m} * C$

- R = average rating for the movie (mean)
- C = the mean vote across the whole report
- v = number of votes for the movie
- m = min. votes required to be listed on top 250 -> more votes than 95% of the movies




In [166]:
# taking out nulls
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

np.float64(5.244896612406511)

In [167]:
m = vote_counts.quantile(0.95)   # 95 percentile
m

np.float64(434.0)

In [168]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x : str(x).split('-')[0] if x != np.nan else np.nan)

In [169]:
qualified = md[(md['vote_count'] != np.nan) & (md['vote_count'] >= m) & (md['vote_average'] != np.nan)][['title', 'year', 'vote_count', 'vote_average', 'genres', 'popularity']]
qualified['vote_count']= qualified['vote_count'].astype('int')
qualified['vote_average']= qualified['vote_average'].astype('int')


In [170]:
qualified.shape

(2274, 6)

In [171]:
def imdb_rating(x):
  v = x['vote_count']
  R = x['vote_average']
  return R*(v/(v+m)) + C*(m/(v+m))

In [172]:
qualified[qualified['title'] == 'Fight Club']

Unnamed: 0,title,year,vote_count,vote_average,genres,popularity
2843,Fight Club,1999,9678,8,[Drama],63.869599


In [173]:
qualified['imdb_rating'] = qualified.apply(imdb_rating, axis=1)

In [174]:
qualified[qualified['title'] == 'Fight Club']

Unnamed: 0,title,year,vote_count,vote_average,genres,popularity,imdb_rating
2843,Fight Club,1999,9678,8,[Drama],63.869599,7.881753


In [175]:
qualified = qualified.sort_values('imdb_rating', ascending=False).head(250)

In [176]:
# TOP MOVIES:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,genres,popularity,imdb_rating
15480,Inception,2010,14075,8,"[Action, Thriller, Science Fiction, Mystery, A...",29.108149,7.917588
12481,The Dark Knight,2008,12269,8,"[Drama, Action, Crime, Thriller]",123.167259,7.905871
22879,Interstellar,2014,11187,8,"[Adventure, Drama, Science Fiction]",32.213481,7.897107
2843,Fight Club,1999,9678,8,[Drama],63.869599,7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,"[Adventure, Fantasy, Action]",32.070725,7.871787
292,Pulp Fiction,1994,8670,8,"[Thriller, Crime]",140.950236,7.86866
314,The Shawshank Redemption,1994,8358,8,"[Drama, Crime]",51.645403,7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,"[Adventure, Fantasy, Action]",29.324358,7.861927
351,Forrest Gump,1994,8147,8,"[Comedy, Drama, Romance]",48.307194,7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,"[Adventure, Fantasy, Action]",29.423537,7.851924


- Now, let's build charts for each genre

In [177]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name='genre'

In [178]:
s.head()

Unnamed: 0,genre
0,Animation
0,Comedy
0,Family
1,Adventure
1,Fantasy


In [179]:
g_md = md.drop('genres', axis=1).join(s)
#g_md.head()

In [180]:
def chart_by_genre(genre, percentile=0.85):
  df = g_md[g_md['genre'] == genre]
  vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
  vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
  C = vote_averages.mean()
  m = vote_counts.quantile(percentile)

  qualified = df[(df['vote_average'].notnull()) & (df['vote_count'].notnull()) & (df['vote_count']>=m)][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
  qualified['vote_count'] = qualified['vote_count'].astype('int')
  qualified['vote_average'] = qualified['vote_average'].astype('int')

  qualified['rating'] = qualified.apply(lambda x : ((x['vote_count']/(x['vote_count'] + m))*x['vote_average'] + (m/(x['vote_count']+m))*C), axis=1)
  qualified = qualified.sort_values('rating', ascending=False).head(250)

  return qualified

In [181]:
top_romance_movies = chart_by_genre('Romance')
top_romance_movies.head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,rating
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,8.565285
351,Forrest Gump,1994,8147,8,48.307194,7.971357
876,Vertigo,1958,1162,8,18.20822,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.845107,7.745154
1132,Cinema Paradiso,1988,834,8,14.177005,7.744878
19901,Paperman,2012,734,8,7.198633,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.994281,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


## CONTENT BASED RECOMMENDER

In [182]:
# using smaller dataset because of the limited computer power
sm = pd.read_csv(os.path.join(movie_dir, 'links_small.csv'))
sm.shape

(9125, 3)

In [183]:
sm.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [184]:
sm = sm[sm['tmdbId'].notnull()]['tmdbId']
sm.head()

Unnamed: 0,tmdbId
0,862.0
1,8844.0
2,15602.0
3,31357.0
4,11862.0


In [185]:
md.shape

(45466, 25)

In [186]:
md = md[md['id'].apply(lambda x: str(x).isdigit())]
md['id'] = md['id'].astype(int)
md.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  md['id'] = md['id'].astype(int)


(45463, 25)

In [187]:
md['id'].head()

Unnamed: 0,id
0,862
1,8844
2,15602
3,31357
4,11862


In [188]:
smd = md[md['id'].isin(sm)]
smd.shape

(9099, 25)

#### 1. Recommender based on overview and tagline

In [189]:
smd['tagline'] = smd['tagline'].fillna('')
smd['overview'] = smd['overview'].fillna('')
types = []
for x in md['tagline']:
   if type(x) not in types:
    types.append(type(x))
print(types)

smd['description'] = smd['tagline'] + smd['overview']

[<class 'float'>, <class 'str'>]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['tagline'] = smd['tagline'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['overview'] = smd['overview'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['description'] = smd['tagline'] + smd['overview']


In [190]:
tf = TfidfVectorizer(ngram_range=(1,2))
tfidf_matrix = tf.fit_transform(smd['description'])

In [191]:
tfidf_matrix.shape

(9099, 286988)

In [192]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# result: NxN matrix, where (i,j) represents the similarity between i-th and j-th movie
cosine_sim

array([[1.        , 0.01181522, 0.00435556, ..., 0.00553204, 0.01076241,
        0.00264163],
       [0.01181522, 1.        , 0.01814577, ..., 0.01180284, 0.02476776,
        0.0058867 ],
       [0.00435556, 0.01814577, 1.        , ..., 0.01396444, 0.01429998,
        0.0072309 ],
       ...,
       [0.00553204, 0.01180284, 0.01396444, ..., 1.        , 0.09292427,
        0.00494754],
       [0.01076241, 0.02476776, 0.01429998, ..., 0.09292427, 1.        ,
        0.00906554],
       [0.00264163, 0.0058867 , 0.0072309 , ..., 0.00494754, 0.00906554,
        1.        ]])

In [193]:
smd = smd.reset_index()

In [194]:
titles = smd['title']
indices = pd.Series(smd.index, titles)

- Now, for every movie, we want to list (recommend) 10 more movies that one may like

In [195]:
def recommendN(title, N=10):
  index = indices[title]
  recommendations = sorted(enumerate(cosine_sim[index]), key=lambda x : x[1], reverse=True)[1:N+1] # 10 most similar
  rec_indices = [i[0] for i in recommendations]
  return titles.iloc[rec_indices]

In [196]:
recommendN('Toy Story', 10)

Unnamed: 0,title
2502,Toy Story 2
7535,Toy Story 3
6193,The 40 Year Old Virgin
2547,Man on the Moon
6627,Factory Girl
889,Rebel Without a Cause
4702,"What's Up, Tiger Lily?"
4935,A Midnight Clear
8922,The Man from U.N.C.L.E.
1938,Stepmom


## Recommender based on genre, keywords, cast and crew

In [197]:
credits = pd.read_csv(os.path.join(movie_dir, 'credits.csv'))
keywords = pd.read_csv(os.path.join(movie_dir, 'keywords.csv'))

In [198]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [199]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [200]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [201]:
md.shape

(45463, 25)

In [202]:
md = md.merge(keywords, on='id')
md = md.merge(credits, on='id')

In [203]:
smd = md[md['id'].isin(sm)]
smd.shape

(9219, 28)

1. CREW -> we will take into consideration only DIRECTOR
2. CAST -> we will take into consideration only TOP 3 actors that appear on the list

In [204]:
smd['crew'][0]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

In [205]:
smd['crew'] = smd['crew'].apply(literal_eval)
#smd['crew'][0]
smd['cast'] = smd['cast'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['crew'] = smd['crew'].apply(literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['cast'] = smd['cast'].apply(literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['keywords'] = smd['keywords'].apply(literal_eval)


In [206]:
smd['cast_size'] = smd['cast'].apply(lambda x : len(x))
smd['crew_size'] = smd['crew'].apply(lambda x : len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['cast_size'] = smd['cast'].apply(lambda x : len(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['crew_size'] = smd['crew'].apply(lambda x : len(x))


In [207]:
smd['crew'][0]

[{'credit_id': '52fe4284c3a36847f8024f49',
  'department': 'Directing',
  'gender': 2,
  'id': 7879,
  'job': 'Director',
  'name': 'John Lasseter',
  'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f4f',
  'department': 'Writing',
  'gender': 2,
  'id': 12891,
  'job': 'Screenplay',
  'name': 'Joss Whedon',
  'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f55',
  'department': 'Writing',
  'gender': 2,
  'id': 7,
  'job': 'Screenplay',
  'name': 'Andrew Stanton',
  'profile_path': '/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f5b',
  'department': 'Writing',
  'gender': 2,
  'id': 12892,
  'job': 'Screenplay',
  'name': 'Joel Cohen',
  'profile_path': '/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f61',
  'department': 'Writing',
  'gender': 0,
  'id': 12893,
  'job': 'Screenplay',
  'name': 'Alec Sokolow',
  'profile_path': '/v79vlRYi94BZUQnkkyzn

In [208]:
# x - cast list
def get_director(x):
  for c in x:
    if c['job'] ==  'Director':
      return c['name']
  return np.nan

In [209]:
#get_director(smd['crew'][0])

In [210]:
smd['director'] = smd['crew'].apply(get_director)
#smd['director']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['director'] = smd['crew'].apply(get_director)


In [211]:
smd['cast'] = smd['cast'].apply(lambda x : [i['name'] for i in x] if isinstance(x,list) else [])
smd['cast'] = smd['cast'].apply(lambda x : x[:3] if len(x)>=3 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['cast'] = smd['cast'].apply(lambda x : [i['name'] for i in x] if isinstance(x,list) else [])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['cast'] = smd['cast'].apply(lambda x : x[:3] if len(x)>=3 else x)


In [212]:
smd['keywords'] = smd['keywords'].apply(lambda x : [i['name'] for i in x] if isinstance(x,list) else [])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['keywords'] = smd['keywords'].apply(lambda x : [i['name'] for i in x] if isinstance(x,list) else [])


In [213]:
# stripping spaces so name and surname are "a whole"
smd['cast'] = smd['cast'].apply(lambda x : [str.lower(i.replace(" ","")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x : str.lower(x.replace(" ","")))
# smd['director'] = [str(item).lower().replace(" ", "") for item in smd['director']]
# smd['cast'] = [[str(item).lower().replace(" ", "") for item in sublist] for sublist in smd['cast']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['cast'] = smd['cast'].apply(lambda x : [str.lower(i.replace(" ","")) for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['director'] = smd['director'].astype('str').apply(lambda x : str.lower(x.replace(" ","")))


In [214]:
# repeating director-s name 3 times to give it more "weight"
smd['director'] = smd['director'].apply(lambda x : [x,x,x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['director'] = smd['director'].apply(lambda x : [x,x,x])


In [215]:
s = smd.apply(lambda x : pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [216]:
s = s.value_counts()
s.head()

Unnamed: 0_level_0,count
keyword,Unnamed: 1_level_1
independent film,610
woman director,550
murder,399
duringcreditsstinger,327
based on novel,318


In [217]:
s = s[s>1]

In [218]:
def filter_keywords(x):
  words = []
  if isinstance(x, list):
    for i in x:
      if i in s:
        words.append(i)
    return words
  return []

In [219]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['keywords'] = smd['keywords'].apply(filter_keywords)


STEMMING -> the process of reducing inflected (or sometimes derived) words to their word stem

In [220]:
stemmer = SnowballStemmer('english')
#stemmer.stem('commentator')

In [221]:
smd['keywords'] = smd['keywords'].apply(lambda x : [stemmer.stem(i) for i in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['keywords'] = smd['keywords'].apply(lambda x : [stemmer.stem(i) for i in x])


In [222]:
smd['keywords'] = smd['keywords'].apply(lambda x : [str.lower(i.replace(" ", "")) for i in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['keywords'] = smd['keywords'].apply(lambda x : [str.lower(i.replace(" ", "")) for i in x])


In [223]:
smd['alldata'] = smd['director'] + smd['cast'] + smd['keywords'] + smd['genres']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smd['alldata'] = smd['director'] + smd['cast'] + smd['keywords'] + smd['genres']


In [229]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
#count_matrix = count.fit_transform(smd['alldata'])

AttributeError: 'list' object has no attribute 'lower'