In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import linear_kernel

In [2]:
df=pd.read_csv('master_dataset.csv')
df=df[:5000]

In [3]:

def get_director(x):
    """
    Extract the Name of the Director for a movie if it is present inside the job
    
    """
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [4]:
required_columns=["genres", "keywords", "overview", "title", "popularity","release_date", "crew"]
df=df[required_columns]
df.head(2)

Unnamed: 0,genres,keywords,overview,title,popularity,release_date,crew
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","Led by Woody, Andy's toys live happily in his ...",Toy Story,21.946943,1995-10-30,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",When siblings Judy and Peter discover an encha...,Jumanji,17.015539,1995-12-15,"[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."


In [5]:
import ast
df['crew']      = df['crew'].apply(ast.literal_eval)
df['director']  = df['crew'].apply(get_director)
df['keywords'] = df['keywords'].apply(ast.literal_eval)
df['keywords'] = df['keywords'].apply(lambda lst: [d['name'] for d in lst if 'name' in d])
from nltk.stem.snowball import SnowballStemmer

# Creating a stemmer object for English
stemmer = SnowballStemmer('english')

# Stem each keyword
df['keywords'] = df['keywords'].apply(
    lambda x: [stemmer.stem(i) for i in x if len(i) > 1]
)

# Convert to lowercase and remove spaces
df['keywords'] = df['keywords'].apply(
    lambda x: [i.replace(" ", "").lower() for i in x]
)

df['genres'] = df['genres'].apply(ast.literal_eval)
df['genres'] = df['genres'].apply(lambda lst: [d['name'] for d in lst if 'name' in d])
df['keywords'] = df['keywords'].apply(lambda x: ', '.join(x))
df['genres'] = df['genres'].apply(lambda x: ', '.join(x))
df.head(2)

Unnamed: 0,genres,keywords,overview,title,popularity,release_date,crew,director
0,"Animation, Comedy, Family","jealousi, toy, boy, friendship, friend, rivalr...","Led by Woody, Andy's toys live happily in his ...",Toy Story,21.946943,1995-10-30,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",John Lasseter
1,"Adventure, Fantasy, Family","boardgam, disappear, basedonchildren'sbook, ne...",When siblings Judy and Peter discover an encha...,Jumanji,17.015539,1995-12-15,"[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",Joe Johnston


In [6]:
df=df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4949 entries, 0 to 4948
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        4949 non-null   object 
 1   keywords      4949 non-null   object 
 2   overview      4949 non-null   object 
 3   title         4949 non-null   object 
 4   popularity    4949 non-null   float64
 5   release_date  4949 non-null   object 
 6   crew          4949 non-null   object 
 7   director      4949 non-null   object 
dtypes: float64(1), object(7)
memory usage: 309.4+ KB


In [7]:
df['combined']=df['genres']+' '+df['keywords']+' '+df['overview']
df.to_csv('temp.csv')
df.head(2)

Unnamed: 0,genres,keywords,overview,title,popularity,release_date,crew,director,combined
0,"Animation, Comedy, Family","jealousi, toy, boy, friendship, friend, rivalr...","Led by Woody, Andy's toys live happily in his ...",Toy Story,21.946943,1995-10-30,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",John Lasseter,"Animation, Comedy, Family jealousi, toy, boy, ..."
1,"Adventure, Fantasy, Family","boardgam, disappear, basedonchildren'sbook, ne...",When siblings Judy and Peter discover an encha...,Jumanji,17.015539,1995-12-15,"[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",Joe Johnston,"Adventure, Fantasy, Family boardgam, disappear..."


In [8]:
data=df[['title','combined','popularity','release_date','director']]
data.head(2)

Unnamed: 0,title,combined,popularity,release_date,director
0,Toy Story,"Animation, Comedy, Family jealousi, toy, boy, ...",21.946943,1995-10-30,John Lasseter
1,Jumanji,"Adventure, Fantasy, Family boardgam, disappear...",17.015539,1995-12-15,Joe Johnston


In [9]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
stop_words=set(stopwords.words('english'))

In [11]:
def preprocess_text(text):
    text=re.sub(r"[^a-zA-Z\s]","",text)
    text=text.lower()
    tokens=word_tokenize(text)
    token=[word for word in tokens if word not in stop_words]
    return " ".join(token)

In [12]:
data['cleaned_text']=data['combined'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cleaned_text']=data['combined'].apply(preprocess_text)


In [13]:
data.to_csv('filtered_data.csv', index=False)