**Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Importing Scraped Movies Data**

In [None]:
# Load the scraped movie catalogue and print its schema as a quick sanity check.
movies_data=pd.read_csv(r"/content/movies_final.csv")
movies_data.info()
# Keep only the columns that the text pre-processing works on.
# .copy() makes df an independent frame, so assigning to its columns later
# does not raise pandas' SettingWithCopyWarning.
df = movies_data[['movie','genres','director','stars','plot']].copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86655 entries, 0 to 86654
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     86655 non-null  int64  
 1   movie          86655 non-null  object 
 2   year           86655 non-null  int64  
 3   time_minute    86655 non-null  object 
 4   imdb_rating    86655 non-null  float64
 5   genres         86655 non-null  object 
 6   plot           86655 non-null  object 
 7   director       86655 non-null  object 
 8   stars          86655 non-null  object 
 9   primary_genre  86655 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 6.6+ MB


**Text Pre-Processing**

In [None]:
# Turn the comma-separated genre string into a list of lower-cased genres.
# assign() returns a new frame, which avoids pandas' SettingWithCopyWarning
# when df was created by selecting columns from another DataFrame.
df = df.assign(genres=df['genres'].map(lambda x: x.lower().split(',')))
# One column per genre slot. The column list fixes three slots, so a row with
# more than three genres would make the constructor raise.
# Reusing df's index keeps the rows aligned in the concat below even when df
# does not carry a default 0..n-1 index.
split_df = pd.DataFrame(df['genres'].tolist(), columns=['genre1', 'genre2', 'genre3'], index=df.index)
df = pd.concat([df, split_df], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Split the comma-separated cast string into a list of lower-cased names.
star_lists = df['stars'].map(lambda cast: cast.lower().split(','))
df['stars'] = star_lists
# Spread each list over five columns, one per star slot.
star_columns = ['star1', 'star2', 'star3','star4','star5']
split_df = pd.DataFrame(star_lists.tolist(), columns=star_columns)
df = pd.concat([df, split_df], axis=1)

In [None]:
# Normalise case: lower-case the plot text first, then the movie title.
for text_col in ('plot', 'movie'):
    df[text_col] = df[text_col].map(lambda text: text.lower())

In [None]:
# Merge first and last name of each actor into a single token
# (e.g. "tom hanks" -> "tomhanks").
# Whole columns are assigned instead of writing to the rows yielded by
# df.iterrows(): those rows can be copies, in which case the writes never
# reach df.
for star_col in ['star1', 'star2', 'star3', 'star4', 'star5']:
    df[star_col] = df[star_col].map(
        # Empty star slots (non-string values such as None) pass through unchanged.
        lambda name: name.lower().replace(' ', '') if isinstance(name, str) else name
    )

In [None]:
# Remove the list-valued 'genres' and 'stars' columns.
df = df.drop(columns=['genres', 'stars'])

In [None]:
# Keep only the first row for each movie title.
df = df.drop_duplicates(subset='movie', keep='first')

In [None]:
# Use the movie title as the row index.
df = df.set_index('movie')

In [None]:
# Separate every column of each row into word tokens and merge them into
# one token list per movie.
bag_of_words=[]
for index, row in df.iterrows():
    words = []
    for col in df.columns:
        value = row[col]
        # Skip empty genre/star slots: str(None) / str(nan) would otherwise add
        # the literal tokens "None" / "nan" to the movie's keywords.
        # (value != value is True only for NaN.)
        if value is None or (isinstance(value, float) and value != value):
            continue
        words.extend(nltk.word_tokenize(str(value)))
    bag_of_words.append(words)

In [None]:
# Attach each movie's keyword list to the frame as the 'bag_of_words' column.
df = df.assign(bag_of_words=bag_of_words)

In [None]:
# Drop the individual feature columns and keep the rest of the frame as df1.
old_columns = [
    'director', 'plot',
    'genre1', 'genre2', 'genre3',
    'star1', 'star2', 'star3', 'star4', 'star5',
]
df1 = df.drop(columns=old_columns)

In [None]:
# Tokens that are removed from every bag of words before lemmatisation.
ignore_letters = [
    ',',
    '.',
    '[',
    ']',
    '!',
    "'s",
]

In [None]:
# Drop the tokens listed in ignore_letters, then lower-case and lemmatise the rest.
# Whole columns are assigned instead of writing to the rows yielded by
# df1.iterrows(): pandas does not guarantee that such writes reach the frame
# (the row may be a copy).
for col in df1.columns:
    df1[col] = df1[col].map(
        lambda tokens: [lemmatizer.lemmatize(w.lower()) for w in tokens if w not in ignore_letters]
    )

In [None]:
# Making the corpus for each movie: join its token list into one
# space-separated string.
# Whole columns are assigned instead of writing to the rows yielded by
# df1.iterrows(), because those rows may be copies of the underlying data.
for col in df1.columns:
    df1[col] = df1[col].map(" ".join)

In [None]:
# Show the processed row of the first movie as a sample.
df1.head(1)

Unnamed: 0_level_0,bag_of_words
movie,Unnamed: 1_level_1
shang-chi and the legend of the ten rings,destin daniel cretton shang-chi the master of ...


In [None]:
# Keep only the first 15000 movies for vectorisation.
# NOTE(review): presumably a size cap to keep the matrix manageable — confirm.
df2 = df1.head(15000)

**Text Vectorization**

In [None]:
# Build the term-count matrix from each movie's keyword string.
corpus = df2['bag_of_words']
count = CountVectorizer()
count_matrix = count.fit_transform(corpus)
# Movie titles in row order: position i holds the title of df2's row i.
indices = pd.Series(df2.index)

In [None]:
# Now storing the vector matrix and the movie titles as pickle files.
# The "with" blocks close each file as soon as it has been written.
# The ".pickel" spelling of the file names is kept on purpose: any code that
# loads these files has to use the same names.
import pickle
with open("count_matrix.pickel", "wb") as matrix_file:
    pickle.dump(count_matrix, matrix_file)
with open("indices.pickel", "wb") as indices_file:
    pickle.dump(indices, indices_file)