In [45]:
import string
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
"""
Build a bag-of-words count matrix (unigrams) from the plot_keywords column.
"""

# dtype=str keeps every column as text, so numeric-looking fields are not parsed.
data = pd.read_csv('movie_metadata.csv', dtype=str)

# Missing keyword entries become a single space so the string operations
# below never receive NaN.
corpus = data['plot_keywords'].fillna(" ")

# Stem each whitespace-separated token and re-join with single spaces.
# NOTE(review): splitting is on whitespace only. If plot_keywords is
# pipe-delimited like the genres column, a token such as "foo|bar" is stemmed
# as one unit -- confirm against the raw data.
st = PorterStemmer()
corpus = corpus.apply(lambda x: " ".join(st.stem(word) for word in x.split()))

# Count word tokens using the vectorizer's default token pattern (tokens of
# two or more alphanumeric characters), dropping English stop words.
# Terms that appear in more than 75% of the documents (max_df) or in fewer
# than 1% of them (min_df) are ignored.
# strip_accents must be the *string* 'ascii': the bare name `ascii` is the
# Python builtin, which scikit-learn would accept as a custom callable and
# which escapes accented characters (e.g. "\xe9") instead of folding them.
vectorizer = CountVectorizer(
    stop_words='english',
    strip_accents='ascii',
    analyzer='word',
    max_df=0.75,
    min_df=.01,
)

# fit_transform returns a sparse matrix; toarray() already yields a dense ndarray.
bow = vectorizer.fit_transform(corpus).toarray()

In [54]:
"""
append plot_keywords document term matrix to the original data set
"""
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the newer method when available and fall back
# to the old one on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One column per retained term, one row per movie. Reusing data.index makes the
# column-wise concat below align rows explicitly instead of relying on both
# frames happening to share a default index.
keywords_dtm = pd.DataFrame(bow, columns=feature_names, index=data.index)
print(keywords_dtm.head(20))
data_with_keywords = pd.concat([data, keywords_dtm], axis=1)

    actor  agent  alien  american  base  battle  blood  book  box  boy  ...   \
0       0      0      0         0     0       0      0     0    0    0  ...    
1       0      0      0         0     0       0      0     0    0    0  ...    
2       0      0      0         0     0       0      0     0    0    0  ...    
3       0      0      0         0     0       0      0     0    0    0  ...    
4       0      0      0         0     0       0      0     0    0    0  ...    
5       0      0      1         1     0       0      0     0    0    0  ...    
6       0      0      0         0     0       0      0     0    0    0  ...    
7       0      0      0         0     0       0      0     0    0    0  ...    
8       0      0      0         0     0       0      0     1    0    0  ...    
9       0      0      0         0     0       0      1     1    0    0  ...    
10      0      0      0         0     1       0      0     1    0    0  ...    
11      0      0      0         0     0 

In [48]:
"""extracting individual genre tags from genre column"""

# The genres column holds "|"-separated tags (e.g. "Action|Adventure").
# str.get_dummies builds the 0/1 indicator matrix in one vectorized step:
# one row per movie, one column per distinct tag, columns in sorted order so
# the layout of the exported file is the same on every run.
# Original author's note: no blanks in genres column, so no fillna is needed.
genre_df = data["genres"].str.get_dummies(sep="|")

# Kept for any later cells that use the per-movie tag lists or the tag set.
genres_list = data["genres"].str.split("|", expand=False)
genre_set = set(genre_df.columns)

#append
final_data = pd.concat([data_with_keywords, genre_df], axis=1)

print(final_data.head(5))

   color      director_name num_critic_for_reviews duration  \
0  Color      James Cameron                    723      178   
1  Color     Gore Verbinski                    302      169   
2  Color         Sam Mendes                    602      148   
3  Color  Christopher Nolan                    813      164   
4    NaN        Doug Walker                    NaN      NaN   

  director_facebook_likes actor_3_facebook_likes      actor_2_name  \
0                       0                    855  Joel David Moore   
1                     563                   1000     Orlando Bloom   
2                       0                    161      Rory Kinnear   
3                   22000                  23000    Christian Bale   
4                     131                    NaN        Rob Walker   

  actor_1_facebook_likes      gross                           genres   ...    \
0                   1000  760505847  Action|Adventure|Fantasy|Sci-Fi   ...     
1                  40000  309404152     

In [49]:
# Write the combined table (original columns + keyword counts + genre flags)
# to disk, including the header row.
final_data.to_csv(
    "cleaned_movie_data.csv",
    header=True,
)