In [2]:
import ast

import numpy as np
import pandas as pd

from gensim.parsing.preprocessing import remove_stopwords



In [3]:
# Load both datasets and drop the columns that are not used further down.
rotten = pd.read_csv('../data/rotten_tomatoes_movies.csv')
rotten = rotten.drop(columns=[
    'rotten_tomatoes_link', 'content_rating', 'actors', 'streaming_release_date',
    'movie_info', 'critics_consensus', 'genres', 'directors', 'authors', 'original_release_date',
    'production_company', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count',
])


meta = pd.read_csv('../data/movies_meta.csv')
meta = meta.loc[meta['original_language'] == 'en']  # keep only English-language movies
# 'homepage' used to appear twice in this list; each column is now named once.
meta = meta.drop(columns=[
    'production_countries', 'overview', 'tagline', 'belongs_to_collection', 'homepage',
    'revenue', 'spoken_languages', 'video', "poster_path", 'production_companies',
])
# Parse the release dates so they are datetimes rather than raw strings.
meta['release_date'] = pd.to_datetime(meta['release_date'])


In [4]:


#clean titles

def clean_title(x):
    '''
    Normalize a movie title so that titles from different sources compare equal.

    A fixed set of punctuation characters is deleted (nothing is inserted in
    their place) and the remaining text is lower-cased.

    x: the title as a string.
    Returns the normalized title string.
    '''
    punctuation = ",'.:;()/!?%-_="
    # A translation table with only a "delete" set removes every listed
    # character in a single pass over the string.
    stripped = x.translate(str.maketrans("", "", punctuation))
    return stripped.lower()
 
# Normalize the title column of both frames with clean_title.
rotten['movie_title'] = rotten['movie_title'].map(clean_title)
meta['title'] = meta['title'].map(clean_title)


# column for drama (true or false)

def drama(df):
    '''
    Flag every row of ``df`` whose ``genres`` value mentions "Drama".

    The frame is first passed through ``reset_index()``, so the returned frame
    has a fresh 0..n-1 index and carries the previous index as an extra
    column. The frame passed in by the caller is not modified. The number of
    processed rows is printed.

    df: DataFrame with a ``genres`` column. Each value is tested with
        ``"Drama" in value``, so strings (substring test) and lists
        (membership test) both work. Values that do not support ``in``
        (e.g. NaN or None) are flagged False instead of raising TypeError.

    Returns the re-indexed frame with an added boolean ``drama`` column.
    '''
    def _mentions_drama(value):
        # Missing / non-container values cannot mention a genre.
        try:
            return "Drama" in value
        except TypeError:
            return False

    df = df.reset_index()
    flags = [_mentions_drama(value) for value in df['genres']]

    print(len(flags))
    # reset_index() gave the frame a fresh positional index, so the flags line
    # up row by row; dtype=bool keeps the column boolean even for empty input.
    df['drama'] = pd.Series(flags, index=df.index, dtype=bool)
    return df

# Flag the drama rows, then keep only those rows and print the subset.
meta_drama = drama(meta).loc[lambda flagged: flagged['drama'].eq(True)]
print(meta_drama)


# clean genres

def clean_genres(x):
    '''
    Method to clean the genres variable in meta dataset.

    Turns the stringified list of genre dicts, e.g.
    "[{'id': 18, 'name': 'Drama'}, {'id': 878, 'name': 'Science Fiction'}]",
    into the list of genre names: ['Drama', 'Science Fiction'].

    The value is parsed as a Python literal instead of being split on
    whitespace, so genre names made of several words stay intact.

    x: the raw genres value. A list is also accepted (its string entries are
       kept as they are), which makes it safe to apply the function twice.

    Returns a list of genre-name strings; an empty list when the value is
    missing or cannot be parsed.
    '''
    if isinstance(x, str):
        try:
            # literal_eval only evaluates literals, it never executes code.
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError):
            return []
    else:
        parsed = x

    if not isinstance(parsed, (list, tuple)):
        return []

    genres = []
    for entry in parsed:
        if isinstance(entry, dict) and 'name' in entry:
            genres.append(str(entry['name']))
        elif isinstance(entry, str):
            # entry is already a bare genre name (value was cleaned before)
            genres.append(entry)
    return genres
   
        
# Replace the raw genres value of every row with its list of genre names.
meta['genres'] = meta['genres'].map(clean_genres)

3608
      index  adult    budget  \
3         3  False   3600000   
5         5  False         0   
6         6  False  11000000   
8         8  False  12000000   
11       11  False  55000000   
...     ...    ...       ...   
3591   4660  False         0   
3599   4677  False         0   
3600   4678  False         0   
3603   4684  False         0   
3606   4688  False         0   

                                                 genres      id    imdb_id  \
3     [{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...     451  tt0113627   
5     [{'id': 36, 'name': 'History'}, {'id': 18, 'na...   47018  tt0112637   
6                         [{'id': 18, 'name': 'Drama'}]     687  tt0112818   
8     [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...    9603  tt0112697   
11    [{'id': 12, 'name': 'Adventure'}, {'id': 16, '...   10530  tt0114148   
...                                                 ...     ...        ...   
3591  [{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...   26792 

In [4]:
def combine(df1, df2):
    '''
    Inner-join two frames on their title columns.

    df1: frame with a ``movie_title`` column (left side of the join).
    df2: frame with a ``title`` column (right side of the join).

    Returns the joined frame without the ``movie_title`` column; the matching
    ``title`` column from ``df2`` is kept.
    '''
    return (
        df1.merge(df2, how="inner", left_on='movie_title', right_on='title')
        .drop(columns=['movie_title'])
    )

# Join the two sources on their title columns and list the resulting columns.
combined = combine(df1=rotten, df2=meta)
print(combined.columns)

Index(['runtime_x', 'tomatometer_status', 'tomatometer_rating',
       'tomatometer_count', 'audience_status', 'audience_rating',
       'audience_count', 'tomatometer_top_critics_count', 'adult', 'budget',
       'genres', 'id', 'imdb_id', 'original_language', 'original_title',
       'popularity', 'release_date', 'runtime_y', 'status', 'title',
       'vote_average', 'vote_count'],
      dtype='object')
