In [1]:
import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import remove_stopwords

In [38]:
# Load both raw CSV files and drop the columns listed below.
rotten_unused_columns = [
    'rotten_tomatoes_link', 'content_rating', 'actors', 'streaming_release_date',
    'movie_info', 'critics_consensus', 'genres', 'directors', 'authors',
    'production_company', 'tomatometer_fresh_critics_count',
    'tomatometer_rotten_critics_count',
]  # 'original_release_date' is deliberately not in this list, so it is kept
rotten = pd.read_csv('../data/rotten_tomatoes_movies.csv').drop(columns=rotten_unused_columns)

meta_unused_columns = [
    'production_countries', 'overview', 'tagline', 'belongs_to_collection',
    'homepage', 'revenue', 'spoken_languages', 'video', 'poster_path',
    'production_companies',
]
meta_all = pd.read_csv('../data/movies_meta.csv')
# Keep only rows whose original language is English, then drop the unused columns.
meta = meta_all.loc[meta_all['original_language'] == 'en'].drop(columns=meta_unused_columns)
# 'release_date' is left exactly as read_csv returned it; no datetime conversion happens here.
print(rotten)

                                             movie_title  \
0      Percy Jackson & the Olympians: The Lightning T...   
1                                            Please Give   
2                                                     10   
3                        12 Angry Men (Twelve Angry Men)   
4                           20,000 Leagues Under The Sea   
...                                                  ...   
17707                                          Zoot Suit   
17708                                           Zootopia   
17709                                    Zorba the Greek   
17710                                               Zulu   
17711                                          Zulu Dawn   

      original_release_date  runtime tomatometer_status  tomatometer_rating  \
0                2010-02-12    119.0             Rotten                49.0   
1                2010-04-30     90.0    Certified-Fresh                87.0   
2                1979-10-05    122.0      

In [39]:
# column for drama (true or false)

def _mentions_drama(genres):
    """Return ``"Drama" in genres``, or False when ``genres`` does not support ``in``."""
    try:
        return "Drama" in genres
    except TypeError:
        # Missing entries (float NaN, None) are not containers; treat them as "not drama"
        # instead of aborting the whole pass.
        return False


def drama(df):
    """
    Return a re-indexed copy of ``df`` with an added ``drama`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``genres`` column. Each entry is tested with
        ``"Drama" in entry`` (a substring test for strings).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, the original columns, and a
        ``drama`` column that is True where the genres entry mentions "Drama".
        Entries that cannot be searched (NaN/None) are flagged False.
        The input frame is not modified.
    """
    # drop=True discards the old index directly, so no temporary 'index' column
    # has to be created and removed again (which broke on frames with a named index).
    flagged = df.reset_index(drop=True)
    flagged['drama'] = [_mentions_drama(genres) for genres in flagged['genres']]
    return flagged

# Flag every row of `meta`, then keep only the rows whose 'drama' flag is True.
meta_flagged = drama(meta)
is_drama = meta_flagged['drama'] == True
meta_drama = meta_flagged.loc[is_drama]
print(meta_drama)

      adult    budget                                             genres  \
3     False   3600000  [{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...   
5     False         0  [{'id': 36, 'name': 'History'}, {'id': 18, 'na...   
6     False  11000000                      [{'id': 18, 'name': 'Drama'}]   
8     False  12000000  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
11    False  55000000  [{'id': 12, 'name': 'Adventure'}, {'id': 16, '...   
...     ...       ...                                                ...   
3591  False         0  [{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...   
3599  False         0  [{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...   
3600  False         0  [{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...   
3603  False         0                      [{'id': 18, 'name': 'Drama'}]   
3606  False         0  [{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...   

          id    imdb_id original_language     original_title  popularity  \
3        45

In [40]:
# Keep only rows whose release_date compares greater than '2005-01-01'.
# NOTE(review): release_date is never converted to datetime in this notebook, so this
# looks like a string comparison; that orders correctly only for 'YYYY-MM-DD' values
# -- confirm the column format.
released_after_cutoff = meta_drama['release_date'] > '2005-01-01'
meta_drama = meta_drama.loc[released_after_cutoff]
print(meta_drama)

      adult    budget                                             genres  \
490   False         0  [{'id': 10749, 'name': 'Romance'}, {'id': 18, ...   
1250  False  29000000  [{'id': 18, 'name': 'Drama'}, {'id': 9648, 'na...   
1282  False  20000000  [{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...   
1290  False  14000000  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
1296  False  56000000  [{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...   
...     ...       ...                                                ...   
3588  False         0  [{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...   
3591  False         0  [{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...   
3599  False         0  [{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...   
3600  False         0  [{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...   
3603  False         0                      [{'id': 18, 'name': 'Drama'}]   

          id    imdb_id original_language         original_title  popularity  \
490   1

In [41]:
#clean titles

def clean_title(x):
    '''
    Normalize a movie title for matching.

    Removes every occurrence of the characters , ' . : ; ( ) / ! ? % - _ =
    from the string ``x`` and returns the result in lower case. All other
    characters, including spaces and "&", are kept as they are.
    '''
    stripped_characters = ",'.:;()/!?%-_="
    removal_table = str.maketrans("", "", stripped_characters)
    return x.translate(removal_table).lower()
 
# Normalize the title columns that the merge later joins on.
# `.assign` returns new frames rather than writing a column into `meta_drama`,
# which is the result of boolean filtering; assigning into such a slice makes
# pandas emit SettingWithCopyWarning. Column order and values are unchanged.
rotten = rotten.assign(movie_title=rotten['movie_title'].apply(clean_title))
meta_drama = meta_drama.assign(title=meta_drama['title'].apply(clean_title))


# NOTE: the triple-quoted string below is disabled code, not a docstring. Nothing inside
# it runs, so `clean_genres` is never defined and the 'genres' column is left unchanged.
# Because the string is the last expression in the cell, its text is echoed as the cell output.
'''
# clean genres

def clean_genres(x):
  
    Method to clean the genres variable in meta dataset
    
    for char in ["}", "'", ",", ']']:
        x=x.replace(char, "")     
    x=list(x.split())
    genres=[]
    for i, word in enumerate(x):

        if (i+1)%4==0:
            genres.append(word)
    return (genres)
   
#meta['genres']=meta['genres'].apply(clean_genres)
meta_drama['genres']=meta_drama['genres'].apply(clean_genres)

'''

'\n# clean genres\n\ndef clean_genres(x):\n  \n    Method to clean the genres variable in meta dataset\n    \n    for char in ["}", "\'", ",", \']\']:\n        x=x.replace(char, "")     \n    x=list(x.split())\n    genres=[]\n    for i, word in enumerate(x):\n\n        if (i+1)%4==0:\n            genres.append(word)\n    return (genres)\n   \n#meta[\'genres\']=meta[\'genres\'].apply(clean_genres)\nmeta_drama[\'genres\']=meta_drama[\'genres\'].apply(clean_genres)\n\n'

In [42]:
def combine(df1, df2):
    """
    Inner-join two frames on their title columns.

    Rows are matched where ``df1['movie_title']`` equals ``df2['title']``.
    The ``movie_title`` key is removed from the result, so only ``title``
    remains as the title column.
    """
    return (
        df1
        .merge(df2, how="inner", left_on='movie_title', right_on='title')
        .drop(columns=['movie_title'])
    )

# Join the Rotten Tomatoes frame with the filtered metadata frame on their title
# columns, then list the resulting columns. Column names present in both inputs
# (e.g. 'runtime') come back with pandas' default '_x' / '_y' suffixes.
combined=combine(rotten, meta_drama)
print(combined.columns)

Index(['original_release_date', 'runtime_x', 'tomatometer_status',
       'tomatometer_rating', 'tomatometer_count', 'audience_status',
       'audience_rating', 'audience_count', 'tomatometer_top_critics_count',
       'adult', 'budget', 'genres', 'id', 'imdb_id', 'original_language',
       'original_title', 'popularity', 'release_date', 'runtime_y', 'status',
       'title', 'vote_average', 'vote_count', 'drama'],
      dtype='object')


In [43]:
# Plain-text dump of the merged frame for a visual check (pandas truncates long output).
print(combined)

    original_release_date  runtime_x tomatometer_status  tomatometer_rating  \
0              2007-01-27       90.0             Rotten                55.0   
1              2018-06-22       79.0             Rotten                40.0   
2              2009-05-08      108.0             Rotten                25.0   
3              2007-10-26      101.0             Rotten                34.0   
4              2008-09-08       96.0             Rotten                32.0   
..                    ...        ...                ...                 ...   
569            2014-10-10      102.0             Rotten                48.0   
570            2017-02-03       99.0             Rotten                45.0   
571            2007-12-14      125.0             Rotten                32.0   
572            2013-09-20      107.0             Rotten                46.0   
573            2011-01-19       93.0             Rotten                55.0   

     tomatometer_count audience_status  audience_ra

In [44]:
# Write the merged frame to the current working directory. `index=False` is not passed,
# so the row index is written as the first, unnamed column of the CSV.
combined.to_csv("meta_rotten_drama.csv")