In [13]:
# Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

In [14]:
# 1. Load the raw data
# Both CSV files are expected to live in the shared data directory.
data_path = '../data'
movies_csv = os.path.join(data_path, 'rotten_tomatoes_movies_with_oscars.csv')
critic_reviews_csv = os.path.join(data_path, 'rotten_tomatoes_critic_reviews.csv')

rt_movies_oscars = pd.read_csv(movies_csv)
rt_critic = pd.read_csv(critic_reviews_csv)

In [15]:
# 2. Clean data types

# Parse the release dates and derive quarter / month features from them.
# The vectorised ``.dt`` accessors give NaN where the date is missing.
rt_movies_oscars['original_release_date'] = pd.to_datetime(rt_movies_oscars['original_release_date'])
rt_movies_oscars['streaming_release_date'] = pd.to_datetime(rt_movies_oscars['streaming_release_date'])
rt_movies_oscars['original_release_quarter'] = rt_movies_oscars['original_release_date'].dt.quarter
rt_movies_oscars['streaming_release_quarter'] = rt_movies_oscars['streaming_release_date'].dt.quarter
rt_movies_oscars['original_release_month'] = rt_movies_oscars['original_release_date'].dt.month
rt_movies_oscars['streaming_release_month'] = rt_movies_oscars['streaming_release_date'].dt.month

# Encode the Oscar nomination flag as 0/1.
# The comparison goes through the upper-cased string form so that the text
# 'FALSE', the boolean False (what read_csv produces when it infers a bool
# column) and an already-encoded 0 all map to 0. Everything else maps to 1.
# NOTE(review): as before, a missing value is also coded 1 -- confirm that
# this is intended.
not_nominated_tokens = {'FALSE', '0', '0.0'}
rt_movies_oscars['oscar_nomination'] = [
    0 if str(x).strip().upper() in not_nominated_tokens else 1
    for x in rt_movies_oscars['oscar_nomination']
]


def _status_to_binary(status):
    """Map a tomatometer status to 1 (contains 'Fresh'), 0 (contains 'Rotten') or NaN."""
    text = str(status)
    if 'Fresh' in text:
        return 1
    if 'Rotten' in text:
        return 0
    return np.nan


# Make tomatometer status binary (0 = Rotten, 1 = Fresh); anything else is NaN
# and is removed by the dropna further down.
rt_movies_oscars['tomatometer_status'] = [
    _status_to_binary(x) for x in rt_movies_oscars['tomatometer_status']
]

# Encode the content rating
content_rating_categories = rt_movies_oscars['content_rating'].unique()
print(np.unique(rt_movies_oscars['content_rating'], return_counts=True))
content_beforeEncode = np.unique(rt_movies_oscars['content_rating'])

# Transform the categorical content rating into integer codes
rating_encoder = LabelEncoder()
rt_movies_oscars['content_rating'] = rating_encoder.fit_transform(rt_movies_oscars['content_rating'])
print(np.unique(rt_movies_oscars['content_rating'], return_counts=True))
content_afterEncode = np.unique(rt_movies_oscars['content_rating'])

# Build the original -> encoded lookup straight from the fitted encoder so the
# pairing does not depend on two independently sorted arrays lining up.
content_map = pd.DataFrame({'orig': rating_encoder.classes_,
                            'encoded': rating_encoder.transform(rating_encoder.classes_)})
display(content_map)

# Keep movies originally released on or after 1998-01-01
release_cutoff = pd.Timestamp('1998-01-01')
rt_movies_oscars = rt_movies_oscars[rt_movies_oscars['original_release_date'] >= release_cutoff]

# Columns needed for feature generation
selected_cols = ['rotten_tomatoes_link', 'movie_title', 'movie_info', 'runtime',
       'critics_consensus', 'content_rating', 'genres', 'original_release_date', 'streaming_release_date',
       'production_company', 'tomatometer_status', 'tomatometer_rating',
       'tomatometer_count', 'audience_status', 'audience_rating',
       'audience_count', 'tomatometer_top_critics_count',
       'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count',
       'oscar_nomination', 'original_release_quarter', 'streaming_release_quarter',
       'original_release_month', 'streaming_release_month']

# Drop every row that has a missing value in any selected column
filtered_df = rt_movies_oscars.loc[:, selected_cols].dropna(axis=0)
display(filtered_df.head())

(array(['G', 'NC17', 'NR', 'PG', 'PG-13', 'R'], dtype=object), array([ 676,   38, 5474, 2168, 2979, 6377], dtype=int64))
(array([0, 1, 2, 3, 4, 5]), array([ 676,   38, 5474, 2168, 2979, 6377], dtype=int64))


Unnamed: 0,orig,encoded
0,G,0
1,NC17,1
2,NR,2
3,PG,3
4,PG-13,4
5,R,5


Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,runtime,critics_consensus,content_rating,genres,original_release_date,streaming_release_date,production_company,...,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,oscar_nomination,original_release_quarter,streaming_release_quarter,original_release_month,streaming_release_month
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",119.0,Though it may seem like just another Harry Pot...,3,"Action & Adventure, Comedy, Drama, Science Fic...",2010-02-12,2015-11-25,20th Century Fox,...,53.0,254421.0,43,73,76,0,1.0,4.0,2.0,11.0
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,90.0,Nicole Holofcener's newest might seem slight i...,5,Comedy,2010-04-30,2012-09-04,Sony Pictures Classics,...,64.0,11574.0,44,123,19,0,2.0,3.0,4.0,9.0
5,m/10000_bc,"10,000 B.C.",Mammoth hunter D'Leh (Steven Strait) has long ...,109.0,With attention strictly paid to style instead ...,4,"Action & Adventure, Classics, Drama",2008-03-07,2013-06-22,Warner Bros. Pictures,...,37.0,411140.0,37,12,137,0,1.0,2.0,3.0,6.0
12,m/10002516-lost_city,The Lost City,"Fico Fellove (Andy Garcia), an apolitical Hava...",143.0,"Its heart is in the right place, but what star...",5,Drama,2005-09-03,2017-03-23,Magnolia Pictures,...,64.0,25944.0,37,21,62,0,3.0,1.0,9.0,3.0
15,m/10002635-bridge_of_san_luis_rey,The Bridge of San Luis Rey,"During the Spanish Inquisition, Franciscan mon...",124.0,Despite an all-star cast and some impressive v...,3,"Art House & International, Drama, Faith & Spir...",2005-06-10,2014-07-24,Fine Line Features,...,35.0,1935.0,13,1,23,0,2.0,3.0,6.0,7.0


In [16]:
# Feature engineering, part 1
#############################

def _split_labels(value):
    """Split a ', '-separated string into a list of labels.

    A value whose string form is 'nan' (a missing entry) yields an empty
    list, so it contributes no labels and a weight sum of 0.
    """
    text = str(value)
    return text.split(', ') if text != 'nan' else []


def _label_weights(label_lists):
    """Count how often each label occurs across all rows.

    Returns a DataFrame with one row per distinct label (first column) and
    its number of occurrences (column 'weights'), most frequent first.
    """
    flat_labels = [label for labels in label_lists for label in labels]
    counts = pd.Series(flat_labels, dtype=object).value_counts()
    # reset_index(name=...) fixes the column name explicitly, so the result
    # does not depend on how pandas names the value_counts output.
    return counts.rename_axis('label').reset_index(name='weights')


def _weight_sums(label_lists, weights):
    """Return, for each row, the summed weight of that row's labels."""
    # One dict lookup per label instead of filtering the weights frame each time.
    lookup = dict(zip(weights.iloc[:, 0], weights['weights']))
    return [int(sum(lookup[label] for label in labels)) for labels in label_lists]


# Genre encoding: each genre is weighted by its number of occurrences in the
# dataset, and a movie's score is the sum of the weights of its genres.
filtered_df['genres_list'] = [_split_labels(x) for x in tqdm(filtered_df.genres)]
all_genres = np.array([genre for genres in filtered_df.genres_list for genre in genres])

genres_weights = _label_weights(filtered_df.genres_list)
genres_weights.columns = ['genres', 'weights']

genres_sums = _weight_sums(tqdm(filtered_df.genres_list), genres_weights)
filtered_df['genres_sums'] = genres_sums

# Same encoding for the production company
filtered_df['prodComp_list'] = [_split_labels(x) for x in tqdm(filtered_df.production_company)]
all_prodComp = np.array([company for companies in filtered_df.prodComp_list for company in companies])

prodComp_weights = _label_weights(filtered_df.prodComp_list)
prodComp_weights.columns = ['prodComp', 'weights']

prodComp_sums = _weight_sums(tqdm(filtered_df.prodComp_list), prodComp_weights)
filtered_df['prodComp_sums'] = prodComp_sums

# Absolute distance of the runtime from a 100-minute reference length.
# NOTE(review): 100 is a hard-coded constant, not the mean runtime computed
# from the data, despite the column name -- confirm this is the intended value.
filtered_df['runtime_difference_from_average'] = (filtered_df['runtime'] - 100).abs()


100%|██████████| 6650/6650 [00:00<00:00, 1107577.40it/s]
6650it [00:03, 1890.43it/s]
100%|██████████| 6650/6650 [00:00<00:00, 72216.91it/s]
6650it [00:02, 3017.22it/s]


In [17]:
# Bring in the actor / director features

actor_director_csv = os.path.join(data_path, 'actor_director_weights_wTitle_v2.csv')
actorDirec_df = pd.read_csv(actor_director_csv)

# Inner join on rotten_tomatoes_link: only movies present in BOTH tables are
# kept, and the merged frame gets a fresh 0..n-1 index.
actor_director_cols = ['rotten_tomatoes_link', 'directors_counts', 'actors_counts',
                       'actors_value', 'directors_value']
filtered_df = filtered_df.merge(actorDirec_df[actor_director_cols],
                                how='inner', on='rotten_tomatoes_link')

# Character counts of the movie title and of the critics' consensus text
filtered_df['title_length'] = filtered_df['movie_title'].map(len)
filtered_df['critic_consensus_length'] = filtered_df['critics_consensus'].map(len)
print(filtered_df.columns)

Index(['rotten_tomatoes_link', 'movie_title', 'movie_info', 'runtime',
       'critics_consensus', 'content_rating', 'genres',
       'original_release_date', 'streaming_release_date', 'production_company',
       'tomatometer_status', 'tomatometer_rating', 'tomatometer_count',
       'audience_status', 'audience_rating', 'audience_count',
       'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count',
       'tomatometer_rotten_critics_count', 'oscar_nomination',
       'original_release_quarter', 'streaming_release_quarter',
       'original_release_month', 'streaming_release_month', 'genres_list',
       'genres_sums', 'prodComp_list', 'prodComp_sums',
       'runtime_difference_from_average', 'directors_counts', 'actors_counts',
       'actors_value', 'directors_value', 'title_length',
       'critic_consensus_length'],
      dtype='object')


In [18]:
# Build one-hot genre indicators from the comma-separated genres column
filtered_df['genres'] = filtered_df['genres'].astype(str, errors='ignore')

# One token list per movie: each genre is lower-cased with all whitespace
# removed (e.g. 'Science Fiction & Fantasy' -> 'sciencefiction&fantasy').
# Iterating over the values directly keeps this independent of the frame's
# index; `series[i]` is a label lookup and only works on a 0..n-1 index.
features_arr = [
    [''.join(token.lower() for token in genre.split()) for genre in genres.split(',')]
    for genres in filtered_df['genres']
]

# Rows of features_column follow the row order of filtered_df; columns follow
# mlb.classes_.
mlb = MultiLabelBinarizer()
features_column = mlb.fit_transform(features_arr)

In [19]:
# Attach the genre indicator columns, labelled by the classes the binarizer
# actually learned. The columns of features_column are in mlb.classes_ order,
# so taking the names from there guarantees each indicator gets the right
# label even if the set of genres in the data changes.
genre_columns = list(mlb.classes_)
filtered_df[genre_columns] = features_column

In [20]:
# Features table: pick the final modelling columns, in a fixed order

# Identifiers, raw attributes, ratings/counts, target and engineered features
metadata_feature_cols = [
    'rotten_tomatoes_link', 'movie_title', 'movie_info',
    'runtime', 'runtime_difference_from_average',
    'critics_consensus', 'content_rating',
    'original_release_date', 'streaming_release_date',
    'original_release_quarter', 'streaming_release_quarter',
    'original_release_month', 'streaming_release_month',
    'tomatometer_status', 'tomatometer_rating', 'tomatometer_count',
    'audience_status', 'audience_rating', 'audience_count',
    'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count',
    'tomatometer_rotten_critics_count', 'oscar_nomination',
    'genres_sums', 'prodComp_sums', 'title_length', 'critic_consensus_length',
    'directors_counts', 'actors_counts', 'actors_value', 'directors_value',
]

# One-hot genre indicator columns
genre_feature_cols = [
    'action&adventure', 'animation', 'anime&manga', 'arthouse&international',
    'classics', 'comedy', 'cultmovies', 'documentary', 'drama',
    'faith&spirituality', 'gay&lesbian', 'horror', 'kids&family',
    'musical&performingarts', 'mystery&suspense', 'romance',
    'sciencefiction&fantasy', 'specialinterest', 'sports&fitness',
    'television', 'western',
]

features = filtered_df[metadata_feature_cols + genre_feature_cols]
display(features.head())
print('features shape: ', features.shape)

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,runtime,runtime_difference_from_average,critics_consensus,content_rating,original_release_date,streaming_release_date,original_release_quarter,...,horror,kids&family,musical&performingarts,mystery&suspense,romance,sciencefiction&fantasy,specialinterest,sports&fitness,television,western
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",119.0,19.0,Though it may seem like just another Harry Pot...,3,2010-02-12,2015-11-25,1.0,...,0,0,0,0,0,1,0,0,0,0
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,90.0,10.0,Nicole Holofcener's newest might seem slight i...,5,2010-04-30,2012-09-04,2.0,...,0,0,0,0,0,0,0,0,0,0
2,m/10000_bc,"10,000 B.C.",Mammoth hunter D'Leh (Steven Strait) has long ...,109.0,9.0,With attention strictly paid to style instead ...,4,2008-03-07,2013-06-22,1.0,...,0,0,0,0,0,0,0,0,0,0
3,m/10002516-lost_city,The Lost City,"Fico Fellove (Andy Garcia), an apolitical Hava...",143.0,43.0,"Its heart is in the right place, but what star...",5,2005-09-03,2017-03-23,3.0,...,0,0,0,0,0,0,0,0,0,0
4,m/10002635-bridge_of_san_luis_rey,The Bridge of San Luis Rey,"During the Spanish Inquisition, Franciscan mon...",124.0,24.0,Despite an all-star cast and some impressive v...,3,2005-06-10,2014-07-24,2.0,...,0,0,0,0,0,0,0,0,0,0


features shape:  (6510, 52)


In [22]:
# Persist the features table to the data directory (row index not written)
features_csv_path = os.path.join(data_path, 'features.csv')
features.to_csv(features_csv_path, index=False)