In [78]:
# A basic mockup of our deep learning neural network model: looking at
# how reviews could determine the sscore of a tv show

In [79]:
# import dependencies
import pandas as pd
from nrclex import NRCLex

In [96]:
#Load CSV.. a text to columns process was already performed to separate the season field. 

file_to_load = 'selected_reviews.csv'

review_df = pd.read_csv(file_to_load)
review_df.head(2)

Unnamed: 0,title,season,rank,critic_meta_score,review_content,review_source,author,review_date,summary,meta_score,user_score,release_date,link,critics_reviews_link
0,Rectify,4.0,1,100.0,It allows us to know and care for these charac...,Collider,Allison Keene,27-Oct-16,Daniel tries to start a new life outside of Pa...,99,8.7,26-Oct-16,https://www.metacritic.com/tv/rectify/season-4,https://www.metacritic.com/tv/rectify/season-4...
1,Rectify,4.0,1,100.0,"Rectify, a drama entering its final season on ...",The New York Times,James Poniewozik,25-Oct-16,Daniel tries to start a new life outside of Pa...,99,8.7,26-Oct-16,https://www.metacritic.com/tv/rectify/season-4,https://www.metacritic.com/tv/rectify/season-4...


In [97]:
# Check how many unique shows are in the dataframe 
print(f'There are {len(review_df.title.unique())} different shows')

There are 2626 different shows


In [98]:
#find shows with multiple seasons

multiple_seasons = review_df.groupby('title').nunique()
multiple_seasons = multiple_seasons[multiple_seasons['season']>2]['season']

multiple_seasons.sort_values(ascending = False).head(10)

len(multiple_seasons)


113

In [99]:
#fill null values in season column with 0
#These programs are tv_specials and mini series. Perhaps the should be dropped.

review_df['season'].fillna(0, inplace = True)
review_df['title'][review_df['season']==0].unique()

 

array(['Bo Burnham: Inside', 'Planet Earth: Blue Planet II',
       'Homecoming: A Film by Beyoncé', 'The Underground Railroad',
       'Romeo & Juliet', "It's a Sin", 'A Parks and Recreation Special',
       'Longford', 'Kurt Cobain: Montage of Heck', 'Muhammad Ali',
       'Springsteen on Broadway', 'Deadwood: The Movie',
       'Elizabeth Is Missing', 'The Trip to Bountiful', 'Prohibition',
       'Sing Your Song',
       'The Black Church: This Is Our Story, This Is Our Song',
       'Hemingway', 'Between the World and Me',
       'Bright Lights: Starring Carrie Fisher and Debbie Reynolds',
       'The Normal Heart', 'Six by Sondheim',
       "What's My Name: Muhammad Ali", 'The Crime of the Century',
       'The Good Lord Bird', 'Elvis Presley: The Searcher',
       'Immigration Nation', 'The Dresser (2016)', 'Leaving Neverland',
       'Euphoria Special Episode Part 1: Rue',
       'No Direction Home: Bob Dylan', 'Exterminate All the Brutes',
       'Whoopi Goldberg Presents Moms

In [86]:
# Check how many null values are in the dataframe
review_df.isnull().sum()


title                      0
season                     0
rank                       0
critic_meta_score        205
review_content            27
review_source            205
author                   205
review_date             9956
summary                    0
meta_score                 0
user_score                 0
release_date               0
link                       0
critics_reviews_link       0
dtype: int64

In [101]:
# remove the null values and add it back to the dataframe 
review_df=review_df[review_df.review_content.notnull()]
review_df=review_df[review_df.critic_meta_score.notnull()]

review_df.isnull().sum()

title                      0
season                     0
rank                       0
critic_meta_score          0
review_content             0
review_source              0
author                     0
review_date             9751
summary                    0
meta_score                 0
user_score                 0
release_date               0
link                       0
critics_reviews_link       0
dtype: int64

In [102]:
reviews = review_df['review_content']
reviews = reviews.dropna()
reviews.isnull().sum()

review_df['reviews_new'] = reviews

In [103]:
sample_size = 2000
sample_df = review_df.head(sample_size)
reviews =review_df['review_content'][0:sample_size].tolist()
print(len(reviews))
print(len(sample_df))

2000
2000


In [104]:
emotions_scores = []

In [105]:
#loop to genearate a dataframe of emotion scores of reviews

for review in reviews:
    text_object = NRCLex(review)
    data = text_object.raw_emotion_scores
    af = text_object.affect_frequencies
    
    emotions_scores.append(af)


In [106]:
# convert emotion list into a dataframe
emotions_df = pd.DataFrame(emotions_scores)
emotions_df

Unnamed: 0,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,anticipation
0,0.083333,0.000000,0.0,0.083333,0.083333,0.250000,0.083333,0.166667,0.000000,0.083333,0.166667
1,0.285714,0.142857,0.0,0.000000,0.000000,0.142857,0.142857,0.285714,0.000000,0.000000,
2,0.041667,0.000000,0.0,0.208333,0.041667,0.208333,0.041667,0.083333,0.041667,0.166667,0.166667
3,0.000000,0.066667,0.0,0.200000,0.000000,0.133333,0.200000,0.066667,0.133333,0.133333,0.066667
4,0.000000,0.000000,0.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.000000,0.0,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000
1996,0.000000,0.333333,0.0,0.000000,0.000000,0.000000,0.333333,0.333333,0.000000,0.000000,
1997,0.000000,0.000000,0.0,0.363636,0.000000,0.545455,0.090909,0.000000,0.000000,0.000000,
1998,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,


In [107]:
emotions_df.drop(columns=['anticipation','anticip'], axis=1 , inplace = True)

In [108]:
emotions_df['sum'] = emotions_df['fear'] + emotions_df['anger'] + emotions_df['trust'] + emotions_df['surprise'] +emotions_df['positive'] + emotions_df['negative'] + emotions_df['disgust'] + emotions_df['joy']


In [109]:
emotions_df.describe()

Unnamed: 0,fear,anger,trust,surprise,positive,negative,sadness,disgust,joy,sum
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.060422,0.037937,0.18532,0.048614,0.257045,0.104474,0.047217,0.026155,0.078182,0.798148
std,0.093059,0.073037,0.200261,0.086974,0.215959,0.14015,0.088773,0.055334,0.094861,0.250263
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.75
50%,0.0,0.0,0.153846,0.0,0.230769,0.071429,0.0,0.0,0.052632,0.846154
75%,0.111111,0.066667,0.25,0.083333,0.333333,0.166667,0.083333,0.0,0.142857,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0


In [110]:
emotions_df = emotions_df[emotions_df['sum']>=.25]


In [111]:
# merge the sampled datframe to the emotions dataframe
df_merged = sample_df.merge( emotions_df, left_index = True, right_index = True)

df_merged.head()

Unnamed: 0,title,season,rank,critic_meta_score,review_content,review_source,author,review_date,summary,meta_score,...,fear,anger,trust,surprise,positive,negative,sadness,disgust,joy,sum
0,Rectify,4.0,1,100.0,It allows us to know and care for these charac...,Collider,Allison Keene,27-Oct-16,Daniel tries to start a new life outside of Pa...,99,...,0.083333,0.0,0.083333,0.083333,0.25,0.083333,0.166667,0.0,0.083333,0.666667
1,Rectify,4.0,1,100.0,"Rectify, a drama entering its final season on ...",The New York Times,James Poniewozik,25-Oct-16,Daniel tries to start a new life outside of Pa...,99,...,0.285714,0.142857,0.0,0.0,0.142857,0.142857,0.285714,0.0,0.0,0.714286
2,Rectify,4.0,1,100.0,No other series so poignantly probes the human...,Salon,Melanie McFarland,26-Oct-16,Daniel tries to start a new life outside of Pa...,99,...,0.041667,0.0,0.208333,0.041667,0.208333,0.041667,0.083333,0.041667,0.166667,0.75
3,Rectify,4.0,1,100.0,None of these characters is particularly happy...,Yahoo TV,Ken Tucker,26-Oct-16,Daniel tries to start a new life outside of Pa...,99,...,0.0,0.066667,0.2,0.0,0.133333,0.2,0.066667,0.133333,0.133333,0.866667
4,Rectify,4.0,1,100.0,Rectify is the best series I have ever seen on...,The Daily Beast,Malcolm Jones,26-Oct-16,Daniel tries to start a new life outside of Pa...,99,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [112]:
df_merged.columns

Index(['title', 'season', 'rank', 'critic_meta_score', 'review_content',
       'review_source', 'author', 'review_date', 'summary', 'meta_score',
       'user_score', 'release_date', 'link', 'critics_reviews_link',
       'reviews_new', 'fear', 'anger', 'trust', 'surprise', 'positive',
       'negative', 'sadness', 'disgust', 'joy', 'sum'],
      dtype='object')

In [118]:
x_list = ['fear','anger', 'trust', 'surprise', 'positive', 'negative', 'sadness',
       'disgust', 'joy']
y_list = ['critic_meta_score']

x = df_merged[x_list]
x.index.rename('index', inplace = True)

y = df_merged[y_list]
y.index.rename('index', inplace = True)


In [119]:
y

Unnamed: 0_level_0,critic_meta_score
index,Unnamed: 1_level_1
0,100.0
1,100.0
2,100.0
3,100.0
4,100.0
...,...
1995,70.0
1996,80.0
1997,90.0
1998,50.0


In [120]:
df_merged.to_csv('tv_emotions_data.csv')

x.to_csv('emotion_scores.csv')
y.to_csv('critic_meta_score.csv')
