# Applying NRCLex to television shows reviews

In [2]:
# import dependencies
import pandas as pd
from nrclex import NRCLex

In [3]:
# Load CSV.. a text to columns process was already performed to separate the season field. 

file_to_load = 'selected_reviews.csv'

review_df = pd.read_csv(file_to_load)
review_df.head(2)

Unnamed: 0,title,season,rank,critic_meta_score,review_content,review_source,author,review_date,summary,meta_score,user_score,release_date,link,critics_reviews_link
0,Rectify,4.0,1,100.0,It allows us to know and care for these charac...,Collider,Allison Keene,27-Oct-16,Daniel tries to start a new life outside of Pa...,99,8.7,26-Oct-16,https://www.metacritic.com/tv/rectify/season-4,https://www.metacritic.com/tv/rectify/season-4...
1,Rectify,4.0,1,100.0,"Rectify, a drama entering its final season on ...",The New York Times,James Poniewozik,25-Oct-16,Daniel tries to start a new life outside of Pa...,99,8.7,26-Oct-16,https://www.metacritic.com/tv/rectify/season-4,https://www.metacritic.com/tv/rectify/season-4...


In [6]:
# Add a column for character count to dataset

review_lengths = []

for rev in review_df['review_content']:
    leng = len(str(rev))
    review_lengths.append(leng)

review_lengths

review_df['review_length'] = review_lengths

# Filter dataframe for reviews of 300 characters of more

review_df = review_df[review_df.review_length >= 300]



In [7]:
review_df.describe()

Unnamed: 0,season,rank,critic_meta_score,meta_score,review_length
count,8821.0,9620.0,9599.0,9620.0,9620.0
mean,1.442977,1378.206445,67.561829,68.320374,378.342931
std,1.360639,847.621272,19.199632,13.14439,65.757924
min,1.0,1.0,0.0,15.0,300.0
25%,1.0,637.75,50.0,60.0,327.0
50%,1.0,1333.0,70.0,69.0,362.0
75%,1.0,2068.0,80.0,78.0,414.0
max,21.0,3138.0,100.0,99.0,793.0


In [8]:
# Check how many null values are in the dataframe
review_df.isnull().sum()


title                     0
season                  799
rank                      0
critic_meta_score        21
review_content            0
review_source            21
author                   21
review_date             496
summary                   0
meta_score                0
user_score                0
release_date              0
link                      0
critics_reviews_link      0
review_length             0
dtype: int64

In [9]:
# Remove the null values and add it back to the dataframe 
review_df = review_df[review_df.review_content.notnull()]
review_df = review_df[review_df.critic_meta_score.notnull()]
review_df = review_df[review_df.season.notnull()]

In [11]:
# Isolate list reviews_content column into a list
reviews =review_df['review_content'].tolist()

In [13]:
# Declare a list to store emotion scores of reviews 
emotions_scores = []

# Use a loop to genearate a dataframe of emotion scores of reviews

for review in reviews:
    text_object = NRCLex(review)
    data = text_object.raw_emotion_scores
    af = text_object.affect_frequencies
    
    emotions_scores.append(af)

In [14]:
# Convert emotion list into a dataframe
emotions_df = pd.DataFrame(emotions_scores)
emotions_df.head()

Unnamed: 0,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,anticipation
0,0.041667,0.0,0.0,0.208333,0.041667,0.208333,0.041667,0.083333,0.041667,0.166667,0.166667
1,0.090909,0.045455,0.0,0.227273,0.045455,0.181818,0.136364,0.045455,0.090909,0.045455,0.090909
2,0.0,0.0,0.0,0.25,0.0,0.25,0.25,0.25,0.0,0.0,
3,0.037037,0.037037,0.0,0.222222,0.074074,0.222222,0.111111,0.037037,0.037037,0.111111,0.111111
4,0.090909,0.090909,0.0,0.0,0.090909,0.272727,0.181818,0.181818,0.090909,0.0,


In [15]:
# Drop columns with no variance or high count of null values
emotions_df.drop(columns=['anticipation','anticip'], axis=1 , inplace = True)

In [16]:
# Add column for sum of emotion scores

emotions_df['sum'] = emotions_df['fear'] + emotions_df['anger'] + emotions_df['trust'] + emotions_df['surprise'] +emotions_df['positive'] + emotions_df['negative'] + emotions_df['disgust'] + emotions_df['joy']


In [17]:
emotions_df.describe()

Unnamed: 0,fear,anger,trust,surprise,positive,negative,sadness,disgust,joy,sum
count,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0
mean,0.066144,0.048305,0.175639,0.051055,0.255233,0.131423,0.057217,0.033321,0.079877,0.840996
std,0.067437,0.058222,0.119319,0.062377,0.137071,0.103301,0.063653,0.04881,0.069245,0.101838
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.1,0.0,0.166667,0.0625,0.0,0.0,0.0,0.789474
50%,0.0625,0.035714,0.166667,0.043478,0.238095,0.125,0.052632,0.0,0.076923,0.842105
75%,0.111111,0.083333,0.235294,0.083333,0.333333,0.1875,0.1,0.0625,0.125,0.9
max,0.5,0.5,1.0,1.0,1.0,1.0,1.0,0.5,0.5,1.0


In [18]:
print(emotions_df.shape)
print(review_df.shape)

(8800, 10)
(8800, 15)


In [19]:
# Reset the index of review_df
review_df = review_df.reset_index()

In [20]:
# Merge the review_df to the emotions dataframe
df_merged = review_df.merge(emotions_df, left_index = True, right_index = True)

df_merged.describe()

Unnamed: 0,index,season,rank,critic_meta_score,meta_score,review_length,fear,anger,trust,surprise,positive,negative,sadness,disgust,joy,sum
count,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0,8800.0
mean,23740.293182,1.44358,1394.172386,67.369432,68.101364,376.923182,0.066144,0.048305,0.175639,0.051055,0.255233,0.131423,0.057217,0.033321,0.079877,0.840996
std,14399.885979,1.361988,846.580083,19.197374,13.119537,65.034272,0.067437,0.058222,0.119319,0.062377,0.137071,0.103301,0.063653,0.04881,0.069245,0.101838
min,2.0,1.0,1.0,0.0,15.0,300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11116.75,1.0,654.0,50.0,60.0,326.0,0.0,0.0,0.1,0.0,0.166667,0.0625,0.0,0.0,0.0,0.789474
50%,22998.5,1.0,1364.0,70.0,69.0,360.0,0.0625,0.035714,0.166667,0.043478,0.238095,0.125,0.052632,0.0,0.076923,0.842105
75%,35510.5,1.0,2074.0,80.0,78.0,411.0,0.111111,0.083333,0.235294,0.083333,0.333333,0.1875,0.1,0.0625,0.125,0.9
max,52859.0,21.0,3138.0,100.0,99.0,793.0,0.5,0.5,1.0,1.0,1.0,1.0,1.0,0.5,0.5,1.0


In [21]:
# Filter out reviews with low total frequency of emotion scores
df_merged = df_merged[df_merged['sum']>=.25]
df_merged.shape

(8777, 26)

In [26]:
# Select input features 
x_a_list = ['fear','anger', 'trust', 'surprise', 'positive', 'negative', 'sadness',
       'disgust', 'joy']


x_a = df_merged[x_a_list]
x_a.index.rename('index', inplace = True)


# Create dataframe for optional input features
x_b_list = ['author','review_source','title','season', 'release_date', 'review_length']

x_b = df_merged[x_b_list]
x_b.index.rename('index', inplace = True)


# Select model target
y_list = ['critic_meta_score']

y = df_merged[y_list]
y.index.rename('index', inplace = True)

In [29]:
#Export dataframe to .csv files
df_merged.to_csv('tv_emotions_data.csv')

x_a.to_csv('emotion_scores.csv')
x_b.to_csv('review_data.csv')
y.to_csv('critic_meta_score.csv')
