# Basic Sentiment Analysis using VADER 

Loading appropriate packages

In [145]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
import pandas as pd

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/monroefarris/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Reading in confessional data 

In [146]:
data_dir = "../data/"
season = "season_1"

conf_data = pd.read_csv(data_dir + season + '/confessionals_season_1.csv')
print('Data\n',conf_data.head())

num_talked = conf_data.groupby('Speaker').count().reset_index()
num_talked = num_talked[['Speaker', 'Episode']].rename(columns = {'Episode': 'Times Talked'}).reset_index()
print('\nNumber of times each player talked in a confessional\n', num_talked.head())

Data
    Episode  Day  Speaker                                               Text
0        1    1     Rudy  Paddling over, uh, we had two or three of thos...
1        1    1    Kelly  He was yelling at everybody “Let's lose the bo...
2        1    1   Ramona  I don't like being on the water all that much....
3        1    1     Dirk  Rich, um, I appreciate what he's trying to do ...
4        1    1  Richard  I'm good to go survival wise. People wise, it'...

Number of times each player talked in a confessional
    index  Speaker  Times Talked
0      0     B.B.            11
1      1  Colleen            54
2      2     Dirk            11
3      3  Gervase            56
4      4     Greg            35


### Basic Sentiment Analysis 

- Calculates average sentiment score for the confessional 
- Bins the aggregate score into either positive or negative 

In [147]:
# basic VADER sentiment analysis 
sid = SentimentIntensityAnalyzer()
conf_data['scores'] = conf_data['Text'].apply(lambda review: sid.polarity_scores(review))

# getting aggregate score for the confessional
conf_data['compound']  = conf_data['scores'].apply(lambda score_dict: score_dict['compound'])

# binning aggregate score into either positive or negative 
conf_data['comp_score'] = conf_data['compound'].apply(lambda c: 'positive' if c >=0 else 'negative')

### Summary Stats
- Returns average sentiment score for each player in a season 
- Returns number of confessionals per player that are negative or positive over the course of the season

In [148]:
# average sentiment score of a player's confessionals for a season
avg_sentiment = conf_data.groupby('Speaker')['compound'].mean().reset_index()
print(avg_sentiment[['Speaker', 'compound']].head())

# number of confessionals over the course of a season by language bin
lang_overall = conf_data.groupby(['Speaker', 'comp_score']).count().reset_index()
lang_overall = lang_overall.rename(columns = {'compound': 'counts'})
print(lang_overall[['Speaker', 'comp_score', 'counts']].head())

   Speaker  compound
0     B.B. -0.024155
1  Colleen  0.333902
2     Dirk  0.456345
3  Gervase  0.310037
4     Greg  0.542760
   Speaker comp_score  counts
0     B.B.   negative       5
1     B.B.   positive       6
2  Colleen   negative      11
3  Colleen   positive      43
4     Dirk   negative       3


# Tracking Player Mentions
- Goal: See how many times a speaker mentions other players, and look at sentiment of the statement 

### Gossip Score
- How often the speaker in the confessionals is talking about other players

In [149]:
# getting player names
speakers = conf_data['Speaker'].unique()

# counting number of times a player is mentioned 
for speaker in speakers:
    conf_data['count_' + speaker] = conf_data.Text.str.count(speaker)

mentions_df = conf_data.drop(columns = ['Text', 'scores'])

# getting total number of times the speaker mentioned another player 
mentions_df['total_mentions'] = mentions_df.iloc[:, -len(speakers):].sum(axis = 1)
total_mentions = mentions_df.groupby('Speaker')['total_mentions'].sum().reset_index()

# normalizing number of player mentions by number of times the player spoke in the confessional 
gossip_score = total_mentions.merge(num_talked, on = 'Speaker').reset_index(drop = True)
gossip_score['normalized_mentions'] = gossip_score['total_mentions'] / gossip_score['Times Talked']

# calculating gossip score 
gossip_score = gossip_score[['Speaker', 'normalized_mentions']].sort_values(by = ['normalized_mentions'], ascending=False).reset_index(drop= True)
gossip_score['gossip_score_idx'] = gossip_score.index
gossip_score['gossip_score'] = len(gossip_score) - gossip_score['gossip_score_idx']
print(gossip_score[['Speaker', 'gossip_score']].head())

   Speaker  gossip_score
0   Stacey            16
1   Ramona            15
2    Susan            14
3  Colleen            13
4  Gervase            12


### Popularity Score
- How often a player is mentioned by the other players

In [150]:
mentions_cols = mentions_df.iloc[: , -len(speakers):-1].columns
mentions_df = pd.melt(mentions_df, id_vars=['Episode', 'Day', 'Speaker'], value_vars = mentions_cols).reset_index(drop = True)

popularity_score = mentions_df.groupby('variable')['value'].sum().reset_index()
popularity_score = popularity_score.rename(columns = {'variable': 'player_mentioned'})
popularity_score[['tmp', 'player_mentioned']] = popularity_score['player_mentioned'].str.split('_', expand=True)

popularity_score = popularity_score[['player_mentioned', 'value']].sort_values(by = ['value'], ascending=False).reset_index(drop= True)
popularity_score['popularity_score_idx'] = popularity_score.index
popularity_score['popularity_score'] = len(popularity_score) - popularity_score['popularity_score_idx']

print(popularity_score[['player_mentioned', 'value', 'popularity_score']].head())

  player_mentioned  value  popularity_score
0            Kelly     52                15
1             Sean     52                14
2             Greg     31                13
3            Jenna     24                12
4          Gervase     18                11
