In [1]:
import pandas as pd
import numpy as np
import json

from sklearn.metrics import precision_score, recall_score

# Read data and scores dict

In [3]:
df = pd.read_csv("../data/warm_up_data.csv")

In [4]:
with open('../data/AFINN-111-scores.json', 'r') as fp:
     scores_dict = json.load(fp)

List unique airlines within the dataframe

In [8]:
df.airline.unique()

array(['Virgin America', 'United', 'Southwest', 'Delta', 'US Airways',
       'American'], dtype=object)

# Data preparation

Convert airline sentiment column to numeric class

In [14]:
slist = df.airline_sentiment.unique()
sint = [-1, 0, 1]

sent_dict = dict(zip(slist, sint))
sent_dict

{'negative': -1, 'neutral': 0, 'positive': 1}

In [15]:
df['sentiment_class_true'] = df.airline_sentiment.apply(lambda x: sent_dict[x])

In [16]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,text,airline,sentiment_class_true
0,570301031407624196,negative,@VirginAmerica it's really aggressive to blast...,Virgin America,-1
1,570300817074462722,negative,@VirginAmerica and it's a really big bad thing...,Virgin America,-1
2,570300767074181121,negative,@VirginAmerica seriously would pay $30 a fligh...,Virgin America,-1
3,570300248553349120,neutral,@VirginAmerica Really missed a prime opportuni...,Virgin America,0
4,570295459631263746,positive,"@VirginAmerica it was amazing, and arrived an ...",Virgin America,1


# Assign text sentiment score based on scores dict

Write assign_score function, which will summarize scores of all words present within scores_dict and calculate sentiment_score_hat for each tweet

In [17]:
df.text[0].split()

['@VirginAmerica',
 "it's",
 'really',
 'aggressive',
 'to',
 'blast',
 'obnoxious',
 '"entertainment"',
 'in',
 'your',
 "guests'",
 'faces',
 '&amp;',
 'they',
 'have',
 'little',
 'recourse']

In [20]:
def assign_score(text):
    score = 0
    for word in text.split():
        score += scores_dict.get(word.lower(), 0)
    return score

In [21]:
assign_score(df.text[0])

-5

In [31]:
df["sentiment_score_hat"] = df.text.apply(assign_score)

In [32]:
df.describe()

Unnamed: 0,tweet_id,sentiment_class_true,sentiment_score_hat
count,9489.0,9489.0,9489.0
mean,5.69224e+17,-0.475919,0.28454
std,778424800000000.0,0.794741,2.853243
min,5.675883e+17,-1.0,-13.0
25%,5.685628e+17,-1.0,-2.0
50%,5.694956e+17,-1.0,0.0
75%,5.698846e+17,0.0,2.0
max,5.703106e+17,1.0,16.0


## Split sentiment scores to class

Write a classify_sentiment functions, which will assign label numeric labels [-1,0,1] for based on predicted sentiment score. Recommended use of pd.cut

In [36]:
def classify_sentiment(score_col, negative_th, positive_th):
    bins = [-100, negative_th, positive_th, 100]
    labels = [-1, 0, 1]
    score_class = pd.cut(score_col, bins=bins, labels=labels).astype(int)

    """assign class here here"""
    return score_class

In [37]:
classify_sentiment(df.sentiment_score_hat, -1,1)

0      -1
1      -1
2      -1
3       0
4       1
       ..
9484    0
9485   -1
9486    1
9487   -1
9488    0
Name: sentiment_score_hat, Length: 9489, dtype: int32

In [39]:
df["sentiment_class_hat"] = classify_sentiment(df.sentiment_score_hat, -1,1)

In [40]:
df.groupby('sentiment_class_hat', as_index=False).tweet_id.count()

Unnamed: 0,sentiment_class_hat,tweet_id
0,-1,4332
1,0,1607
2,1,3550


In [41]:
df.head()[['sentiment_class_true', 'sentiment_class_hat']]

Unnamed: 0,sentiment_class_true,sentiment_class_hat
0,-1,-1
1,-1,-1
2,-1,-1
3,0,0
4,1,1


In [28]:
df.sentiment_score_hat.describe()

count    9489.000000
mean        0.284540
std         2.853243
min       -13.000000
25%        -2.000000
50%         0.000000
75%         2.000000
max        16.000000
Name: sentiment_score_hat, dtype: float64

Create a correct_classification bool column defining if our prediction is correct

In [42]:
df["correct_classification"] = df.sentiment_class_true == df.sentiment_class_hat

In [43]:
df['correct_classification'].mean()

0.6059648013489304

## Evaluate classification performance per class

Evaluate accuracy by class

In [44]:
df.groupby('airline_sentiment').correct_classification.mean()

airline_sentiment
negative    0.609733
neutral     0.261321
positive    0.848869
Name: correct_classification, dtype: float64

# Select only negative and positive texts

Create df_np dataframe as subset of df excluding neutral (0) sentiment class

In [45]:
df_np = df.loc[df.airline_sentiment!='neutral']

In [46]:
df_np.head()

Unnamed: 0,tweet_id,airline_sentiment,text,airline,sentiment_class_true,sentiment_score_hat,sentiment_class_hat,correct_classification
0,570301031407624196,negative,@VirginAmerica it's really aggressive to blast...,Virgin America,-1,-5,-1,True
1,570300817074462722,negative,@VirginAmerica and it's a really big bad thing...,Virgin America,-1,-2,-1,True
2,570300767074181121,negative,@VirginAmerica seriously would pay $30 a fligh...,Virgin America,-1,-4,-1,True
4,570295459631263746,positive,"@VirginAmerica it was amazing, and arrived an ...",Virgin America,1,3,1,True
6,570289724453216256,positive,@VirginAmerica I &lt;3 pretty graphics. so muc...,Virgin America,1,3,1,True


In [47]:
df_np.shape

(8142, 8)

## Calculate Precision and Recall for classifying negative review

Precision = TP / TP + FP

Recall = TP / TP + FN

Create negative_review and negative_review_hat bool column for negative review detection classifier and calculate Precision and Recall

In [48]:
df_np["negative_review"] = df_np.sentiment_class_true==-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_np["negative_review"] = df_np.sentiment_class_true==-1


In [49]:
df_np["negative_review_hat"] =  np.where(df_np.sentiment_class_hat==-1,1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_np["negative_review_hat"] =  np.where(df_np.sentiment_class_hat==-1,1,0)


In [50]:
precision_score(df_np.negative_review, df_np.negative_review_hat)

0.9708176100628931

In [52]:
recall_score(df_np.negative_review, df_np.negative_review_hat)

0.6097329751935535

In [None]:
C:\Users\maxim\Documents\GitHub\ALK-NLP-course\W1 - Introduction\Python-warm-up-setiment-analysis-BLANK.ipynb