## Airline Sentimental Analysis

- Load libraries

In [38]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from matplotlib import pyplot as plt

from nltk.tokenize import TweetTokenizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from textblob import TextBlob
import seaborn as sns

In [39]:
# read in the data 
df = pd.read_csv('data/A_tweets.csv') 
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [40]:
tokenizer = TweetTokenizer()

def clear_text(text):
    # tokenize the text
    # nltk.download('punkt') # uncomment if you need to download the punkt package
    tokens = tokenizer.tokenize(text)
    
    # remove all tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    
    # make lowercase
    tokens = [word.lower() for word in tokens]

    # remove all tokens that are only one character
    tokens = [word for word in tokens if len(word) > 1]

    return ' '.join(tokens)

df['clean_text']=df.text.apply(lambda x: clear_text(x))   
df['hashtags'] = df.text.apply(lambda text: re.findall(r"#(\w+)", text))
df['handles'] = df.text.apply(lambda text: re.findall(r"@(\w+)", text))

In [41]:
df.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone', 'clean_text', 'hashtags', 'handles'],
      dtype='object')

In [42]:
df = df.drop(columns=["tweet_id", "name", "retweet_count",'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence','airline_sentiment_gold', 'negativereason_gold',
       'tweet_coord', 'tweet_created','tweet_location', 'user_timezone']) # though we could use these columns for other analysis, for this analysis we will not

In [43]:
df.head(5)

Unnamed: 0,airline_sentiment,airline,text,clean_text,hashtags,handles
0,neutral,Virgin America,@VirginAmerica What @dhepburn said.,what said,[],"[VirginAmerica, dhepburn]"
1,positive,Virgin America,@VirginAmerica plus you've added commercials t...,plus added commercials to the experience tacky,[],[VirginAmerica]
2,neutral,Virgin America,@VirginAmerica I didn't today... Must mean I n...,today must mean need to take another trip,[],[VirginAmerica]
3,negative,Virgin America,@VirginAmerica it's really aggressive to blast...,really aggressive to blast obnoxious entertain...,[],[VirginAmerica]
4,negative,Virgin America,@VirginAmerica and it's a really big bad thing...,and really big bad thing about it,[],[VirginAmerica]


In [44]:
df["airline_sentiment"].unique()

array(['neutral', 'positive', 'negative'], dtype=object)

### Sentimental analysis using Textblob

In [45]:
analyzer = SentimentIntensityAnalyzer()

df['tb_polarity'] = df['clean_text'].apply(lambda x: TextBlob(x).polarity)
df['tb_sentiment'] = df['tb_polarity'].apply(lambda x: 'positive' if x >0 else('nuetral' if x==0 else 'negative') )

df.sample(n=10)

Unnamed: 0,airline_sentiment,airline,text,clean_text,hashtags,handles,tb_polarity,tb_sentiment
3417,negative,United,@united whom can I call to discuss - as I was ...,whom can call to discuss as was told that unit...,[],[united],-0.5,negative
4429,negative,Southwest,@SouthwestAir I booked a flight on my phone an...,booked flight on my phone and then never got c...,[],[SouthwestAir],0.0,nuetral
9446,negative,US Airways,"@USAirways flights &amp; ""customer relations"" ...",flights customer relations are extremely disap...,[],[USAirways],-0.05,negative
2586,neutral,United,@united If you'd love to see more girls be ins...,if love to see more girls be inspired about be...,[],[united],0.4625,positive
9273,negative,US Airways,@USAirways has me on my toes whether I'm goin...,has me on my toes whether going to make my fli...,[],[USAirways],0.0,nuetral
12059,neutral,American,@AmericanAir my flight got Cancelled Flightled...,my flight got cancelled flightled from grk to ...,[],[AmericanAir],0.0,nuetral
7967,negative,Delta,@JetBlue were boarding now I'm really not impr...,were boarding now really not impressed learnt ...,[],[JetBlue],-0.15,negative
6550,neutral,Southwest,@SouthwestAir employees spreading a bit of #Ma...,employees spreading bit of cheer at this morni...,"[MardiGras, NFTYConvention]",[SouthwestAir],0.0,nuetral
272,positive,Virgin America,@VirginAmerica your inflight team makes the ex...,your inflight team makes the experience,[amazing],[VirginAmerica],0.0,nuetral
10290,negative,US Airways,@USAirways No US Air ppl anywhere in PHL direc...,no us air ppl anywhere in phl directed strande...,[],[USAirways],0.0,nuetral


### Sentimental analysis using NLTK

In [46]:
analyzer = SentimentIntensityAnalyzer()

df['nltk_polarity'] = df['clean_text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
df['nltk_sentiment'] = df['nltk_polarity'].apply(lambda x: 'positive' if x >0 else('nuetral' if x==0 else 'negative'))

df.sample(n=10)

Unnamed: 0,airline_sentiment,airline,text,clean_text,hashtags,handles,tb_polarity,tb_sentiment,nltk_polarity,nltk_sentiment
4969,neutral,Southwest,@SouthwestAir @TheAcademy party in #hotlanta h...,party in,[hotlanta],"[SouthwestAir, TheAcademy]",0.0,nuetral,0.4019,positive
13651,negative,American,@AmericanAir still waiting on a dm response......,still waiting on dm response,[sloooowresponses],[AmericanAir],0.0,nuetral,0.0,nuetral
12013,negative,American,@AmericanAir @lpalumbo what weather sun is out,what weather sun is out,[],"[AmericanAir, lpalumbo]",0.0,nuetral,0.0,nuetral
5802,negative,Southwest,"@SouthwestAir, what do you know? Finally lined...",what do you know finally lined up like cattle ...,[badcustomerservice],[SouthwestAir],0.1875,positive,0.5859,positive
10241,negative,US Airways,@USAirways ever think about hiring more agents...,ever think about hiring more agents come on no...,[],[USAirways],-0.006944,negative,-0.2975,negative
13775,negative,American,"@AmericanAir @dogbuckeye No, I was on hold for...",no was on hold for hours diff agents each diff...,[],"[AmericanAir, dogbuckeye]",-0.2,negative,-0.6249,negative
4394,negative,Southwest,@SouthwestAir @SMiles1307 over two hours now. ...,over two hours now ugh we should all get vouch...,[],"[SouthwestAir, SMiles1307]",0.5,positive,-0.1531,negative
2249,neutral,United,@united I was on UA1069 today and left my sung...,was on today and left my sunglasses in seat pl...,[UnitedAirlines],[united],0.0,nuetral,0.6124,positive
10428,positive,US Airways,@USAirways thank you!!!,thank you,[],[USAirways],0.0,nuetral,0.3612,positive
4248,negative,United,@united @NY_NJairports Only at Newark can you ...,only at newark can you land minutes early but ...,[],"[united, NY_NJairports]",0.05,positive,-0.5499,negative


In [47]:
import numpy as np
print(np.corrcoef(df.tb_polarity, df.nltk_polarity))      # Correlation

[[1.        0.5602282]
 [0.5602282 1.       ]]


In [48]:
df[['airline_sentiment','tb_sentiment','nltk_sentiment']]

Unnamed: 0,airline_sentiment,tb_sentiment,nltk_sentiment
0,neutral,nuetral,nuetral
1,positive,nuetral,nuetral
2,neutral,negative,nuetral
3,negative,positive,negative
4,negative,negative,negative
...,...,...,...
14635,positive,nuetral,positive
14636,negative,negative,negative
14637,neutral,nuetral,positive
14638,negative,negative,positive


In [49]:
print(f"airline_blob_positive_words:- {df['airline_sentiment'].value_counts().get('positive', 0)}")
print(f"text_blob_positive_words:- {df['tb_sentiment'].value_counts().get('positive', 0)}")
print(f"nltk_blob_positive_words:- {df['nltk_sentiment'].value_counts().get('positive', 0)}")
print(f"airline_negative_words:- {df['airline_sentiment'].value_counts().get('negative', 0)}")
print(f"text_blob_negative_words:- {df['tb_sentiment'].value_counts().get('negative', 0)}")
print(f"nltk_negative_words:- {df['nltk_sentiment'].value_counts().get('negative', 0)}")

airline_blob_positive_words:- 2363
text_blob_positive_words:- 5466
nltk_blob_positive_words:- 6348
airline_negative_words:- 9178
text_blob_negative_words:- 3493
nltk_negative_words:- 5082


In [52]:
from sklearn.metrics import accuracy_score

In [60]:
accuracy_tb = accuracy_score(df['airline_sentiment'], df['tb_sentiment'])

print(f'Accuracy of text blob: {accuracy_tb:.2f}')  

Accuracy of text blob: 0.33


In [61]:
accuracy_nltk = accuracy_score(df['airline_sentiment'], df['nltk_sentiment'])

print(f'Accuracy of nltk: {accuracy_nltk:.2f}')  

Accuracy of nltk: 0.45


- summary:
From above we can see that the text blob accuracy is about 0.33 where nltk is 0.45 so from this we can say that nltk sentiment analyzer is better than textblob.