https://www.kaggle.com/crowdflower/twitter-airline-sentiment/data

In [1]:
import re
import itertools
import pandas as pd

from nltk.corpus import stopwords
from pandas_ml import ConfusionMatrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [13]:
stops = set(stopwords.words("english")) 

In [2]:
dataset = pd.read_csv("./datasets/Tweets.csv")

In [3]:
dataset.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
dataset.shape

(14640, 15)

In [7]:
dataset["airline_sentiment"].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [8]:
dataset = dataset[["text", "airline_sentiment"]]

In [9]:
dataset.columns = ["text", "sentiment"]

In [10]:
dataset.head()

Unnamed: 0,text,sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [14]:
def clean_tweet(tweet):
    tweet = tweet.lower() # convert tweets to lower case
    tweet = " ".join(filter(lambda x:x[0]!='@', tweet.split())) # remove user ids from the tweet
    tweet = " ".join(filter(lambda x:x[0]!='#', tweet.split())) # remove hash tags from the tweet
    tweet = re.sub("[^a-zA-Z]+", " ", tweet) # keep only alphabets
    tweet = " ".join([w for w in tweet.split() if not w in stops]) # remove stop words
    return tweet

In [15]:
dataset["clean_tweet"] = dataset["text"].apply(clean_tweet)

In [16]:
dataset.head()

Unnamed: 0,text,sentiment,clean_tweet
0,@VirginAmerica What @dhepburn said.,neutral,said
1,@VirginAmerica plus you've added commercials t...,positive,plus added commercials experience tacky
2,@VirginAmerica I didn't today... Must mean I n...,neutral,today must mean need take another trip
3,@VirginAmerica it's really aggressive to blast...,negative,really aggressive blast obnoxious entertainmen...
4,@VirginAmerica and it's a really big bad thing...,negative,really big bad thing


In [27]:
def find_ngrams(text, n):
    return list(itertools.combinations(text, n))

In [61]:
D = {}

In [62]:
def create_dict(row):
    value = 0
    if row["sentiment"] == "positive":
        value = 1
    elif row["sentiment"] == "negative":
        value = -1
    tweet = row["clean_tweet"]
    tweet = tweet.split()
    ngrams = []
    for i in range(1,4):
        ngrams = ngrams + find_ngrams(tweet, i)
    ngrams = [" ".join(i) for i in ngrams]
    for gram in ngrams:
        if gram in D:
            D[gram] += value
        else:
            D[gram] = value

In [65]:
train_data, test_data = train_test_split(dataset, test_size = 0.2, random_state = 82, stratify = dataset["sentiment"])

In [66]:
train_data.shape

(11712, 3)

In [67]:
test_data.shape

(2928, 3)

In [68]:
_ = train_data.apply(create_dict, axis = 1)

In [69]:
len(D)

1777160

In [71]:
dict(list(D.items())[0:20])

{'airline customer ceo': -1,
 'also change also': -1,
 'atlantic gold get': -1,
 'baejet jetblue back': 0,
 'delay original hour': -1,
 'flights wednesday night': 0,
 'guys ticket stay': 0,
 'hi clo lax': 0,
 'hold reservation f': -1,
 'hot mess': -1,
 'hrs wait service': -1,
 'list red head': -1,
 'meeting pm took': -1,
 'missing nd': -2,
 'people freezing beyond': -1,
 'phone airport yesterday': -1,
 'pilot get': -1,
 'problem say anything': -1,
 'tonight asset team': 2,
 'travel snow': 0}

In [82]:
def predict_sentiment(tweet):
    tweet = tweet.split()
    ngrams = []
    for i in range(1,4):
        ngrams = ngrams + find_ngrams(tweet, i)
    ngrams = [" ".join(i) for i in ngrams]
    s = 0
    for gram in ngrams:
        if gram in D:
            value = D[gram]
            if value < 0:
                s -= 1
            elif value > 0:
                s += 1
    if s < 0:
        return "negative"
    elif s > 0:
        return "positive"
    else:
        return "neutral"

In [85]:
test_data["prediction"] = test_data["clean_tweet"].apply(predict_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [86]:
test_data.head()

Unnamed: 0,text,sentiment,clean_tweet,prediction
2227,"@united Well, to the degree that he could... J...",negative,well degree could know experience cancelled fl...,negative
2059,@united what's with the layover in Canada from...,neutral,layover canada ua scheduled,negative
3749,@united I will be stunned if my bags are in Ha...,negative,stunned bags hartford ord ground crew hour mov...,negative
8130,@JetBlue U said 15mins to Take Off and now we ...,negative,u said mins take told hr delay amp possible ca...,negative
1779,"@united - well, now finally on board hopefully...",negative,well finally board hopefully take time quickly...,negative


In [77]:
test_data["prediction"].value_counts()

negative    2719
positive     150
neutral       59
Name: prediction, dtype: int64

In [113]:
print("Accuracy on test set: {0:.2f}%".format(accuracy_score(test_data["sentiment"], test_data["prediction"])*100))

Accuracy on test set: 66.67%


In [114]:
ConfusionMatrix(test_data["sentiment"], test_data["prediction"])

Predicted  negative  neutral  positive  __all__
Actual                                         
negative       1827        5         4     1836
neutral         573       13        34      620
positive        319       41       112      472
__all__        2719       59       150     2928