# Twitter US Airline Sentiment Analysis

## Import Dependencies

In [1]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omardoma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/omardoma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  from numpy.core.umath_tests import inner1d


## Read the Dataset

In [2]:
tweets = pd.read_csv("Tweets.csv")

## Show a sample of the tweets

In [3]:
tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


## Show the tweets' info

In [4]:
print(tweets.shape)
print(tweets.airline_sentiment.unique())

(14640, 15)
['neutral' 'positive' 'negative']


## Filter text tweets

In [5]:
filtered_tweets = tweets[tweets['text'].apply(lambda x: "RT" not in x) & tweets['text'].apply(lambda x: len(x) >= 20)]
filtered_tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


## Save the labels {positive, negative, neutral} of each tweet to a list

In [6]:
positive_tweets = filtered_tweets[filtered_tweets['airline_sentiment'] == 'positive']
neutral_tweets = filtered_tweets[filtered_tweets['airline_sentiment'] == 'neutral']
negative_tweets = filtered_tweets[filtered_tweets['airline_sentiment'] == 'negative']


## Define a function that preprocess a tweet

In [7]:
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))
def clean_tweet(tweet):
    # URL & Mentions Removal
    cleaned_tweet = re.sub('((www\S+)|(http\S+)|(@\S+))', '', tweet)
    # Tokenization
    words = word_tokenize(cleaned_tweet)
    # Case Folding & Normalization
    words = [word.casefold() for word in words if word.isalpha()]
    # Stemming
    words = [porter.stem(word) for word in words]
    # Stop Words Removal
    words = [word for word in words if not word in stop_words]
    return ' '.join(words)


## Split the dataset into 80% training and 20% testing

In [8]:
train, test = train_test_split(filtered_tweets, test_size = 0.2)

## Do some preprocessing on the tweets text

In [9]:
train_clean_tweets = train['text'].apply(lambda tweet: clean_tweet(tweet)).values
test_clean_tweets = test['text'].apply(lambda tweet: clean_tweet(tweet)).values

## Extract the features

In [10]:
v = CountVectorizer()
train_features = v.fit_transform(train_clean_tweets)
test_features = v.transform(test_clean_tweets)

## Classify using Multinomial Naive Bayes

In [11]:
clf = MultinomialNB()
clf.fit(train_features, train['airline_sentiment'])
predictions = clf.predict(test_features)

### Compute the F1-score

In [12]:
f1_score(test['airline_sentiment'], predictions, average='micro')

0.7563522450400278

## Classify using KNeighborsClassifier

In [13]:
clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(train_features, train['airline_sentiment']) 
predictions = clf.predict(test_features)

### Compute the F1-score

In [14]:
f1_score(test['airline_sentiment'], predictions, average='micro')

0.6157326836059868

## Classify using Random Forest Classifier

In [15]:
clf = RandomForestClassifier(random_state=0)
clf.fit(train_features, train['airline_sentiment'])
predictions = clf.predict(test_features)

### Compute the F1-score

In [16]:
f1_score(test['airline_sentiment'], predictions, average='micro')

0.738600765750087

# Bonus 4

## Split the dataset into 80% training and 20% testing using non filtered tweets


In [17]:
train, test = train_test_split(tweets, test_size = 0.2)

## Do some preprocessing on the tweets text

In [18]:
train_clean_tweets = train['text'].apply(lambda tweet: clean_tweet(tweet)).values
test_clean_tweets = test['text'].apply(lambda tweet: clean_tweet(tweet)).values

## Extract the features

In [19]:
v = CountVectorizer()
train_features = v.fit_transform(train_clean_tweets)
test_features = v.transform(test_clean_tweets)

## Classify using Multinomial Naive Bayes

In [20]:
clf = MultinomialNB()
clf.fit(train_features, train['airline_sentiment'])
predictions = clf.predict(test_features)

### Compute the F1-score

In [21]:
f1_score(test['airline_sentiment'], predictions, average='micro')

0.762636612021858

## Classify using KNeighborsClassifier

In [22]:
clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(train_features, train['airline_sentiment']) 
predictions = clf.predict(test_features)

### Compute the F1-score

In [23]:
f1_score(test['airline_sentiment'], predictions, average='micro')

0.6294398907103825

## Classify using Random Forest Classifier

In [24]:
clf = RandomForestClassifier(random_state=0)
clf.fit(train_features, train['airline_sentiment'])
predictions = clf.predict(test_features)

### Compute the F1-score

In [25]:
f1_score(test['airline_sentiment'], predictions, average='micro')

0.7540983606557377

# Bonus 3 

## Read Training Data

In [26]:
training_tweets = pd.read_csv("training.csv", encoding = "ISO-8859-1", header=None)

## Show a sample of the training data

In [27]:
training_tweets.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Read Testing Data

In [28]:
testing_tweets = pd.read_csv("testdata.csv", encoding = "ISO-8859-1", header=None)

## Show a sample of the testing data

In [29]:
testing_tweets.head()

Unnamed: 0,0,1,2,3,4,5
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


## Do some preprocessing on the tweets text

In [30]:
train_clean_tweets = training_tweets[5].apply(lambda tweet: clean_tweet(tweet)).values
test_clean_tweets = testing_tweets[5].apply(lambda tweet: clean_tweet(tweet)).values

## Extract the features

In [31]:
v = CountVectorizer()
train_features = v.fit_transform(train_clean_tweets)
test_features = v.transform(test_clean_tweets)

## Classify using Multinomial Naive Bayes

In [32]:
clf = MultinomialNB()
clf.fit(train_features, training_tweets[0])
predictions = clf.predict(test_features)

### Compute the F1-score

In [33]:
f1_score(testing_tweets[0], predictions, average='micro')

0.5843373493975904

## Classify using KNeighborsClassifier

In [34]:
clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(train_features, training_tweets[0]) 
predictions = clf.predict(test_features)

### Compute the F1-score

In [35]:
f1_score(testing_tweets[0], predictions, average='micro')

0.42771084337349397

## Classify using Random Forest Classifier

In [None]:
clf = RandomForestClassifier(random_state=0)
clf.fit(train_features, training_tweets[0])
predictions = clf.predict(test_features)

### Compute the F1-score

In [None]:
f1_score(testing_tweets[0], predictions, average='micro')