In [265]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


#Metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import adjusted_rand_score


# For Data Normalization and Preprocessing
from sklearn import preprocessing
import re, string, unicodedata
import contractions
import nltk
import inflect
# from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from nltk.tokenize import sent_tokenize, word_tokenize


#For Training Multinomial Naive Bayess
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV


# Other Model
from sklearn.cluster import KMeans


In [222]:
df = pd.read_csv('../data/airline_tweets.csv')

In [223]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [224]:
# Existing Airlines
df["airline"].unique()

array(['Virgin America', 'United', 'Southwest', 'Delta', 'US Airways',
       'American'], dtype=object)

In [225]:
# Sentiment Classes
target_labels = df["airline_sentiment"].unique()
print(target_labels)

['neutral' 'positive' 'negative']


## All Tweets

In [226]:
tweets = df["text"]

## Target Label Column

In [227]:
airline_sentiment = df["airline_sentiment"]

## Remove Contractions

In [228]:
df.apply(lambda row: contractions.fix(row['text']), axis=1)

# for tweet in tweets:
#     contractions.fix(tweet)
#     
# print(tweets)

0                      @VirginAmerica What @dhepburn said.
1        @VirginAmerica plus you have added commercials...
2        @VirginAmerica I did not today... Must mean I ...
3        @VirginAmerica it is really aggressive to blas...
4        @VirginAmerica and it is a really big bad thin...
5        @VirginAmerica seriously would pay $30 a fligh...
6        @VirginAmerica yes, nearly every time I fly VX...
7        @VirginAmerica Really missed a prime opportuni...
8         @virginamerica Well, I did not…but NOW I DO! :-D
9        @VirginAmerica it was amazing, and arrived an ...
10       @VirginAmerica did you know that suicide is th...
11       @VirginAmerica I &lt;3 pretty graphics. so muc...
12       @VirginAmerica This is such a great deal! Alre...
13       @VirginAmerica @virginmedia I am flying your #...
14                                  @VirginAmerica Thanks!
15           @VirginAmerica SFO-PDX schedule is still MIA.
16       @VirginAmerica So excited for my first cross c.

In [229]:
sample = "didn't"
print(contractions.fix(sample))

did not


## Tokenization

In [230]:

tweets = df.apply(lambda row: word_tokenize(row['text']), axis=1)
# print(tweets)


## Data Preprocessing
* Normalization
* Remove contractions
* Stemming/lemmatization

In [231]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    return words

tweets = [normalize(tweet) for tweet in tweets]

## Train Test Split

In [251]:
X_train, X_test, y_train, y_test = train_test_split(tweets, airline_sentiment, test_size=0.33, random_state=42)

## Count Vectorizer

In [252]:
X_train = pd.Series( (v[0] for v in X_train) )

tweet_vector = X_train

# print(tweet_vector)

count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(tweet_vector)


In [253]:
# print(vector.vocabulary_)

In [254]:
# print(vector.get_feature_names())

## TFIDF

In [255]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)

print(X_train_tfidf.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Naive Bayes: MultinomialNB

* Take in TFIDF and our target labels

In [256]:
# Just learning, no transformation
classifier = MultinomialNB().fit(X_train_tfidf, y_train)

In [257]:
# Seperate data based on tweets about which airline:

virgin_america_tweets = df.loc[df['airline'] == 'Virgin America']["text"]
united_tweets = df.loc[df['airline'] == 'United']["text"]
southwest_tweets = df.loc[df['airline'] == 'Southwest']["text"]
delta_tweets = df.loc[df['airline'] == 'Delta']["text"]
us_airways_tweets = df.loc[df['airline'] == 'US Airways']["text"]
american_tweets = df.loc[df['airline'] == 'American']["text"]

In [258]:
# Testing with just Virgin America

X_test_tf = count_vect.transform(y_test)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
predicted = classifier.predict(X_test_tfidf)

In [263]:
from sklearn.metrics import classification_report

# virgin_america_sentiment = df.loc[df['airline'] == 'Virgin America']["airline_sentiment"]
print("Accuracy:", accuracy_score(y_test, predicted))

print(metrics.classification_report(y_test, predicted))

Accuracy: 0.6384519867549668
              precision    recall  f1-score   support

    negative       0.64      1.00      0.78      3085
     neutral       0.00      0.00      0.00       984
    positive       0.00      0.00      0.00       763

   micro avg       0.64      0.64      0.64      4832
   macro avg       0.21      0.33      0.26      4832
weighted avg       0.41      0.64      0.50      4832



  'precision', 'predicted', average, warn_for)


In [118]:
# text_clf = Pipeline([('vect', Co untVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', MultinomialNB())])


# tuned_parameters = {
#     'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
#     'clf__alpha': [1, 1e-1, 1e-2]
# }

In [264]:
# from sklearn.metrics import classification_report
# clf = GridSearchCV(text_clf, tuned_parameters, cv=10)
# clf.fit(x_train, y_train)

# print(classification_report(y_test, clf.predict(x_test), digits=4))

## Machine learning steps (Thoughts, what I need to do, ideas)

Normalization

Text Preprocessing
* Tokenization
* Feature Selection: One crucial point you need to keep in mind while working in sentiment analysis is not all the words in a phrase convey the sentiment of the phrase. Words like "I", "Are", "Am", etc. do not contribute to conveying any kind of sentiments and hence, they are not relative in a sentiment classification context. Consider the problem of feature selection here. In feature selection, you try to figure out the most relevant features that relate the most to the class label. That same idea applies here as well.
* Stemming and Lemenization aka(Word Normalization)


Bag of words

TFIDF

- In tokenaization we came across various words such as punctuation,stop words(is,in,that,can etc),upper case words and lower case words.After tokenization we are not focused on text level but on word level. So by doing stemming,lemmatization we can convert tokenize word to more meaningful words . For example — [‘‘ross’, ‘128’, ‘earth’, ‘like’, ‘planet’ , ‘survive’, ‘planet’]. As we can see that all the punctuation and stop word is removed which makes data more meaningful

MultinomialNB

A couple approaches we can do is to do sentiment analysis by:
* Lexicon look up for each word, polarity score
* bag_of_words for each document
* TFIDF

What are the Pros and Cons of Naive Bayes?

Pros:

It is easy and fast to predict class of test data set. It also perform well in multi class prediction
When assumption of independence holds, a Naive Bayes classifier performs better compare to other models like logistic regression and you need less training data.
It perform well in case of categorical input variables compared to numerical variable(s). For numerical variable, normal distribution is assumed (bell curve, which is a strong assumption).


Cons:

If categorical variable has a category (in test data set), which was not observed in training data set, then model will assign a 0 (zero) probability and will be unable to make a prediction. This is often known as “Zero Frequency”. To solve this, we can use the smoothing technique. One of the simplest smoothing techniques is called Laplace estimation.
On the other side naive Bayes is also known as a bad estimator, so the probability outputs from predict_proba are not to be taken too seriously.
Another limitation of Naive Bayes is the assumption of independent predictors. In real life, it is almost impossible that we get a set of predictors which are completely independent.