In [41]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



# For Data Normalization and Preprocessing
from sklearn import preprocessing
import re, string, unicodedata
import contractions
import nltk
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer




from sklearn.naive_bayes import MultinomialNB


## Plans

* Do sentiment analysis on each airline
* Which is the best airline?
* Use GridSearchCV to tune Naive Bayes
* Model Evaluation

In [42]:
# Load the airline tweet data set from CSV (the path is relative to the notebook's working directory).
df = pd.read_csv('../data/airline_tweets.csv')

In [43]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [44]:
# List the distinct values found in the `airline` column.
df["airline"].unique()

array(['Virgin America', 'United', 'Southwest', 'Delta', 'US Airways',
       'American'], dtype=object)

In [45]:
# Keep a handle on the raw tweet text column for the preprocessing steps that follow.
tweets = df["text"]

## Remove Contractions

In [48]:
# contractions.fix returns the expanded text as a new string; it does not modify
# its argument. The previous loop called it and threw the result away, so
# `tweets` was left unchanged. Capture the expanded text for every tweet instead.
tweets = tweets.apply(contractions.fix)
    
# print(tweets)

In [51]:
# Sanity check: expand a single contraction to see what contractions.fix returns.
sample = "didn't"
print(contractions.fix(sample))

did not


## Tokenization

In [None]:
# Tokenize the sample string; `words` is the input that normalize() is called with further down.
# NOTE(review): `sample` is still the raw "didn't" here — the expanded form printed above was never stored.
words = nltk.word_tokenize(sample)


## Data Preprocessing
* Normalization
* Remove contractions
* Stemming/lemmatization

In [30]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token.

    Each token is NFKD-normalized, so accented letters decompose into a base
    letter plus combining marks; whatever still cannot be encoded as ASCII is
    then dropped.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list holding one cleaned token per input token. A token made up
        solely of non-ASCII characters becomes an empty string.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Lowercase every token.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list with each token converted via ``str.lower``.
    """
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from every token and drop tokens that end up empty.

    A character is removed when it is neither a word character (``\\w``, which
    includes the underscore) nor whitespace.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list of the stripped tokens, excluding any that became ''.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def replace_numbers(words):
    """Spell out tokens that consist only of digits; leave other tokens untouched.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list in which every token for which ``str.isdigit()`` is true has
        been replaced by the result of inflect's ``number_to_words``.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Drop English stop words from a list of tokens.

    Args:
        words: iterable of str tokens. The comparison is case-sensitive, so
            tokens are expected to be lowercased already (normalize() does
            this before calling here).

    Returns:
        A new list containing, in order, the tokens that are not in NLTK's
        English stop-word list.
    """
    # Build the stop-word collection once per call. The previous version
    # evaluated stopwords.words('english') for every token and searched the
    # resulting list linearly; a set gives constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Reduce every token to its stem using NLTK's LancasterStemmer.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list with the stem of each token, in input order.
    """
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token as a verb using NLTK's WordNetLemmatizer.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list with the lemma of each token (looked up with ``pos='v'``),
        in input order.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run a token list through the cleaning steps and return the result.

    The steps are applied in this order: non-ASCII removal, lowercasing,
    punctuation removal, number spelling, stop-word removal. Stemming and
    lemmatization are not part of this pipeline.

    Args:
        words: list of str tokens.

    Returns:
        The cleaned list of tokens.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

# `words` comes from the Tokenization cell (nltk.word_tokenize(sample)). That cell has to be
# run first; otherwise this line raises the NameError shown in the output below.
words = normalize(words)

NameError: name 'words' is not defined

## Count Vectorizer

In [24]:
# Raw tweet text used as the corpus for the bag-of-words model.
# NOTE(review): this reads df["text"] directly, not the output of the preprocessing helpers above.
tweet_vector = df["text"]

# print(tweet_vector)

# Learn the vocabulary from the corpus using CountVectorizer's default settings.
vector = CountVectorizer()
vector.fit(tweet_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [25]:
# print(vector.vocabulary_)

In [26]:
# print(vector.get_feature_names())

In [27]:
# Encode every tweet as token counts over the fitted vocabulary (the result is a sparse matrix).
# NOTE(review): the matrix is only displayed here, not assigned to a variable for later use.
vector.transform(tweet_vector)

<14640x15051 sparse matrix of type '<class 'numpy.int64'>'
	with 234281 stored elements in Compressed Sparse Row format>

In [7]:
# NOTE(review): none of these columns appear in the airline_tweets.csv header displayed earlier
# (that frame has `text`, `tweet_created`, ...), and the execution count (In [7]) is out of order.
# This cell looks like a leftover from a different data set — confirm where `df` is meant to
# come from. It also overwrites `df`, so cells that expect the airline frame cannot run after it.
df = df[['tweet_created_at','tweet_favorite_count','tweet_favorited','tweet_full_text']]

In [8]:
# Preview the column subset selected in the previous cell.
df.head()

Unnamed: 0,tweet_created_at,tweet_favorite_count,tweet_favorited,tweet_full_text
0,Fri Sep 07 16:25:06 +0000 2018,0,False,Done is better than perfect. — Sheryl Sandberg...
1,Fri Sep 07 16:24:59 +0000 2018,0,False,Shout out to the Great Fire Department and the...
2,Fri Sep 07 16:24:50 +0000 2018,0,False,There are some AMAZINGLY hilarious Nike Ad mem...
3,Fri Sep 07 16:24:44 +0000 2018,0,False,#kapernickeffect #swoosh #justdoit @ Lucas Bis...
4,Fri Sep 07 16:24:39 +0000 2018,0,False,"One Hand, One Dream: The Shaquem Griffin Story..."


## Machine learning steps (Thoughts, what I need to do, ideas)

* Bag of words

* TF-IDF

* Word2Vec

* Tokenization

* Tokenization in Python can be done with the NLTK library's word_tokenize() function

* Normalization

- During tokenization we come across various kinds of tokens, such as punctuation, stop words (is, in, that, can, etc.), and upper-case and lower-case words. After tokenization we work at the word level rather than the text level. By applying stemming or lemmatization we can convert the tokenized words into more meaningful ones. For example: ['ross', '128', 'earth', 'like', 'planet', 'survive', 'planet']. As we can see, all the punctuation and stop words have been removed, which makes the data more meaningful.

Text Preprocessing
* Tokenization
* Feature Selection: One crucial point to keep in mind when working on sentiment analysis is that not all the words in a phrase convey its sentiment. Words like "I", "Are", "Am", etc. do not contribute to conveying any kind of sentiment and hence are not relevant in a sentiment classification context. Consider the problem of feature selection here. In feature selection, you try to figure out the features that relate the most to the class label. The same idea applies here as well.
* Stemming and Lemmatization (aka word normalization)


A few approaches we can take to sentiment analysis:
* Lexicon look up for each word, polarity score
* bag_of_words for each document
* TFIDF

What are the Pros and Cons of Naive Bayes?
Pros:

* It is easy and fast to predict the class of a test data set. It also performs well in multi-class prediction.
* When the assumption of independence holds, a Naive Bayes classifier performs better compared to other models like logistic regression, and you need less training data.
* It performs well with categorical input variables compared to numerical variables. For numerical variables, a normal distribution is assumed (a bell curve, which is a strong assumption).
Cons:

* If a categorical variable has a category in the test data set that was not observed in the training data set, the model will assign it a probability of 0 (zero) and will be unable to make a prediction. This is often known as "Zero Frequency". To solve this, we can use a smoothing technique. One of the simplest smoothing techniques is called Laplace estimation.
* On the other hand, Naive Bayes is also known to be a bad estimator, so the probability outputs from predict_proba are not to be taken too seriously.
* Another limitation of Naive Bayes is the assumption of independent predictors. In real life, it is almost impossible to get a set of predictors that are completely independent.