In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv('airline-train.csv')
test_data = pd.read_csv('airline-test.csv')


In [3]:
#preprocess

#import re
import nltk
#from nltk import stopwords
#nltk.download('punkt')
#nltk.download('stopwords')
#from string import punctuation 


import re
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords
from nltk.stem.porter import *

class PreProcessTweets:
    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])
        
    def processTweets(self, list_of_tweets):
        processedTweets=[]
        text_of_tweet = list_of_tweets["text"]

                
        for tweet in text_of_tweet:
            processedTweets.append(( self._processTweet(tweet) ))
        return processedTweets
    
    def _processTweet(self, tweet):
        ps = PorterStemmer()
        tweet = tweet.lower() # convert text to lower-case
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
        tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
        tweet = word_tokenize(tweet) # remove repeated characters (helloooooooo into hello)
        return [ps.stem(word) for word in tweet if word not in self._stopwords and len(ps.stem(word))>=3]

In [4]:
tweetProcessor = PreProcessTweets()
preprocessed_train_data = tweetProcessor.processTweets(train_data)
preprocessed_test_data = tweetProcessor.processTweets(test_data)

train_data["text"] = preprocessed_train_data
test_data["text"] = preprocessed_test_data

In [5]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,...,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,11429,681462729,False,finalized,3,2/25/15 3:07,positive,1.0,,,...,,CaraModisett,,0,"[also, appreci, hashtag, lucycat]",,2/18/15 10:55,5.68122e+17,"Memphis, Tennessee",Central Time (US & Canada)
1,9717,681461003,False,finalized,3,2/25/15 3:20,negative,1.0,Can't Tell,0.3464,...,,LarrySandeen,,0,"[discov, bill, 300, reschedul, flight, cost, s...","[0.0, 0.0]",2/22/15 19:24,5.69699e+17,Southeastern Pennsylvania USA,
2,10153,681461443,False,finalized,3,2/25/15 6:20,negative,1.0,Can't Tell,1.0,...,,burseka,,0,"[guy, suck]",,2/22/15 7:28,5.69519e+17,,
3,975,681449647,False,finalized,3,2/25/15 2:14,negative,1.0,Can't Tell,1.0,...,,artistanxiety,,0,"[right, angri]",,2/23/15 14:09,5.69982e+17,Punk is the preacher.,Arizona
4,1511,681450373,False,finalized,3,2/25/15 6:57,negative,1.0,Can't Tell,0.6848,...,,axelrodaj,,0,"[sure, peopl, row, paid, premium, seat, n't, u...","[33.94077727, -118.39921036]",2/22/15 18:50,5.69691e+17,San Francisco,Pacific Time (US & Canada)


In [6]:
# feature selection with chi 2
#unique words of all tweets
import sklearn

unique_words = []

for i in range (0,len(train_data)):
    for word in train_data["text"][i] :
        if word not in unique_words:
            unique_words.append(word)


In [7]:
unique_words

['also',
 'appreci',
 'hashtag',
 'lucycat',
 'discov',
 'bill',
 '300',
 'reschedul',
 'flight',
 'cost',
 'second',
 'day',
 'lost',
 'work',
 'guy',
 'suck',
 'right',
 'angri',
 'sure',
 'peopl',
 'row',
 'paid',
 'premium',
 'seat',
 "n't",
 'use',
 'overhead',
 'space',
 '...',
 'cancel',
 'there\x89ûª',
 'way',
 'rebook',
 'websit',
 'app',
 'wait',
 'minut',
 'hold',
 'fail',
 'need',
 'help',
 'miss',
 'bag',
 'say',
 'deliv',
 'local',
 'number',
 'open',
 'ask',
 'main',
 'cust',
 'servic',
 'got',
 'hung',
 'start',
 'daili',
 'b777-200er',
 'newark',
 'frankfurt',
 'replac',
 'b767-400er',
 '2jul',
 'avgeek',
 'think',
 'problem',
 'saturday',
 'due',
 'expect',
 'snow',
 'anyon',
 'ua1740',
 'inbound',
 'dep',
 'time',
 'get',
 'closer',
 'push',
 'back',
 'min',
 "'ve",
 'thank',
 'respons',
 'hour',
 'realli',
 'frustrat',
 'seem',
 'like',
 'one',
 'jetblu',
 'concern',
 'hope',
 'destin',
 'guarante',
 "'ll",
 'see',
 'tomorrow',
 'morn',
 'fight',
 'look',
 'fix',
 '

In [8]:
# prepering input for chi 2
matrix_input = np.full((len(train_data),len(unique_words)), 0)

for i in range (0,len(train_data)):
    counter = 0
    for word in unique_words:
        if word in train_data["text"][i]:
            matrix_input[i][counter] = 1
        counter = counter +1    

lable_vector = train_data["airline_sentiment"]

matrix_input

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
matrix_input_pd = pd.DataFrame(data = matrix_input, columns = unique_words)
#matrix_input_pd

In [10]:
lable_vector

0       positive
1       negative
2       negative
3       negative
4       negative
          ...   
8779    negative
8780    negative
8781    negative
8782    negative
8783    negative
Name: airline_sentiment, Length: 8784, dtype: object

In [11]:
#from sklearn.model_selection import train_test_split
#primary_feature = sklearn.feature_selection.chi2(matrix_input, lable_vector)

#primary_feature


from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 
  
#Two features with highest chi-squared statistics are selected 
chi2_features = SelectKBest(chi2, k = 100) 
#X_kbest_features = chi2_features.fit_transform(matrix_input_pd, lable_vector) 
X_kbest_features = chi2_features.fit_transform(matrix_input_pd, lable_vector)

X_kbest_features.shape
# Reduced features 
Reduced_features = matrix_input_pd.columns[chi2_features.get_support()]

Reduced_features

Index(['appreci', 'flight', 'day', 'lost', 'guy', 'suck', 'peopl', 'n't',
       'cancel', 'rebook', 'wait', 'minut', 'hold', 'fail', 'need', 'miss',
       'bag', 'say', 'servic', 'daili', 'avgeek', 'due', 'time', 'get', 'min',
       ''ve', 'thank', 'respons', 'hour', 'one', 'still', 'ûïat_us', 'experi',
       'best', 'ever', 'disappoint', 'never', 'tri', 'said', 'flightl',
       'excit', 'told', 'rude', 'issu', 'delay', 'plane', 'good', 'much',
       'phone', 'gate', 'agent', 'excel', 'crew', 'unaccept', 'call', 'custom',
       'job', 'answer', 'luggag', 'sit', 'love', 'great', 'baggag', 'worst',
       'hotel', 'thx', 'follow', 'late', 'system', 'worri', 'ceo', 'battl',
       'wall', 'street', 'ridicul', 'bad', 'fleet', 'fleek', 'line', 'view',
       'awesom', 'kudo', 'fantast', 'journal', 'stuck', 'deserv', 'quick',
       'outstand', 'southwest', 'vega', 'destinationdragon', 'flightr',
       'beauti', 'passbook', 'rock', 'amaz', 'companion', 'winner', 'smooth',
       'fa

In [12]:
# final featuers

#deleted_features = []
#for feature in train_data.columns:
#    if feature not in test_data.columns:
#        deleted_features.append(feature)
#        train_data.drop(columns=[feature])



for feature in Reduced_features:
    train_data.insert((len(train_data.columns)),feature, matrix_input_pd[feature] ,True) 
    
train_data.head()

Unnamed: 0.1,Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,...,destinationdragon,flightr,beauti,passbook,rock,amaz,companion,winner,smooth,favorit
0,11429,681462729,False,finalized,3,2/25/15 3:07,positive,1.0,,,...,0,0,0,0,0,0,0,0,0,0
1,9717,681461003,False,finalized,3,2/25/15 3:20,negative,1.0,Can't Tell,0.3464,...,0,0,0,0,0,0,0,0,0,0
2,10153,681461443,False,finalized,3,2/25/15 6:20,negative,1.0,Can't Tell,1.0,...,0,0,0,0,0,0,0,0,0,0
3,975,681449647,False,finalized,3,2/25/15 2:14,negative,1.0,Can't Tell,1.0,...,0,0,0,0,0,0,0,0,0,0
4,1511,681450373,False,finalized,3,2/25/15 6:57,negative,1.0,Can't Tell,0.6848,...,0,0,0,0,0,0,0,0,0,0


In [13]:
inf = []

In [14]:
#building inputs for training model
x_train = np.full((len(train_data),len(inf)), 0 ,  dtype=object)
X_train_pd = pd.DataFrame(data = x_train, columns = inf)

X_train_pd.head()

0
1
2
3
4


In [15]:
counter = 0
for feature in inf:
    X_train_pd[feature] = train_data[feature]
    counter = counter+1

    
X_train_pd.head()

0
1
2
3
4


In [16]:
for feature in Reduced_features:
    X_train_pd.insert((len(X_train_pd.columns)),feature, matrix_input_pd[feature] ,True) 
    
y_train = train_data["airline_sentiment"]

#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
model = GaussianNB()

 
#fill nan values

X_train_pd.fillna(0)

X_train_pd.head()

Unnamed: 0,appreci,flight,day,lost,guy,suck,peopl,n't,cancel,rebook,...,destinationdragon,flightr,beauti,passbook,rock,amaz,companion,winner,smooth,favorit
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Train the model using the training sets
X_train_pd.to_numpy()
model.fit(X_train_pd.astype(np.float),y_train)

GaussianNB()

In [18]:
#building inputs for test model
x_test = np.full((len(test_data),len(inf)), 0)
X_test_pd = pd.DataFrame(data = x_test, columns = inf)

#counter = 0
#for feature in inf:
#    X_test_pd[counter] = test_data[feature]
#    counter = counter+1

#building inputs for training model
x_test = np.full((len(test_data),len(inf)), 0)
X_test_pd = pd.DataFrame(data = x_test, columns = inf)

for feature in Reduced_features:
    X_test_pd.insert((len(X_test_pd.columns)),feature, matrix_input_pd[feature] ,True) 

y_true = test_data["airline_sentiment"]

y_pred = model.predict(X_test_pd)


In [19]:
# evaluation
from sklearn.preprocessing import label_binarize
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_true, y_pred))

precision = precision_score(y_true, y_pred,average='weighted')

#average_precision = average_precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred,average='macro')
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')


Accuracy: 0.3777322404371585


In [20]:
precision

0.4545745245058099

In [21]:
recall

0.3292968520602679

In [22]:
f1_macro

0.3064248811602481

In [23]:
f1_micro

0.3777322404371585

In [24]:
#confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred, labels=["positive", "negative", "netural"])

array([[ 50, 205,   0],
       [164, 761,   0],
       [  0,   0,   0]])