#### Importing the Required Libraries


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings("ignore")
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

#### Importing Training Data 

In [2]:
df_train=pd.read_csv('0000000000002747_training_twitter_x_y_train.csv')

In [3]:
df_train.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
df_train = df_train[['text', 'airline_sentiment']]

In [5]:
df_train.head()

Unnamed: 0,text,airline_sentiment
0,"@SouthwestAir I am scheduled for the morning, ...",negative
1,@SouthwestAir seeing your workers time in and ...,positive
2,@united Flew ORD to Miami and back and had gr...,positive
3,@SouthwestAir @dultch97 that's horse radish 😤🐴,negative
4,@united so our flight into ORD was delayed bec...,negative


In [6]:
training_data=df_train.values

In [7]:
training_data[:5,:]

array([['@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled',
        'negative'],
       ['@SouthwestAir seeing your workers time in and time out going above and beyond is why I love flying with you guys. Thank you!',
        'positive'],
       ['@united Flew ORD to Miami and back and  had great crew, service on both legs. THANKS',
        'positive'],
       ["@SouthwestAir @dultch97 that's horse radish 😤🐴", 'negative'],
       ['@united so our flight into ORD was delayed because of Air Force One, but the last flight to SBN is at 8:20, 5 mins from now we just landed.',
        'negative']], dtype=object)

#### Spliiting the Tweet text into words using NLTK

In [8]:
tweets_train=[]
for i in range(len(training_data)):
    tweets_train.append([word_tokenize(training_data[i][0]), training_data[i][1]])

##### Cleaning the Words using WordNetLemmatizer available in NLTK 

In [9]:
stop_words=stopwords.words("english")
punctuations=list(string.punctuation)
stop_words+=punctuations

In [10]:
def simple_pos(tag):
    if tag.startswith("J"):
        return wordnet.ADJ
    elif tag.startswith("V"):
        return wordnet.VERB
    elif tag.startswith("R"):
        return wordnet.ADV
    elif tag.startswith("N"):
        return wordnet.NOUN
    else:
        return wordnet.NOUN

In [11]:
lemmatizer=WordNetLemmatizer()

In [12]:
def clean_tweets(words):
    op=[]
    for word in words:
        if word.isalpha():
            if word.lower() not in stop_words:
                lemmatized_word=lemmatizer.lemmatize(word,simple_pos(pos_tag([word])[0][1]))
                op.append(lemmatized_word)
    return op

In [13]:
cleaned_tweets=[(clean_tweets(word),cat) for word,cat in tweets_train]

In [14]:
y_train=[]
tweets=[]
for tweet,sentiment in cleaned_tweets:
    tweets.append(" ".join(tweet))
    y_train.append(sentiment)

##### Using Count Vectorizer to get the X Train 

In [15]:
cv=CountVectorizer(max_features=2000,min_df=0.002,max_df=0.95)
x_train_features=cv.fit_transform(tweets)

##### Prepaing Testing Data 

In [16]:
df_test = pd.read_csv("0000000000002747_test_twitter_x_test.csv")

In [17]:
df_test.head()

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)


In [18]:
test=df_test["text"]

In [19]:
test=test.values

In [20]:
tweets_test = []
for t in test:
    t = clean_tweets(word_tokenize(t))
    tweets_test.append(" ".join(t))

In [21]:
x_test_features = cv.transform(tweets_test)

##### Performing Classification

In [22]:
svc = SVC()
svc.fit(x_train_features, y_train)

SVC()

In [23]:
y_pred_svm = svc.predict(x_test_features)

In [24]:
df = pd.DataFrame(y_pred_svm)

In [25]:
df.head()

Unnamed: 0,0
0,negative
1,negative
2,negative
3,negative
4,positive


In [27]:
df.to_csv('predictions_svm.csv', index = False, header = False)

#### Random Forest

In [28]:
rfc = RandomForestClassifier()
rfc.fit(x_train_features, y_train)

RandomForestClassifier()

In [30]:
y_pred_rfc = rfc.predict(x_test_features)

In [32]:
df2 = pd.DataFrame(y_pred_rfc)

In [33]:
df2.head()

Unnamed: 0,0
0,negative
1,negative
2,negative
3,negative
4,negative


In [34]:
df2.to_csv('predictions_rfc.csv', index = False, header = False)

#### Multinomial Naive Bayes

In [35]:
mnv = MultinomialNB(alpha = 1)
mnv.fit(x_train_features, y_train)

MultinomialNB(alpha=1)

In [36]:
y_pred_mnv = mnv.predict(x_test_features)

In [38]:
df3 = pd.DataFrame(y_pred_mnv)

In [39]:
df3.to_csv('predictions_mnv.csv', index = False, header = False)

###### The prediction output of all the above classifiers were tested in the Coding Ninjas Website. It was found that Multinomial Naive Bayes was performing the best among the above classifiers.