# LOADING DATASET AND PERFORMING PREPROCESSING:

In [82]:
import numpy as np
import pandas as pd

In [33]:
training_data=pd.read_csv(r'C:\Users\hp\Documents\0000000000002747_training_twitter_x_y_train.csv')

In [34]:
print(set(training_data['airline_sentiment']))
#our features set

{'positive', 'neutral', 'negative'}


In [35]:
training_data=training_data[['text','airline_sentiment']]
training_data[0:5]

Unnamed: 0,text,airline_sentiment
0,"@SouthwestAir I am scheduled for the morning, ...",negative
1,@SouthwestAir seeing your workers time in and ...,positive
2,@united Flew ORD to Miami and back and had gr...,positive
3,@SouthwestAir @dultch97 that's horse radish 😤🐴,negative
4,@united so our flight into ORD was delayed bec...,negative


In [36]:
training_data=training_data.values

In [37]:
training_data=np.array(training_data)
training_data[0]

array(['@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled',
       'negative'], dtype=object)

In [40]:
#getting the data into nltk suitable format:
from nltk.tokenize import sent_tokenize,word_tokenize
dtrain=[(word_tokenize(doc),category) for doc,category in training_data ]


In [45]:
for i in range(5):
    print(dtrain[i])

(['@', 'SouthwestAir', 'I', 'am', 'scheduled', 'for', 'the', 'morning', ',', '2', 'days', 'after', 'the', 'fact', ',', 'yes..not', 'sure', 'why', 'my', 'evening', 'flight', 'was', 'the', 'only', 'one', 'Cancelled', 'Flightled'], 'negative')
(['@', 'SouthwestAir', 'seeing', 'your', 'workers', 'time', 'in', 'and', 'time', 'out', 'going', 'above', 'and', 'beyond', 'is', 'why', 'I', 'love', 'flying', 'with', 'you', 'guys', '.', 'Thank', 'you', '!'], 'positive')
(['@', 'united', 'Flew', 'ORD', 'to', 'Miami', 'and', 'back', 'and', 'had', 'great', 'crew', ',', 'service', 'on', 'both', 'legs', '.', 'THANKS'], 'positive')
(['@', 'SouthwestAir', '@', 'dultch97', 'that', "'s", 'horse', 'radish', '😤🐴'], 'negative')
(['@', 'united', 'so', 'our', 'flight', 'into', 'ORD', 'was', 'delayed', 'because', 'of', 'Air', 'Force', 'One', ',', 'but', 'the', 'last', 'flight', 'to', 'SBN', 'is', 'at', '8:20', ',', '5', 'mins', 'from', 'now', 'we', 'just', 'landed', '.'], 'negative')


In [42]:
#therefore we have got the training data in nltk suitable format.

# DATA CLEANING:

In [46]:
#getting all stopwords and punctuations:
from nltk.corpus import stopwords
import string
stops=set(stopwords.words('english'))
punctuations=string.punctuation
stops.update(punctuations)

In [47]:
from nltk import pos_tag

In [48]:
#defining function which will convert pos_tag parts of speech into wordnet part of speech:
from nltk import WordNetLemmatizer as wnl
from nltk.corpus import wordnet
lemmatizer=wnl()
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [50]:
#defining function to clean documents:
def clean_review(words):
    output_words=[]
    for w in words:
        if w.lower() not in stops:
            pos=pos_tag([w])
            clean_word=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [51]:
dtrain=[(clean_review(document),category) for document,category in dtrain]

In [66]:
# therefore now our training data set is completely clean and ready.

# getting our x_train and y_train ready for count vectorizer:

In [54]:
y_train=[category for document,category in dtrain]

In [56]:
x_train=[' '.join(document) for document,category in dtrain]#since count_vectorizer takes array of sentence and not array of words.

# getting test data alson ready for count vectorizer:

In [93]:
testing_data=pd.read_csv(r'C:\Users\hp\Documents\0000000000002747_test_twitter_x_test.csv')

In [94]:
testing_data.head()

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)


In [95]:
testing_data=np.array(testing_data['text'])

In [96]:
testing_data[0]

"@AmericanAir In car gng to DFW. Pulled over 1hr ago - very icy roads. On-hold with AA since 1hr. Can't reach arpt for AA2450. Wat 2 do?"

In [97]:
dtest=[(word_tokenize(doc)) for doc in testing_data ]

In [98]:
dtest=[(clean_review(document)) for document in dtest]

In [100]:
x_test=[' '.join(document) for document in dtest]

# applying count  vectorizer:

In [101]:
#we have our x_train,y_train and x_test ready.

In [102]:
from sklearn.feature_extraction.text import CountVectorizer as cv

In [103]:
count_vec=cv(max_features=3000,ngram_range=(1,2))
train_data=count_vec.fit_transform(x_train)

In [104]:
test_data=count_vec.transform(x_test)

In [105]:
#now we have our dataset ready in matrix format and we can easily apply sklearn classifiers on them.

# applying sklearn classifiers:

SUPPORT VECTOR MACHINE:

In [106]:
from sklearn.svm import SVC
svc=SVC()

In [107]:
svc.fit(train_data,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [109]:
y_pred_svm=svc.predict(test_data)

In [110]:
df=pd.DataFrame(y_pred_svm)
df.to_csv('predictions_svm_twitter_sentimental.csv', index = False, header = False)

MULTINOMIAL NAIVE BAYES:

In [111]:
from sklearn.naive_bayes import MultinomialNB as mn
naive_bayes=mn()

In [112]:
naive_bayes.fit(train_data,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [113]:
y_pred_nb=naive_bayes.predict(test_data)

In [114]:
df=pd.DataFrame(y_pred_nb)
df.to_csv('predictions_naive_bayes_twitter_sentimental.csv', index = False, header = False)

RANDOM FOREST

In [115]:
from sklearn.ensemble import RandomForestClassifier as rf

  from numpy.core.umath_tests import inner1d


In [116]:
random_forest=rf()
random_forest.fit(train_data,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [117]:
y_pred_rf=random_forest.predict(test_data)

In [118]:
df=pd.DataFrame(y_pred_nb)
df.to_csv('predictions_random_forest_twitter_sentimental.csv', index = False, header = False)

MULTINOMIAL NAIVE BAYES AND RANDOM FOREST GAVE BEST ACCURACY.