# Twitter Sentiment Analysis

In [None]:
# import string
import nltk
from nltk.corpus import wordnet,stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV,cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
training_data=pd.read_csv('training_twitter_x_train.csv')

In [3]:
test_text = pd.read_csv('training_twitter_x_train.csv')['text']

In [4]:
texts = training_data['text']
Y_train = training_data['airline_sentiment']

In [5]:
X =[]
X_test = []
for text in texts:
    X.append(word_tokenize(text))

for text in test_text:
    X_test.append(word_tokenize(text))

In [6]:
stopwords = stopwords.words('english') + list(string.punctuation)
lemmatizer = WordNetLemmatizer()

In [7]:
def clean_doc(doc):
    clean_words=[]
    for word in doc:
        if word.lower() not in stopwords and ( word.isalpha()):
            clean_words.append( lemmatizer.lemmatize( word , simple(pos_tag([word])[0][1])).lower())
    return clean_words

In [8]:
def simple(tag):
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('A'):
        return wordnet.ADV
    if tag.startswith('J'):
        return wordnet.ADJ
    else:
        return wordnet.NOUN

In [9]:
X_train= [ ' '.join( clean_doc(doc)) for doc in X ]
X_test= [ ' '.join( clean_doc(doc)) for doc in X_test ]

In [10]:
X_test[0]

'southwestair schedule morning day fact yes sure even flight one cancelled flightled'

In [11]:
Y_train = Y_train.replace({
    'negative' : 0,
    'neutral' : 1,
    'positive' : 2,
})
Y_train

0        0
1        2
2        2
3        0
4        0
        ..
10975    1
10976    2
10977    0
10978    0
10979    0
Name: airline_sentiment, Length: 10980, dtype: int64

In [14]:
cv = CountVectorizer(max_features=3000)
x_train_features=cv.fit_transform(X_train)

In [15]:
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
from sklearn.svm import SVC
svc=SVC()

In [21]:
cross_val_score(svc,x_train_features,Y_train).mean()

0.7761384335154827

In [22]:
from sklearn.naive_bayes import MultinomialNB
model2 = MultinomialNB()
cross_val_score(model2,x_train_features,Y_train).mean()

0.7645719489981786

In [41]:
from sklearn.linear_model import LogisticRegression
model3 = LogisticRegression(max_iter=2000,multi_class='ovr')
cross_val_score(model3,x_train_features,Y_train).mean()

0.785792349726776

In [40]:
from sklearn.ensemble import RandomForestClassifier
model4 = RandomForestClassifier()
cross_val_score(model4,x_train_features,Y_train).mean()

0.7555555555555556

In [43]:
model3.fit(x_train_features,Y_train)

In [53]:
X_test_transform=cv.transform(X_test)

In [54]:
X_test_transform

<10980x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 94857 stored elements in Compressed Sparse Row format>

In [56]:
prediction=model3.predict(X_test_transform)

In [57]:
prediction = pd.Series(prediction).replace({
    0:'negative',
    1:'neutral',
    2:'positive'
})
prediction

0        negative
1        positive
2        positive
3         neutral
4        negative
           ...   
10975     neutral
10976    positive
10977    negative
10978    negative
10979    negative
Length: 10980, dtype: object