# تحلیل احساسات جریان داده های میکروبلاگ
## Mohammad Hossein malekpour | 9613425
_____________________________________________________________________________________


## Import Necessary Libraries

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

## Load Data

In [2]:
train_data = pd.read_csv("airline-train.csv")
dev_data = pd.read_csv("airline-dev.csv")
test_data = pd.read_csv("airline-test.csv")

In [3]:
data = pd.concat([train_data, dev_data, test_data])

## Pre-Proceess Data

Using CountVectorizer to convert text into tokens/features.
- A built-in stop word list for English is used.
- Convert all characters to lowercase before tokenizing.
- by regex select tokens of 3 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).
- max_df: ignore terms that have a document frequency strictly higher than the given threshold (0.80 represents a proportion of documents).
- min_df: ignore terms that have a document frequency strictly lower than the given threshold.

In [4]:
vect = CountVectorizer(stop_words='english', lowercase = True, token_pattern = r'(?u)\b\w\w\w+\b', ngram_range = (1,1), max_df = .80, min_df = 4)

<br>Splitted training test with test size of 20%

In [5]:
X = data.text
y = data.airline_sentiment
y = y.replace(['positive','neutral', 'negative'],[1, 0, -1])
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1, test_size= 0.2)

<br> Using training data to transform text into counts of features for each tweet:

In [6]:
vect.fit(X_train)
X_train_dtm = vect.transform(X_train) 
X_test_dtm = vect.transform(X_test)

## Classifier Comparison
Used the following models to train on training data.
- Naive Bayes
- SVM (Support Vector Machine)
- KNN (K Nearest Neighbors)

and tested models on test data and calculated accuracy of predictions.

Accuracy using Naive Bayes Model:

In [7]:
NB = MultinomialNB()
NB.fit(X_train_dtm, y_train)
y_pred = NB.predict(X_test_dtm)
print('\nNaive Bayes')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


Naive Bayes
Accuracy Score: 77.52732240437157%
Confusion Matrix: 
[[1653  150   51]
 [ 255  299   58]
 [  94   50  318]]


<br>Accuracy using SVM Model:

In [8]:
SVM = LinearSVC()
SVM.fit(X_train_dtm, y_train)
y_pred = SVM.predict(X_test_dtm)
print('\nSupport Vector Machine')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


Support Vector Machine
Accuracy Score: 77.04918032786885%
Confusion Matrix: 
[[1574  210   70]
 [ 179  361   72]
 [  82   59  321]]


<br>Accuracy using KNN Model:

In [9]:
KNN = KNeighborsClassifier(n_neighbors = 3)
KNN.fit(X_train_dtm, y_train)
y_pred = KNN.predict(X_test_dtm)
print('\nK Nearest Neighbors (NN = 3)')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


K Nearest Neighbors (NN = 3)
Accuracy Score: 52.424863387978135%
Confusion Matrix: 
[[895 875  84]
 [144 419  49]
 [ 89 152 221]]


## Naive Bayes Analysis

In [10]:
tokens_words = vect.get_feature_names()
counts = NB.feature_count_
df_table = {'Token':tokens_words,'Positive': counts[1,:], 'Neutral': counts[0,:], 'Negative': counts[-1,:]}
tokens = pd.DataFrame(df_table, columns= ['Token','Positive','Neutral','Negative'])
positives = len(tokens[tokens['Positive']>tokens['Negative']])
print('\nAnalysis')
print('No. of tokens: ',len(tokens_words))
print('No. of positive tokens: ',positives)
print('No. of negative tokens: ',len(tokens_words)-positives)


Analysis
No. of tokens:  2809
No. of positive tokens:  1418
No. of negative tokens:  1391


## Text summary of the precision, recall, F1 score for each class

In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.79      0.48      0.60      1854
           0       0.29      0.68      0.41       612
           1       0.62      0.48      0.54       462

    accuracy                           0.52      2928
   macro avg       0.57      0.55      0.52      2928
weighted avg       0.66      0.52      0.55      2928



## Test a tweet on the best performing model  (Naive Bayes)

Took custom tweet inputs and predicted Positive/Negative/Neutral tweet.

In [12]:
trainingVector = CountVectorizer(stop_words='english', lowercase = True, token_pattern = r'(?u)\b\w\w\w+\b', ngram_range = (1,1), max_df = .80, min_df = 4)
trainingVector.fit(X)
X_dtm = trainingVector.transform(X)
NB = MultinomialNB()
NB.fit(X_dtm, y)
#Input tweet
print('\nTest a custom review message')
print('Enter review to be analysed: ', end=" ")
test = []
test.append(input())
test_dtm = trainingVector.transform(test)
predLabel = NB.predict(test_dtm)
tags = ['Negative','Positive']
#Display Output
print('The review is predicted',tags[predLabel[0]])


Test a custom review message
Enter review to be analysed:  

 I really appreciate the details of the travel. It was an awesome experience.


The review is predicted Positive
