## Import Libraries

In [66]:
import warnings
warnings.filterwarnings('ignore')

In [67]:
!pip install contractions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re, string, unicodedata
import contractions                                     
from bs4 import BeautifulSoup                           
import nltk  

nltk.download('punkt')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pavanksu2009\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pavanksu2009\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Import Dataset

In [68]:
# Read the file
data=pd.read_csv('Analytical_base_table.csv')

In [71]:
# Print first five rows of dataset
data.head()

Unnamed: 0,text,airline_sentiment
0,virginamerica what dhepburn say,1
1,virginamerica plus you have add commercials ...,2
2,virginamerica i do not today must mean i n...,1
3,virginamerica it be really aggressive to bla...,0
4,virginamerica and it be a really big bad thi...,0


In [73]:
# Check the value count of airline_sentiment
data.airline_sentiment.value_counts()

0    9178
1    3099
2    2363
Name: airline_sentiment, dtype: int64

--> "0" represents "negative" sentiments

--> "1" represents "neutral" sentiments

--> "2" represents "positive" sentiments

## Vectorization

In [74]:
## Count vectorizer
vectorizer = CountVectorizer(max_features=1000)                # Keep only 1000 features as number of features will increase the processing time.
data_count = vectorizer.fit_transform(data['text'])

data_count = data_count.toarray()                        # Convert the data features to array.
data_count.shape

(14640, 1000)

In [75]:
## TFIDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
data_tfidf = vectorizer.fit_transform(data['text'])

data_tfidf = data_tfidf.toarray()                        # Convert the data features to array.
data_tfidf.shape

(14640, 1000)

## Use Random Forest model

In [76]:
# Get target column
labels = data['airline_sentiment']

# Split data into training and testing set for Count Vector
x_train, x_test, y_train, y_test = train_test_split(data_count, labels, test_size=0.3, random_state=42)

In [77]:
forest = RandomForestClassifier(n_estimators=10, n_jobs=4)
forest = forest.fit(x_train, y_train)
print(forest)

print(np.mean(cross_val_score(forest, data_count, labels, cv=10)))

RandomForestClassifier(n_estimators=10, n_jobs=4)
0.7173497267759563


In [78]:
# Predict the result
result = forest.predict(x_test)

# Print confusion matrix
conf_mat = confusion_matrix(y_test, result)
print(conf_mat)

[[2628  135   51]
 [ 456  363   65]
 [ 249  112  333]]


In [79]:
# Split data into training and testing set for TFIDF Vector
x_train, x_test, y_train, y_test = train_test_split(data_tfidf, labels, test_size=0.3, random_state=42)

In [80]:
forest = RandomForestClassifier(n_estimators=10, n_jobs=4)
forest = forest.fit(x_train, y_train)
print(forest)

print(np.mean(cross_val_score(forest, data_count, labels, cv=10)))

RandomForestClassifier(n_estimators=10, n_jobs=4)
0.7200136612021859


In [31]:
import joblib 
joblib.dump(forest, 'challa_forest.sav')

['challa_forest.sav']

In [32]:
loaded_model = joblib.load('challa_forest.sav')
loaded_model.score(x_test, y_test)

0.7518214936247724

In [30]:
# Predict the result
result = forest.predict(x_test)

# Print confusion matrix
conf_mat = confusion_matrix(y_test, result)
print(conf_mat)

[[2642  120   52]
 [ 480  339   65]
 [ 279   94  321]]


## Summary

* We used the text of the tweet and the sentiment represented by the tweet which could be either positive, negative or neutral.
* The objective was to build a classification model.
* The text pre-processing was done by removing HTML tags, replacing contractions, removing numbers, special characters and punctuations. We also converted text to lower case and used lemmatization for tokenized words.
* This pre-processed data was then converted to numbers using vectorization techniques; count vectorization and TFIDF vectorization to be used for random forest classifier.
* The random forest classifier was used for predicting the results.
* For count vectorization technique, we got a cross validation score of 71.7% whereas for TFIDF vectorization, we got 72% 10 fold cross validation score.
* The performance of the model can be increased by using different classification models or neural networks besides changing the number of features for vectorization and also by using other pre-processing techniques like removing stop words etc.