In [94]:
import pandas as pd
import numpy as np

In [95]:
# Load the labelled reviews; the file is decoded as ISO-8859-1 (Latin-1)
# instead of pandas' default UTF-8.
data = pd.read_csv('TestData.csv', encoding = "ISO-8859-1" )

In [96]:
data

Unnamed: 0,Review,Sentiment
0,"The app worked great for awhile, but now I can...",Negative
1,The app used to work very well. Recently (sinc...,Negative
2,The app crashed right before I had to scan my ...,Negative
3,Delta's app and website are terrible. It never...,Negative
4,Really cool app. Used it for my trip to and fr...,Positive
...,...,...
124,Probably one of the best airline apps availabl...,Negative
125,I have to take another star away from this app...,Negative
126,When I initially installed the app I would hav...,Negative
127,Terrible experience!!!!! All the information I...,Negative


In [97]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 2 columns):
Review       129 non-null object
Sentiment    129 non-null object
dtypes: object(2)
memory usage: 2.1+ KB


In [98]:
data.nunique()

Review       129
Sentiment      2
dtype: int64

In [99]:
from bs4 import BeautifulSoup

In [100]:
import re
import tqdm
import unicodedata

In [101]:
# Build the train and test datasets with a purely positional split:
# the first 100 rows are used for training, every remaining row for testing.
reviews = data['Review'].values
sentiments = data['Sentiment'].values

train_reviews, test_reviews = reviews[:100], reviews[100:]
train_sentiments, test_sentiments = sentiments[:100], sentiments[100:]

In [102]:
def strip_html_tags(text):
  """Return `text` with HTML markup removed.

  The input is parsed with BeautifulSoup's "html.parser". Every <iframe> and
  <script> element is removed from the tree, the remaining text content is
  extracted, and each run of carriage-return / line-feed characters is
  collapsed into a single newline.

  Parameters:
    text: the raw (possibly HTML) string to clean.

  Returns:
    The extracted plain text.
  """
  soup = BeautifulSoup(text, "html.parser")
  # extract() is called only for its side effect (detaching the element from
  # the tree), so use a plain loop instead of building a throwaway list.
  for tag in soup(['iframe', 'script']):
    tag.extract()
  stripped_text = soup.get_text()
  # Inside a character class '|' is a literal, not alternation: the previous
  # pattern [\r|\n|\r\n] therefore also turned every '|' into a newline.
  stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
  return stripped_text

In [2]:
def remove_accented_chars(text):
  """Return `text` reduced to its ASCII characters.

  The string is NFKD-normalised (so accented letters decompose into a base
  letter plus combining marks), encoded to ASCII while silently dropping
  anything that cannot be represented, and decoded back to `str`.
  """
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

In [104]:
def pre_process_corpus(docs):
  """Normalise an iterable of raw review strings.

  Each document goes through, in order: HTML stripping (strip_html_tags),
  replacement of newline/tab/carriage-return with a space, lower-casing,
  accent removal (remove_accented_chars), removal of every character that is
  not an ASCII letter, digit or whitespace, collapsing of repeated spaces,
  and trimming of leading/trailing whitespace. Progress is reported through
  tqdm.

  Parameters:
    docs: iterable of strings.

  Returns:
    A list of normalised strings, one per input document, in input order.
  """
  # Translation table mapping "\n", "\t" and "\r" to a space; it does not
  # depend on the document, so build it once instead of once per iteration.
  whitespace_to_space = str.maketrans("\n\t\r", "   ")

  norm_docs = []
  for doc in tqdm.tqdm(docs):
    doc = strip_html_tags(doc)
    doc = doc.translate(whitespace_to_space)
    doc = doc.lower()
    doc = remove_accented_chars(doc)
    # (contraction expansion via contractions.fix is currently disabled)
    # Remove special characters. The flags MUST be passed by keyword: the
    # fourth positional parameter of re.sub is `count`, so the previous call
    # silently capped the number of removals at int(re.I|re.A) == 258.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A)
    doc = re.sub(' +', ' ', doc)
    doc = doc.strip()
    norm_docs.append(doc)

  return norm_docs

In [105]:
%%time

# Run the text-normalisation pipeline over the training and the held-out reviews.
norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)

100%|██████████| 100/100 [00:00<00:00, 3851.70it/s]
100%|██████████| 29/29 [00:00<00:00, 7266.55it/s]


Wall time: 43.8 ms


In [106]:
norm_train_reviews

['the app worked great for awhile but now i cant even look into booking a flight if you put in an airport in the from and to sections it stays blank the number one feature being able to book or check out potential rates and prices is unusable the app was never great but if it fails to perform its primary function then it is trash just use the website',
 'the app used to work very well recently since jan 2020 it is prone to glitches and ghost notifications check in gets stuck and abandon the app to check in on website if first leg of flight requires checking in on partner airline klm the app wont get past the check in and is worthless for the rest of the trip baggage notifications work so 1 extra star for that but other ghost notifications are annoyances attempts to work out issues with delta chat reps had no solution bummer',
 'the app crashed right before i had to scan my boarding pass it also wiped out all of my trips tried logging out and could not log back in tried resetting passwo

In [107]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [108]:

# Vocabulary settings shared by both vectorizers: unigrams and bigrams, with
# min_df=5 and max_df=1.0 as document-frequency bounds.
_vocab_params = {'min_df': 5, 'max_df': 1.0, 'ngram_range': (1, 2)}

# Bag-of-words (raw term counts) features, fitted on the training reviews.
cv = CountVectorizer(binary=False, **_vocab_params)
cv_train_features = cv.fit_transform(norm_train_reviews)

# TF-IDF features with sublinear term-frequency scaling, fitted on the same reviews.
tv = TfidfVectorizer(use_idf=True, sublinear_tf=True, **_vocab_params)
tv_train_features = tv.fit_transform(norm_train_reviews)

In [109]:
# Project the held-out reviews onto the vocabularies already fitted on the
# training reviews (transform only; the vectorizers are not re-fitted).
cv_test_features = cv.transform(norm_test_reviews)
tv_test_features = tv.transform(norm_test_reviews)

In [110]:
# Sanity check: train and test matrices must share the same number of feature
# columns for each vectorizer.
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (100, 219)  Test features shape: (29, 219)
TFIDF model:> Train features shape: (100, 219)  Test features shape: (29, 219)


In [111]:
# Logistic Regression model on BOW features
from sklearn.linear_model import LogisticRegression

# instantiate model: L2-penalised logistic regression (C=1) with the lbfgs
# solver, at most 500 iterations, and a fixed random_state
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42)

# train model on the bag-of-words features of the training reviews
lr.fit(cv_train_features, train_sentiments)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [112]:
# predict on test data (bag-of-words features of the held-out reviews)
lr_bow_predictions = lr.predict(cv_test_features)

In [137]:
from sklearn.metrics import confusion_matrix, classification_report

# `labels` holds the display names for the confusion-matrix axes;
# `class_values` holds the matching class values as stored in the data,
# in the same order.
labels = ['negative', 'positive']
class_values = ['Negative', 'Positive']

print(classification_report(test_sentiments, lr_bow_predictions))
# Pass the class order explicitly so the matrix is always 2x2 and its rows and
# columns are guaranteed to line up with the display names, even when a class
# is missing from this split. Rows are the true labels, columns the predictions.
pd.DataFrame(confusion_matrix(test_sentiments, lr_bow_predictions, labels=class_values),
             index=pd.Index(labels, name='actual'),
             columns=pd.Index(labels, name='predicted'))

              precision    recall  f1-score   support

    Negative       0.96      0.93      0.95        28
    Positive       0.00      0.00      0.00         1

    accuracy                           0.90        29
   macro avg       0.48      0.46      0.47        29
weighted avg       0.93      0.90      0.91        29



Unnamed: 0,negative,positive
negative,26,2
positive,1,0


In [165]:
# Load the two additional review sets. The Delta file is read with pandas'
# default encoding, the Air Canada file is decoded as ISO-8859-1.
DeltaTest = pd.read_csv('FlyDelta_Review_1.csv')
AirCanadaTest = pd.read_csv('AirCanada.csv', encoding = "ISO-8859-1")

In [170]:
DeltaTest['Sentiment'].unique()

array(['Negative', 'Positive'], dtype=object)

In [167]:
# Fill missing sentiment labels with the most frequent label (first mode of the column).
DeltaTest['Sentiment'] = DeltaTest['Sentiment'].fillna(DeltaTest['Sentiment'].mode()[0])

In [169]:
# Normalise the stray "Negative " label (trailing space) to "Negative".
# The result is assigned back to the column: calling replace(inplace=True) on
# the `DeltaTest.Sentiment` accessor is a chained in-place operation, which
# does not update the parent frame under pandas Copy-on-Write.
DeltaTest['Sentiment'] = DeltaTest['Sentiment'].replace({"Negative ": "Negative"})

In [171]:
# Stack the Delta and Air Canada reviews row-wise into one frame;
# ignore_index=True gives the result a fresh 0..n-1 index.
Testdata = pd.concat([DeltaTest,AirCanadaTest],axis=0,sort=False,ignore_index=True)

In [172]:
Testdata

Unnamed: 0,Review,Sentiment
0,It is not user friendly. It doesn't go to the ...,Negative
1,I am disappointed with this app! good thing i ...,Negative
2,I think Delta has a great app. Thanks Delta!,Positive
3,Garbage - keeps losing my trips when I'm logge...,Negative
4,The app should display your upcoming flight de...,Negative
...,...,...
300,What a horrible update! Trading updated color ...,Negative
301,It would not download. I had a previous versio...,Negative
302,Updated app: Have had to re-enter same informa...,Negative
303,The updated app as a serious problem with the ...,Negative


In [173]:
# Review texts of the combined evaluation set, as an array.
Testreviews = Testdata['Review'].values

In [174]:
Testdata['Sentiment'].unique()

array(['Negative', 'Positive'], dtype=object)

In [175]:
Testdata['Sentiment'].unique()

array(['Negative', 'Positive'], dtype=object)

In [176]:
# Ground-truth sentiment labels of the combined evaluation set, as an array.
Testsentiments = Testdata['Sentiment'].values

In [177]:
Testdata['Sentiment'].isna().value_counts()

False    305
Name: Sentiment, dtype: int64

In [178]:
# Normalise the combined evaluation reviews with the same pipeline used for
# the training reviews.
norm_testreviews = pre_process_corpus(Testreviews)

100%|██████████| 305/305 [00:00<00:00, 5083.18it/s]


In [179]:
norm_testreviews

['it is not user friendly it doesnt go to the next screen properly sometimes makes using it a challenge i have tried to make a reservation several times but when i select a flight it takes me to another screen wants me to upgrade i cant get out of that upgrade screen to move forward ergo i may have to switch airlines to book my trip',
 'i am disappointed with this app good thing i had screenshot my boarding pass went to the airport and went to the original gate only to find out that they have switched to another gate i could not get this new information because every time i click to view boarding pass app crashes',
 'i think delta has a great app thanks delta',
 'garbage keeps losing my trips when im logged in',
 'the app should display your upcoming flight details other airlines show your bookings in the app this app requires you to enter the booking reference first to find your booking which means you have to store that info elsewhere should be a one stop shop for your delta flight',

In [180]:
# Vectorise the new reviews with the already-fitted vectorizers (transform
# only), so the columns match the features the models were trained on.
cv_new_test_features = cv.transform(norm_testreviews)
tv_new_test_features = tv.transform(norm_testreviews)

In [63]:
print('BOW model:>  New Test features shape:', cv_new_test_features.shape)
print('TFIDF model:>  New Test features shape:', tv_new_test_features.shape)

BOW model:>  New Test features shape: (305, 219)
TFIDF model:>  New Test features shape: (305, 219)


In [181]:
cv_new_test_features

<305x219 sparse matrix of type '<class 'numpy.int64'>'
	with 7895 stored elements in Compressed Sparse Row format>

In [182]:
# predict on new test data
# NOTE(review): `lr` is re-fitted on TF-IDF features in a later cell; this call
# passes BOW features, so it presumably expects the BOW fit to be the most
# recent one -- confirm the execution order.
lr_new_predictions = lr.predict(cv_new_test_features)

In [183]:
lr_new_predictions

array(['Negative', 'Negative', 'Positive', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Positive', 'Negative',
       'Negative', 'Negative', 'Positive', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Negative', 'Negative', 'Positive', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Negative', 'Positive', 'Negative', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Negative', 'Positive',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Positive', 'Negative',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negati

In [128]:
# Wrap the predicted labels in a single-column DataFrame named "Sentiment".
test_pred = pd.DataFrame(lr_new_predictions,columns= ["Sentiment"])

In [129]:
# Put each review text next to its predicted sentiment. Concatenation along
# axis=1 aligns rows on the index of the two frames.
pred_result = pd.concat([Testdata[["Review"]],test_pred] ,axis = 1)

In [131]:
pred_result['Sentiment'].nunique()

2

In [184]:
from sklearn.metrics import confusion_matrix, classification_report

# `labels` holds the display names for the confusion-matrix axes;
# `class_values` holds the matching class values as stored in the data,
# in the same order.
labels = ['negative', 'positive']
class_values = ['Negative', 'Positive']

print(classification_report(Testsentiments, lr_new_predictions))
# Pass the class order explicitly so the matrix is always 2x2 and its rows and
# columns are guaranteed to line up with the display names.
# Rows are the true labels, columns the predictions.
pd.DataFrame(confusion_matrix(Testsentiments, lr_new_predictions, labels=class_values),
             index=pd.Index(labels, name='actual'),
             columns=pd.Index(labels, name='predicted'))

              precision    recall  f1-score   support

    Negative       0.93      0.91      0.92       282
    Positive       0.17      0.22      0.19        23

    accuracy                           0.86       305
   macro avg       0.55      0.56      0.56       305
weighted avg       0.88      0.86      0.87       305



Unnamed: 0,negative,positive
negative,257,25
positive,18,5


In [28]:
# Logistic Regression model on TF-IDF features

# train model
# NOTE: this re-fits the existing `lr` estimator in place, replacing whatever
# it was fitted on before; afterwards `lr` expects TF-IDF features.
lr.fit(tv_train_features, train_sentiments)

# predict on test data
lr_tfidf_predictions = lr.predict(tv_test_features)

In [29]:
# `labels` holds the display names for the confusion-matrix axes;
# `class_values` holds the matching class values as stored in the data,
# in the same order.
labels = ['negative', 'positive']
class_values = ['Negative', 'Positive']

print(classification_report(test_sentiments, lr_tfidf_predictions))
# Pass the class order explicitly so the matrix is always 2x2 and its rows and
# columns are guaranteed to line up with the display names, even when a class
# is never predicted. Rows are the true labels, columns the predictions.
pd.DataFrame(confusion_matrix(test_sentiments, lr_tfidf_predictions, labels=class_values),
             index=pd.Index(labels, name='actual'),
             columns=pd.Index(labels, name='predicted'))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

    Negative       0.97      1.00      0.98        28
    Positive       0.00      0.00      0.00         1

    accuracy                           0.97        29
   macro avg       0.48      0.50      0.49        29
weighted avg       0.93      0.97      0.95        29



Unnamed: 0,negative,positive
negative,28,0
positive,1,0


In [30]:
# Random Forest model on BOW features
from sklearn.ensemble import RandomForestClassifier

# instantiate model: 100 trees, n_jobs=-1, fixed random_state
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# train model on the bag-of-words features of the training reviews
rf.fit(cv_train_features, train_sentiments)

# predict on test data
rf_bow_predictions = rf.predict(cv_test_features)

In [31]:
# `labels` holds the display names for the confusion-matrix axes;
# `class_values` holds the matching class values as stored in the data,
# in the same order.
labels = ['negative', 'positive']
class_values = ['Negative', 'Positive']

print(classification_report(test_sentiments, rf_bow_predictions))
# Pass the class order explicitly so the matrix is always 2x2 and its rows and
# columns are guaranteed to line up with the display names.
# Rows are the true labels, columns the predictions.
pd.DataFrame(confusion_matrix(test_sentiments, rf_bow_predictions, labels=class_values),
             index=pd.Index(labels, name='actual'),
             columns=pd.Index(labels, name='predicted'))

              precision    recall  f1-score   support

    Negative       0.96      0.96      0.96        28
    Positive       0.00      0.00      0.00         1

    accuracy                           0.93        29
   macro avg       0.48      0.48      0.48        29
weighted avg       0.93      0.93      0.93        29



Unnamed: 0,negative,positive
negative,27,1
positive,1,0


In [32]:
# Random Forest model on TF-IDF features

# train model
# NOTE: this re-fits the existing `rf` estimator in place, replacing whatever
# it was fitted on before; afterwards `rf` expects TF-IDF features.
rf.fit(tv_train_features, train_sentiments)

# predict on test data
rf_tfidf_predictions = rf.predict(tv_test_features)

In [33]:
# `labels` holds the display names for the confusion-matrix axes;
# `class_values` holds the matching class values as stored in the data,
# in the same order.
labels = ['negative', 'positive']
class_values = ['Negative', 'Positive']

print(classification_report(test_sentiments, rf_tfidf_predictions))
# Pass the class order explicitly so the matrix is always 2x2 and its rows and
# columns are guaranteed to line up with the display names, even when a class
# is never predicted. Rows are the true labels, columns the predictions.
pd.DataFrame(confusion_matrix(test_sentiments, rf_tfidf_predictions, labels=class_values),
             index=pd.Index(labels, name='actual'),
             columns=pd.Index(labels, name='predicted'))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

    Negative       0.97      1.00      0.98        28
    Positive       0.00      0.00      0.00         1

    accuracy                           0.97        29
   macro avg       0.48      0.50      0.49        29
weighted avg       0.93      0.97      0.95        29



Unnamed: 0,negative,positive
negative,28,0
positive,1,0
