# Import Libraries

In [1]:
# data related
import pandas as pd
import numpy as np
import seaborn as sb

# general libraries
import pickle
import time
from copy import deepcopy

# display libraries
from IPython.core.display import display, HTML
pd.set_option("display.max_rows", 8000)
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_colwidth', -1)
display(HTML("<style>.container { width:99% !important; }</style>"))

  from IPython.core.display import display, HTML
  pd.set_option('display.max_colwidth', -1)


# Save Files

In [65]:
def dump_file(folder_path, file_name, file_to_dump):
    import pickle
    with open(f'{folder_path}/{file_name}.pkl', 'wb') as file:
        pickle.dump(file_to_dump, file)

In [66]:
dump_file(folder_path='IE4483_Mini_Project', file_name='df_train', file_to_dump=df_train)
dump_file(folder_path='IE4483_Mini_Project', file_name='lr_regressors_smote', file_to_dump=lr_regressors_smote)
dump_file(folder_path='IE4483_Mini_Project', file_name='lr_regressors_ros', file_to_dump=lr_regressors_ros)

# Read Saved Files

An easy way is to run the below cells to get all the required files

In [67]:
def open_file(folder_path, file_name):
    import pickle
    with open(f'{folder_path}/{file_name}.pkl', 'rb') as file:
        saved_file = pickle.load(file)
    return saved_file

file_names = ['df_train', 'df_test', 'lr_regressors_smote', 'lr_regressors_ros']
df_train, df_test, lr_regressors_smote, lr_regressors_ros = open_file('IE4483_Mini_Project', file_names[0]), open_file('IE4483_Mini_Project', file_names[1]), open_file('IE4483_Mini_Project', file_names[2]), open_file('IE4483_Mini_Project', file_names[3])

# The problem at hand

- Objective: given a set of reviews, predict whether it is a positive (1) or negative (0) review
- This is a text classification problem
- this can be seen as a binary classification problem and LogisticRegression can be used

# Training Data Exploration

- 1 means positive, 0 negative

In [2]:
df_train = pd.read_json('IE4483_Mini_Project/train.json')

Here we see that there are more positive reviews than negative reviews, resulting in an unbalanced dataset \
We consider two techniques to over sample the negative class later on:
1. SMOTE (synthetic minority oversampling technique)
2. RandomOverSampling

In [3]:
df_train['sentiments'].value_counts()

1    6319
0    1082
Name: sentiments, dtype: int64

In [4]:
df_train[0:10]

Unnamed: 0,reviews,sentiments
0,I bought this belt for my daughter in-law for Christmas and she loved it,1
1,The size was perfect and so was the color. It looked just like it did on the web page:),1
2,"Fits and feels good, esp. for doing a swim race. Highly recommend this. \n\nBe careful when wearing board shorts with velcro zipper straps over this material. The velcro from the board shorts tend to stick very easily to the material of this swim trunks. Have to be careful when removing the velcro from any shorts or pants. I had this happen and the velcro almost damaged the swim trunks when I was removing it.",1
3,"These socks are absolutely the best. I take pilates classes and if it is hot your feet can slip off the mat. (Not good in the middle of a side plank!)Anyway these socks keep your feet planted and dry. One note; make sure not to order them big, the idea is that they should be snug",1
4,Thank you so much for the speedy delivery they came in time for the rehearsal dinner. I loved the red silk neck ties and they really made all the ushers stand out. They made very nice thank you gifts. We were all pleased.\nmarjorie300,1
5,"I bought this shoe in black and bronze. I do a lot of running around and this is a very comfortable shoe, true to size and really cute",1
6,"These shorts are great for all around use. They look great and are comfortable for swimming or just wearing around. I am in college and wear these to class as they are lightweight and still have enough pockets to carry everything I need. they are alos stiffer than a lot of swim trunks, so they don't flop around too bad if your pockets are ful",1
7,"They look great. I just tried them on and they are already coming apart. It's too bad, 'cause they look great",0
8,"Very comfortable, but the straps keep coming loose.",1
9,"trans. was great. love the shirt!! I know it's a lot for a t-shirt, but I had to have it!!",1


# Training Data - Data Cleaning

There are known steps that we can follow to improve the quality of training data for our model \
We create a function to apply the same set of steps to clean the reviews data

# Create the cleaner function

In [5]:
def text_cleaner(text):
    """
    This functions takes in a text review and does the following steps:
    1. Decode extra \ before any unicode character (emojis)
    2. Handle apostrophe before mapping contractions, replace double quotation with single quotation marks
    3. Map contraction to the original form. e.g. I've -> I Have
    4. Parse text with spacy
    5. Filter punctuations, white space, numbers, URL using spacy methods
    6. Lemmatize each spacy token using .lemma_
    7. Remove any special characters using regular expressions
    8. Remove single length spacy tokens
    9. Spell correction dealing with repeated characters
    """
    
    import codecs
    import unidecode
    import re
    import spacy
    nlp = spacy.load("en_core_web_sm")
    
    contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                       "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" } 
    
    # decoding: unicode_escape for the extra "\" before unicode character, then unidecode
    try:
        decoded = unidecode.unidecode(codecs.decode(text, 'unicode_escape'))    
    except:
        decoded = unidecode.unidecode(text)
    
    # handling Apostrophe, people usually use single quote ' or double quotes " for contraction. We remove double quotation contractions so that we can map it to the correct words later on
    apostrophe_handled = re.sub("’", "'", decoded)
    
    # contraction check, check for any contraction form, and replace with its original form
    expanded = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in apostrophe_handled.split(" ")])
    
    # parse words with  spacy
    parsed = nlp(expanded)
    
    # ============= using spacy library, we can filter out puncutation, white space, numbers to improve quality of content in reviews =============
    final_tokens = []
    for t in parsed:
        # over here for each word in the sentence, we skip numbers, punctuation and white space
        if t.is_punct or t.is_space or t.like_num or t.like_url:
            pass
        else:
            if t.lemma_== '-PRON-':
                # we keep the words that are prononuns here since spacy transforms every pronoun to -PRON-
                final_tokens.append(str(t))
            else:
                # remove special characters using regex
                sc_removed = re.sub("[^a-zA-Z]", '', str(t.lemma_))
                # we check here if there exists any string once we subsitute the characters with blanks. If len = 1 it means it contains special characters
                if len(sc_removed) > 1:
                    final_tokens.append(sc_removed)

    # join back the list of words into a string
    joined = ' '.join(final_tokens)

    # we further correct spelling by shortening repetition of words. example "soooo good" will be transformed into "soo good". not a perfect solution but can be worked on
    spell_corrected = re.sub(r'(.)\1+', r'\1\1', joined)
    
    return spell_corrected

In [97]:
# run cleaner function on dataset
%%time
reviews_cleaned = []
reviews = list(df_train['reviews'])
count = 0
total = len(df_train['reviews'])

for r in reviews:
    reviews_cleaned.append(text_cleaner(r))
    count+=1
    print(count, "/", total)

1 / 7401
2 / 7401
3 / 7401
4 / 7401
5 / 7401
6 / 7401
7 / 7401
8 / 7401
9 / 7401
10 / 7401
11 / 7401
12 / 7401
13 / 7401
14 / 7401
15 / 7401
16 / 7401
17 / 7401
18 / 7401
19 / 7401
20 / 7401
21 / 7401
22 / 7401
23 / 7401
24 / 7401
25 / 7401
26 / 7401
27 / 7401
28 / 7401
29 / 7401
30 / 7401
31 / 7401
32 / 7401
33 / 7401
34 / 7401
35 / 7401
36 / 7401
37 / 7401
38 / 7401
39 / 7401
40 / 7401
41 / 7401
42 / 7401
43 / 7401
44 / 7401
45 / 7401
46 / 7401
47 / 7401
48 / 7401
49 / 7401
50 / 7401
51 / 7401
52 / 7401
53 / 7401
54 / 7401
55 / 7401
56 / 7401
57 / 7401
58 / 7401
59 / 7401
60 / 7401
61 / 7401
62 / 7401
63 / 7401
64 / 7401
65 / 7401
66 / 7401
67 / 7401
68 / 7401
69 / 7401
70 / 7401
71 / 7401
72 / 7401
73 / 7401
74 / 7401
75 / 7401
76 / 7401
77 / 7401
78 / 7401
79 / 7401
80 / 7401
81 / 7401
82 / 7401
83 / 7401
84 / 7401
85 / 7401
86 / 7401
87 / 7401
88 / 7401
89 / 7401
90 / 7401
91 / 7401
92 / 7401
93 / 7401
94 / 7401
95 / 7401
96 / 7401
97 / 7401
98 / 7401
99 / 7401
100 / 7401
101 / 74

In [102]:
df_train['reviews_cleaned'] = reviews_cleaned

In [72]:
# assigning pos to 1, neg to 0, for easy reference
df_train['sentiments_classified'] = df_train['sentiments'].apply(lambda x: "pos" if x == 1 else "neg")
df_train[0:10]

Unnamed: 0,reviews,sentiments,reviews_cleaned,sentiments_classified
0,I bought this belt for my daughter in-law for Christmas and she loved it,1,buy this belt for my daughter in law for Christmas and she love it,pos
1,The size was perfect and so was the color. It looked just like it did on the web page:),1,the size be perfect and so be the color it look just like it do on the web page,pos
2,"Fits and feels good, esp. for doing a swim race. Highly recommend this. \n\nBe careful when wearing board shorts with velcro zipper straps over this material. The velcro from the board shorts tend to stick very easily to the material of this swim trunks. Have to be careful when removing the velcro from any shorts or pants. I had this happen and the velcro almost damaged the swim trunks when I was removing it.",1,fit and feel good esp for do swim race highly recommend this be careful when wear board short with velcro zipper strap over this material the velcro from the board short tend to stick very easily to the material of this swim trunk have to be careful when remove the velcro from any short or pant have this happen and the velcro almost damage the swim trunk when be remove it,pos
3,"These socks are absolutely the best. I take pilates classes and if it is hot your feet can slip off the mat. (Not good in the middle of a side plank!)Anyway these socks keep your feet planted and dry. One note; make sure not to order them big, the idea is that they should be snug",1,these sock be absolutely the good take pilate class and if it be hot your foot can slip off the mat not good in the middle of side plankanyway these sock keep your foot plant and dry note make sure not to order they big the idea be that they should be snug,pos
4,Thank you so much for the speedy delivery they came in time for the rehearsal dinner. I loved the red silk neck ties and they really made all the ushers stand out. They made very nice thank you gifts. We were all pleased.\nmarjorie300,1,thank you so much for the speedy delivery they come in time for the rehearsal dinner love the red silk neck tie and they really make all the usher stand out they make very nice thank you gift we be all pleased marjorie,pos
5,"I bought this shoe in black and bronze. I do a lot of running around and this is a very comfortable shoe, true to size and really cute",1,buy this shoe in black and bronze do lot of run around and this be very comfortable shoe true to size and really cute,pos
6,"These shorts are great for all around use. They look great and are comfortable for swimming or just wearing around. I am in college and wear these to class as they are lightweight and still have enough pockets to carry everything I need. they are alos stiffer than a lot of swim trunks, so they don't flop around too bad if your pockets are ful",1,these short be great for all around use they look great and be comfortable for swimming or just wear around be in college and wear these to class as they be lightweight and still have enough pocket to carry everything need they be alo stiff than lot of swim trunk so they do not flop around too bad if your pocket be ful,pos
7,"They look great. I just tried them on and they are already coming apart. It's too bad, 'cause they look great",0,they look great just try they on and they be already come apart it be too bad because they look great,neg
8,"Very comfortable, but the straps keep coming loose.",1,very comfortable but the strap keep come loose,pos
9,"trans. was great. love the shirt!! I know it's a lot for a t-shirt, but I had to have it!!",1,trans be great love the shirt know it be lot for shirt but have to have it,pos


In [73]:
df_train['sentiments_classified'].value_counts()

pos    6319
neg    1082
Name: sentiments_classified, dtype: int64

# Create Model to run imbalanced data

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score

# create the vectorizer, logistic_regression model
vectorizer = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,3))
lr = LogisticRegression()

def logistic_regressor(n_splits, X,Y, pipeline, average_method):
    
    # StratifiedKFold, use this as a tool to improve model performance when dealing with imbalance datasets
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=777)
    
    # create the lists to store the different required information
    lr_regressors, predictions, accuracy, precision, recall, f1 = [], [], [], [], [], []
    
    for train, test in kfold.split(X, Y):
        lr_fit = pipeline.fit(X[train], Y[train])
        prediction = lr_fit.predict(X[test])
        scores = lr_fit.score(X[test], Y[test])
        
        lr_regressors.append(lr_fit)
        predictions.append(prediction)
        accuracy.append(scores * 100)
        precision.append(precision_score(Y[test], prediction, average=average_method)*100)
        print('              negative    positive')
        print('precision:',precision_score(Y[test], prediction, average=None))
        recall.append(recall_score(Y[test], prediction, average=average_method)*100)
        print('recall:   ',recall_score(Y[test], prediction, average=None))
        f1.append(f1_score(Y[test], prediction, average=average_method)*100)
        print('f1 score: ',f1_score(Y[test], prediction, average=None))
        print('-'*50)

    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))
    
    return lr_regressors, predictions

In [7]:
# bulding a pipeline using sklearn to be use for modelling
from sklearn.pipeline import Pipeline
original_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', lr)
])

In [8]:
original_pipeline

We can see that for negative class, he precision is high but the recall is low. If the sentiment is negative, they are really negative. However, it misses a lot of actual negative sentiments. This is mainly due to the fact that the training dataset has a much higher number of positive reviews to negative reviews

In [9]:
# trying to predict with existing model using a toy dataset
test_string_1 = "This shoe is extremely bad!"
test_string_2 = "This shoe is ver good to wear and comfortable"
test_string_3 = "This shoe is not too bad"
test_string_4 = "This shoe is not so comfortable. It rips apart easily and its not simple to wear"
df_test = pd.Series([test_string_1, test_string_2, test_string_3, test_string_4])

In [13]:
lr_regressors, predictions = logistic_regressor(n_splits = 5, X = df_train.reviews_cleaned, Y = df_train.sentiments_classified, pipeline=original_pipeline, average_method='macro')

              negative    positive
precision: [0.97058824 0.87284036]
recall:    [0.15207373 0.99920886]
f1 score:  [0.26294821 0.9317595 ]
--------------------------------------------------
              negative    positive
precision: [0.97435897 0.87647467]
recall:    [0.17592593 0.99920886]
f1 score:  [0.29803922 0.93382625]
--------------------------------------------------
              negative    positive
precision: [0.94871795 0.87578071]
recall:    [0.1712963  0.99841772]
f1 score:  [0.29019608 0.93308688]
--------------------------------------------------
              negative    positive
precision: [0.94444444 0.87396122]
recall:    [0.15740741 0.99841772]
f1 score:  [0.26984127 0.93205318]
--------------------------------------------------
              negative    positive
precision: [0.82926829 0.87282835]
recall:    [0.15668203 0.99445764]
f1 score:  [0.26356589 0.92968172]
--------------------------------------------------
accuracy: 87.58% (+/- 0.25%)
precision: 90.39

Here we create a set of toy data to check if this model can predict the sentments accurately \
We can see that the model does not do well in predicting negative sentiments because it was predominantly trained using positive reviews

In [14]:
toy_string_1 = "This shoe is extremely bad!"
toy_string_2 = "This shoe is ver good to wear and comfortable"
toy_string_3 = "This shoe is not too bad"
toy_string_4 = "This shoe is not so comfortable It rips apart easily and its not simple to wear uncomfortable as hell"
df_toy = pd.Series([toy_string_1, toy_string_2, toy_string_3, toy_string_4])

In [15]:
# the regressor is always giving positive values because of imbalanced dataset
for n, lr in enumerate(lr_regressors):
    print('logistic regressor',n, "predicted values for toy dataset:", lr.predict(df_toy))

logistic regressor 0 predicted values for toy dataset: ['pos' 'pos' 'pos' 'pos']
logistic regressor 1 predicted values for toy dataset: ['pos' 'pos' 'pos' 'pos']
logistic regressor 2 predicted values for toy dataset: ['pos' 'pos' 'pos' 'pos']
logistic regressor 3 predicted values for toy dataset: ['pos' 'pos' 'pos' 'pos']
logistic regressor 4 predicted values for toy dataset: ['pos' 'pos' 'pos' 'pos']


# Deal with imbalance dataset

## SMOTE
We want to try 3 different kinds of oversampling methods for the negative class
1. SMOTE
2. RandomOverSampler

In [16]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler

# create the vectorizer, logistic_regression model
vectorizer = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,3))
lr = LogisticRegression()
# create SMOTE pipeline
smote_pipeline = make_pipeline(vectorizer, SMOTE(random_state=777),lr)

In [17]:
smote_pipeline

We can see a much `higher` recall and precision for the negative class, which is what we desire. High precision and recall for positive class is also present.

### Note: this is the final model we settle on to be use to predict the unobserved test dataset

In [18]:
%%time
lr_regressors_smote, predictions_smote = logistic_regressor(n_splits = 5, X = df_train.reviews_cleaned, 
                                                            Y = df_train.sentiments_classified, pipeline=smote_pipeline, 
                                                            average_method='macro')

              negative    positive
precision: [0.70612245 0.96440129]
recall:    [0.79723502 0.94303797]
f1 score:  [0.74891775 0.9536    ]
--------------------------------------------------
              negative    positive
precision: [0.69626168 0.94707741]
recall:    [0.68981481 0.94857595]
f1 score:  [0.69302326 0.94782609]
--------------------------------------------------
              negative    positive
precision: [0.67206478 0.9594485 ]
recall:    [0.76851852 0.93591772]
f1 score:  [0.71706263 0.94753704]
--------------------------------------------------
              negative    positive
precision: [0.70539419 0.96287328]
recall:    [0.78703704 0.94382911]
f1 score:  [0.74398249 0.95325609]
--------------------------------------------------
              negative    positive
precision: [0.65714286 0.95465587]
recall:    [0.74193548 0.93349169]
f1 score:  [0.6969697  0.94395516]
--------------------------------------------------
accuracy: 91.41% (+/- 0.63%)
precision: 82.25

After oversampling the negative class using the smote technique, the model can better predict negative reviews using the same toy dataset

In [19]:
for n, lr in enumerate(lr_regressors_smote):
    print('logistic regressor',n, "predicted values for toy dataset:", lr.predict(df_toy))

logistic regressor 0 predicted values for toy dataset: ['neg' 'pos' 'neg' 'pos']
logistic regressor 1 predicted values for toy dataset: ['neg' 'pos' 'neg' 'pos']
logistic regressor 2 predicted values for toy dataset: ['neg' 'pos' 'neg' 'pos']
logistic regressor 3 predicted values for toy dataset: ['neg' 'pos' 'neg' 'pos']
logistic regressor 4 predicted values for toy dataset: ['neg' 'pos' 'neg' 'pos']


## Including list of stop words to remove before training model

In [20]:
# get list of stop words to remove
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /media/ntu/volume1/home/s122md304_04/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler

# create the vectorizer, logistic_regression model
vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=100000, ngram_range=(1,3))
lr = LogisticRegression()
# create SMOTE pipeline
smote_pipeline = make_pipeline(vectorizer, SMOTE(random_state=777),lr)

In [76]:
smote_pipeline

In [77]:
lr_regressors_smote, predictions_smote = logistic_regressor(n_splits = 5, X = df_train.reviews_cleaned, 
                                                            Y = df_train.sentiments_classified, pipeline=smote_pipeline, 
                                                            average_method='macro')

              negative    positive
precision: [0.7029703  0.94136044]
recall:    [0.65437788 0.95253165]
f1 score:  [0.6778043  0.94691309]
--------------------------------------------------
              negative    positive
precision: [0.69444444 0.94778481]
recall:    [0.69444444 0.94778481]
f1 score:  [0.69444444 0.94778481]
--------------------------------------------------
              negative    positive
precision: [0.69124424 0.94774347]
recall:    [0.69444444 0.94699367]
f1 score:  [0.69284065 0.94736842]
--------------------------------------------------
              negative    positive
precision: [0.67948718 0.95425361]
recall:    [0.73611111 0.94066456]
f1 score:  [0.70666667 0.94741036]
--------------------------------------------------
              negative    positive
precision: [0.61382114 0.9465154 ]
recall:    [0.69585253 0.92478226]
f1 score:  [0.65226782 0.93552263]
--------------------------------------------------
accuracy: 90.64% (+/- 0.76%)
precision: 81.20

## RandomOverSampler

In [23]:
# create the vectorizer, logistic_regression model
vectorizer = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,3))
lr = LogisticRegression()
# create ROS pipeline
ros_pipeline = make_pipeline(vectorizer, RandomOverSampler(random_state=777), lr)

In [24]:
ros_pipeline

Again there is relatively high precision and recall for positive classes and improved precision and recall for negative classes. The results is comparable to the SMOTE technique

In [25]:
lr_regressors_ros, predictions_ros = logistic_regressor(n_splits = 5, X = df_train.reviews_cleaned, Y = df_train.sentiments_classified, pipeline=ros_pipeline, average_method='macro')

              negative    positive
precision: [0.69387755 0.96197411]
recall:    [0.78341014 0.94066456]
f1 score:  [0.73593074 0.9512    ]
--------------------------------------------------
              negative    positive
precision: [0.68325792 0.94837172]
recall:    [0.69907407 0.94462025]
f1 score:  [0.69107551 0.94649227]
--------------------------------------------------
              negative    positive
precision: [0.6733871  0.96022727]
recall:    [0.77314815 0.93591772]
f1 score:  [0.71982759 0.94791667]
--------------------------------------------------
              negative    positive
precision: [0.70833333 0.96290323]
recall:    [0.78703704 0.94462025]
f1 score:  [0.74561404 0.95367412]
--------------------------------------------------
              negative    positive
precision: [0.66007905 0.9592502 ]
recall:    [0.76958525 0.93190816]
f1 score:  [0.7106383  0.94538153]
--------------------------------------------------
accuracy: 91.37% (+/- 0.52%)
precision: 82.12

In [26]:
for n, lr in enumerate(lr_regressors_ros):
    print('logistic regressor',n, "predicted values for toy dataset:", lr.predict(df_toy))

logistic regressor 0 predicted values for toy dataset: ['neg' 'pos' 'neg' 'pos']
logistic regressor 1 predicted values for toy dataset: ['neg' 'pos' 'neg' 'pos']
logistic regressor 2 predicted values for toy dataset: ['neg' 'pos' 'neg' 'pos']
logistic regressor 3 predicted values for toy dataset: ['neg' 'pos' 'neg' 'pos']
logistic regressor 4 predicted values for toy dataset: ['neg' 'pos' 'neg' 'pos']


# Prediction using LogisticRegression Model and SMOTE

In [29]:
%%time
df_test['sentiments_predicted'] = lr_regressors_smote[0].predict(df_test.reviews_cleaned)

CPU times: user 152 ms, sys: 1.3 ms, total: 154 ms
Wall time: 152 ms


In [30]:
df_test[['reviews', 'sentiments_predicted']][0:10]

Unnamed: 0,reviews,sentiments_predicted
0,I bought 2 sleepers. sleeper had holes in the arm pit area and the other sleeper had a whole where the neck trim should of been sewed on. A real waste of my money,neg
1,"I dare say these are just about the sexiest things I've ever worn. Oh I've had and have G-strings, have some pretty skimpy ones too. But a crotchless G-String, masqurading as a crotchless pantie, what a concept. Try going outside in a short skirt with nothing under it but these. Might as well be walking around naked under that skirt. But then again I've done that. However, wearing these panties slash G-string just seems sexier than naked",pos
2,"everything about the transaction (price, delivery time, quality of item) was great. I wouldn't hesitate to purchase something again from this seller",pos
3,"Not bad for just a shirt. Very durable, and matched my teams colors perfectly. Its just a shirt, but it helped my team and I go on to greatness.....",pos
4,"These are truly wrinkle free and longer than the average womans botton down, which I love!! Overall, these are fabulous shirts and you can't beat the price",pos
5,I love naughty monkey! I'm so happy with their shoes! They don't hurt my feet,pos
6,I fell in love with this boot when I first saw it. It was on another store's website for 159.95. I found them on amazon for 99.95 and just waited it out. About 2 months later the price dropped to 44.95 and the total purchase was less than 50.00 including shipping. The boots were exactly what I expected and I always get complimented on them. Thanks Amazon,pos
7,"These shades are a great buy. Fast shipping, great price, and good quality",pos
8,"There was no picture of this selection, so I guessed it was a ""ringer"" type t-shirt. Instead it was a white shirt with ugly red contrast stitching. I never wore it, but gave it to Goodwill",neg
9,"This leather briefcase was exactly what I was looking for. The leather is smooth and supple and holds up well. It's big enough to accommodate my 13"" MacBook along with an assortment of other papers, iPod, and so forth. I highly recommend it",pos


In [31]:
df_test.to_csv('IE4483_Mini_Project/lr_submission.csv', index=False)

# Exploring incorrectly classified data

We study a few cases of incorrectly classified data

In [32]:
idx = [63, 195]
df_wrong = df_test[df_test.index.isin(idx)]

In [33]:
df_wrong[['reviews', 'sentiments_predicted']]

Unnamed: 0,reviews,sentiments_predicted
63,"These are the BEST panties I've ever had! They don't ride up, they feel like they are not even there and they don't cut into your skin! They are so smooth that my jeans don't pull them up either! If you're tired of digging your panties out, even the victoria secret's barely there panties, buy these! They are much cheaper here on Amazon than at Beall's department store. I paid $11.99 for 3 pair at the store.",neg
195,"These are the best!!! When they say skinny-they mean it. I cannot find any jeans that will fit this tall skinny child, and not look to baggy.(which she hates) We still needed the adjustable waist(believe it or not) But the difference was the slim cut through the hinnie and hips-she looks like a regular little girl, instead of a skinny girl with pants that are way to big",neg


The possible reasoning for this wrong classificationis that the target label (pos or neg) may not have linear correlation with the features (tfidf format of reviews). Instead we choose another model that may work without needing this dependency between the class label and feature \
DecisionTree is another possible model for this problem

In [34]:
def decision_tree(n_splits, X,Y, pipeline, average_method):
    
    # StratifiedKFold, use this as a tool to improve model performance when dealing with imbalance datasets
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=777)
    
    # create the lists to store the different required information
    decision_trees, predictions, accuracy = [], [], []

    for train, test in kfold.split(X, Y):
        
        dt_fit = pipeline.fit(X[train], Y[train])
        prediction = dt_fit.predict(X[test])
        scores = dt_fit.score(X[test], Y[test])
        
        decision_trees.append(dt_fit)
        predictions.append(prediction)
        accuracy.append(scores * 100)
        print('score:', scores)

    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    
    return decision_trees, predictions

In [35]:
from sklearn.tree import DecisionTreeClassifier
vectorizer = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,3))
dt = DecisionTreeClassifier()
tree_pipeline = make_pipeline(vectorizer, SMOTE(random_state=777),dt)

In [36]:
tree_pipeline

In [37]:
decision_trees_smote, predictions_smote = decision_tree(n_splits = 5, X = df_train.reviews_cleaned, Y = df_train.sentiments_classified, pipeline=tree_pipeline, average_method='macro')

score: 0.8365968939905469
score: 0.8168918918918919
score: 0.8256756756756757
score: 0.8297297297297297
score: 0.8216216216216217
accuracy: 82.61% (+/- 0.68%)


## Predicting using DecisionTree + SMOTE

In [45]:
df_test_dt = deepcopy(df_test)
df_test_dt['sentiments_predicted'] = decision_trees_smote[0].predict(df_test_dt.reviews_cleaned)

In [46]:
df_test_dt.to_csv('IE4483_Mini_Project/dt_submission.csv', index=False)

## Check cases where LR model classified reviews correctly and incorrectly

In [38]:
# we choose the decision tree with the best accuracy and check if it correctly predicts the reviews predicted wrongly by the logistic regression model
df_wrong_copy = deepcopy(df_wrong)
df_wrong_copy['sentiments_predicted'] = decision_trees_smote[0].predict(df_wrong_copy.reviews_cleaned)

Here we can see that the reviews are predicted correctly for the 2 case selected

In [39]:
df_wrong_copy

Unnamed: 0,reviews,reviews_cleaned,sentiments_predicted
63,"These are the BEST panties I've ever had! They don't ride up, they feel like they are not even there and they don't cut into your skin! They are so smooth that my jeans don't pull them up either! If you're tired of digging your panties out, even the victoria secret's barely there panties, buy these! They are much cheaper here on Amazon than at Beall's department store. I paid $11.99 for 3 pair at the store.",these be the BEST pantie have ever have they do not ride up they feel like they be not even there and they do not cut into your skin they be so smooth that my jean do not pull they up either if you be tired of dig your pantie out even the victoria secret be barely there pantie buy these they be much cheap here on Amazon than at Beall department store pay for pair at the store,pos
195,"These are the best!!! When they say skinny-they mean it. I cannot find any jeans that will fit this tall skinny child, and not look to baggy.(which she hates) We still needed the adjustable waist(believe it or not) But the difference was the slim cut through the hinnie and hips-she looks like a regular little girl, instead of a skinny girl with pants that are way to big",these be the good when they say skinny they mean it can not find any jean that will fit this tall skinny child and not look to baggywhich she hate we still need the adjustable waistbelieve it or not but the difference be the slim cut through the hinnie and hip she look like regular little girl instead of skinny girl with pant that be way to big,pos


How about if we check out correctly predicted reviews? can the same decision tree also predict the reviews correctly? \
Here we pick one negative review and positive review predicted correctly

In [40]:
df_correct = df_test[0:2]

In [41]:
df_correct[['reviews', 'sentiments_predicted']]

Unnamed: 0,reviews,sentiments_predicted
0,I bought 2 sleepers. sleeper had holes in the arm pit area and the other sleeper had a whole where the neck trim should of been sewed on. A real waste of my money,neg
1,"I dare say these are just about the sexiest things I've ever worn. Oh I've had and have G-strings, have some pretty skimpy ones too. But a crotchless G-String, masqurading as a crotchless pantie, what a concept. Try going outside in a short skirt with nothing under it but these. Might as well be walking around naked under that skirt. But then again I've done that. However, wearing these panties slash G-string just seems sexier than naked",pos


In [42]:
df_correct['sentiments_predicted'] = decision_trees_smote[0].predict(df_correct.reviews_cleaned)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_correct['sentiments_predicted'] = decision_trees_smote[0].predict(df_correct.reviews_cleaned)


Both reviews are also correctly predicted.

In [43]:
df_correct

Unnamed: 0,reviews,reviews_cleaned,sentiments_predicted
0,I bought 2 sleepers. sleeper had holes in the arm pit area and the other sleeper had a whole where the neck trim should of been sewed on. A real waste of my money,buy sleeper sleeper have hole in the arm pit area and the other sleeper have whole where the neck trim should of be sew on real waste of my money,neg
1,"I dare say these are just about the sexiest things I've ever worn. Oh I've had and have G-strings, have some pretty skimpy ones too. But a crotchless G-String, masqurading as a crotchless pantie, what a concept. Try going outside in a short skirt with nothing under it but these. Might as well be walking around naked under that skirt. But then again I've done that. However, wearing these panties slash G-string just seems sexier than naked",dare say these be just about the sexy thing have ever wear oh have have and have string have some pretty skimpy one too but crotchless String masqurade as crotchless pantie what concept try go outside in short skirt with nothing under it but these might as well be walk around naked under that skirt but then again have do that however wear these pantie slash string just seem sexy than naked,pos


Of course the examples shown above is not indicative that the DecisionTree is better than the LogisticRegressor when it comes to predicting the sentiments, but it could be that one model is better than the other for some cases

# Dataset without labels

Without any training labels, we are looking at unsupervised learning models. Logistic Regression is a supervised learning model and will not be a correct model to tackle the problem. \
Instead, an unsupervised algorithm like K-Means clustering may be a good way to tackle the problem

Instead of going through the LR model, we instead change the model to a K-Means model, which only requries the training dataset (reviews processed) and a cluster of 2 (0/1, pos/neg class labels)

## k-means
We note here that the limitations of k-means is that it does not train well with imbalanced dataset (given that we know the dataset we have has more positive classes than negative classes). \
However, without any ratings that indicates the class labels, `LogisticRegression`, which is a supervised model cannot give us any meaningful results. \
On the other hand, unsupervised models like `K-Means` is more suited for the job.

In [44]:
from sklearn.cluster import KMeans
vectorizer = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,3))
tfidf = vectorizer.fit_transform(df_train.reviews_cleaned.values)
kmeans = KMeans(n_clusters=5, init='k-means++').fit(tfidf)

In [45]:
predict_tfidf = vectorizer.transform(list(df_test.reviews_cleaned))
kmeans.predict(predict_tfidf)

array([0, 2, 0, ..., 2, 2, 1], dtype=int32)

In [46]:
df_test_kmeans = deepcopy(df_test)
df_test_kmeans['sentiments_predicted'] = kmeans.predict(predict_tfidf)

In [47]:
df_test_kmeans.sentiments_predicted.value_counts()

0    585
2    496
1    459
3    238
4    73 
Name: sentiments_predicted, dtype: int64

In [48]:
df_test_kmeans[df_test_kmeans['sentiments_predicted'] == 1]

Unnamed: 0,reviews,reviews_cleaned,sentiments_predicted
8,"There was no picture of this selection, so I guessed it was a ""ringer"" type t-shirt. Instead it was a white shirt with ugly red contrast stitching. I never wore it, but gave it to Goodwill",there be no picture of this selection so guess it be ringer type shirt instead it be white shirt with ugly red contrast stitching never wear it but give it to Goodwill,1
10,"Well, I recieved this product as it was one of the only cases made for the Video Ipod when I got mine. As soon as i got it, I put it on.... it was a bit difficult at first, but once in there snug, i have never taken it out. About... 2 days later, my IPod fell out of my jacket pocket and tumbled hard down a flight of CONCRETE stairs. I was sure that it was damaged... but, I opened the case, turned it on, and and was fine.... I was quite relieved. That alone justifies the price of the case. I am proud to say that it is still in there to this day, and the only thing I could have asked for would have been a somewhat tighter fit around the clickwheel, as there is a bit of space for dirt/dust to get into. That and the whole made for the headphones could be a tad bigger.",well recieve this product as it be of the only case make for the Video Ipod when get mine as soon as get it put it on it be bit difficult at but once in there snug have never take it out about day later my IPod fall out of my jacket pocket and tumble hard down flight of concrete stair be sure that it be damage but open the case turn it on and and be fine be quite relieved that alone justify the price of the case be proud to say that it be still in there to this day and the only thing could have ask for would have be somewhat tight fit around the clickwheel as there be bit of space for dirt dust to get into that and the whole make for the headphone could be tad big,1
11,"I'm female, but only buy the men's versions of pants. This is only partly because of how comfortable they are (loose fit means loose with men's stuff, which is what I prefer). I bought this chromium gray pair from M A C Y * S directly, but they weren't any cheaper. I have 2 other pairs of the same pants in khaki and black (also not from A m a z o n).\n\nThese pants are true cargo with the side leg pockets, the flaps on the rear pockets (carpenter pants don't have the closeable flap on back, but still have at least one side pocket). The material is soft cotton. I feel like I'm wearing a pair of pajamas when I wear these and I love that. The pants wrinkle easily, but that's the price you pay for sheer comfort. And the Levi's brand should mean they last ""forever""",be female but only buy the man version of pant this be only partly because of how comfortable they be loose fit mean loose with man stuff which be what prefer buy this chromium gray pair from directly but they be not any cheap have other pair of the same pant in khaki and black also not from these pant be true cargo with the side leg pocket the flap on the rear pocket carpenter pant do not have the closeable flap on back but still have at least side pocket the material be soft cotton feel like be wear pair of pajama when wear these and love that the pant wrinkle easily but that be the price you pay for sheer comfort and the Levi brand should mean they last forever,1
16,I wore this shirt just once. It's almost transparent. Not worth it even if you get it for free,wear this shirt just once it be almost transparent not worth it even if you get it for free,1
17,i have never had an experience on amazon like this before! arrived so quickly that i didnt even believe it had come until i opened the package and saw that it was indeed what i ordered. amazing service!!!!!!!!,have never have an experience on amazon like this before arrive so quickly that do not even believe it have come until open the package and see that it be indeed what order amazing service,1
20,"Decent lightweight jacket for anytime, not sure if it is just my jacekt, but one of my sleeves seems to be sewed slightly skewed so I have to twist it around to get my arm to go in. Could send it back, but not worth the trouble",decent lightweight jacket for anytime not sure if it be just my jacekt but of my sleeve seem to be sew slightly skewed so have to twist it around to get my arm to go in could send it back but not worth the trouble,1
28,"I'm very particular about the feel of a material, especially my sleepwear, and this product if perfect. Its very comfortable, and the material is breathable which is what I like. I do however think that the length could've been a little shorter, but that's a choice thing I guess",be very particular about the feel of material especially my sleepwear and this product if perfect its very comfortable and the material be breathable which be what like do however think that the length could have be little short but that be choice thing guess,1
30,"As a breastfeeding mother, it's important to me that I keep my bras dry and leak-free (not to mention my clothes). I've been using my Lily Padz since my daughter (now nearly 4 weeks old) was a few days old. (We had latch-on problems initially, so I used washable pads until my nipples healed.) I really like this product. It's smooth under clothing, and it clings pretty reliably. It isn't perfect, but it doesn't claim to be. When I have a gushing let-down, nothing will contain it, not Lily Padz nor cloth breast pads. That's a little disappointing, but I suppose it's to be expected. I only wish that the tackiness held up a little better. I have tended lovingly to my Lily Padz, following instructions on washing and drying and what not. In fact, I only wear them during the day. At night, I wash them before I go to bed, and leave them on the domes to dry over night (using the washables at night). Despite following the instructions on Lily Padz maintenance very carefully, I feel they've lost their tackiness, and I worry about their effectiveness. So far, no big problems, but I'd feel more secure if they did not appear to have started to wear out already. Still - they're totally worth the investment, and I may just buy another pair, so I can alternate and keep wear to a minimum",as breastfeed mother it be important to that keep my bras dry and leak free not to mention my clothe have be use my Lily Padz since my daughter now nearly week old be few day old we have latch on problem initially so use washable pad until my nipple heal really like this product it be smooth under clothing and it cling pretty reliably it be not perfect but it do not claim to be when have gushing let down nothing will contain it not Lily Padz nor cloth breast pad that be little disappointing but suppose it be to be expect only wish that the tackiness hold up little well have tend lovingly to my Lily Padz follow instruction on washing and dry and what not in fact only wear they during the day at night wash they before go to bed and leave they on the dome to dry over night use the washable at night despite follow the instruction on Lily Padz maintenance very carefully feel they have lose their tackiness and worry about their effectiveness so far no big problem but would feel more secure if they do not appear to have start to wear out already still they be totally worth the investment and may just buy another pair so can alternate and keep wear to minimum,1
37,"i bought a green ipod scok, thinking the it would be that same as the picture they show at the top of the page, but it ended up having a lighter green than what they showed and what i wanted.",buy green ipod scok think the it would be that same as the picture they show at the top of the page but it end up have light green than what they show and what want,1
41,"Our 6-year old wants to be a policeman when he grows up. He could not wait to get this belt. We had purchased the police uniform earlier. The first time I ordered the belt, it got lost in the mail so he had to wait almost a month to get it. When it finally arrived, he could have wrapped it around his waist 3 times it was so long. It is made of very thin vinyl and does not hold up very well to having holes punched in it. It is less than 4 weeks old and probably won't make it through the rest of the month",our year old want to be policeman when he grow up he could not wait to get this belt we have purchase the police uniform early the time order the belt it got lose in the mail so he have to wait almost month to get it when it finally arrive he could have wrap it around his waist time it be so long it be make of very thin vinyl and do not hold up very well to have hole punch in it it be less than week old and probably will not make it through the rest of the month,1


In [51]:
df_test_kmeans['sentiments_predicted_pos_neg'] = df_test_kmeans['sentiments_predicted'].apply(lambda row: "pos" if row >=2 else "neg")

In [52]:
df_test_kmeans['sentiments_predicted_pos_neg'].value_counts()

neg    1044
pos    807 
Name: sentiments_predicted_pos_neg, dtype: int64

## vader
Another more useful way is to use vader from nltk to predict this reviews. It is a rule based sentiment analysis tool. \
It tells us how positive or negative a sentiment is

In [53]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /media/ntu/volume1/home/s122md304_04/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [54]:
def sentiment_scores(text):
    
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    
    analyzer = SentimentIntensityAnalyzer()
    sentiments_dict = analyzer.polarity_scores(text)
    
    compound_score = sentiments_dict['compound']
    
    if compound_score >= 0.05:
        sentiment_predicted = "pos"
    elif -0.05 < compound_score < 0.05:
        sentiment_predicted = "neu"
    elif compound_score <= -0.05:
        sentiment_predicted = "neg"
    
    return sentiment_predicted

In [55]:
df_test_nltk = deepcopy(df_test)

In [56]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

df_test_nltk['sentiments_predicted'] = df_test_nltk['reviews_cleaned'].apply(lambda row: sentiment_scores(row))
df_test_nltk['neg_score'] = df_test_nltk['reviews_cleaned'].apply(lambda row: analyzer.polarity_scores(row)['neg'])
df_test_nltk['pos_score'] = df_test_nltk['reviews_cleaned'].apply(lambda row: analyzer.polarity_scores(row)['pos'])
df_test_nltk['neu_score'] = df_test_nltk['reviews_cleaned'].apply(lambda row: analyzer.polarity_scores(row)['neu'])

In [57]:
df_test_nltk['sentiments_predicted'].value_counts()

pos    1626
neg    166 
neu    59  
Name: sentiments_predicted, dtype: int64

In [58]:
df_test_nltk[0:10]

Unnamed: 0,reviews,reviews_cleaned,sentiments_predicted,neg_score,pos_score,neu_score
0,I bought 2 sleepers. sleeper had holes in the arm pit area and the other sleeper had a whole where the neck trim should of been sewed on. A real waste of my money,buy sleeper sleeper have hole in the arm pit area and the other sleeper have whole where the neck trim should of be sew on real waste of my money,neg,0.088,0.0,0.912
1,"I dare say these are just about the sexiest things I've ever worn. Oh I've had and have G-strings, have some pretty skimpy ones too. But a crotchless G-String, masqurading as a crotchless pantie, what a concept. Try going outside in a short skirt with nothing under it but these. Might as well be walking around naked under that skirt. But then again I've done that. However, wearing these panties slash G-string just seems sexier than naked",dare say these be just about the sexy thing have ever wear oh have have and have string have some pretty skimpy one too but crotchless String masqurade as crotchless pantie what concept try go outside in short skirt with nothing under it but these might as well be walk around naked under that skirt but then again have do that however wear these pantie slash string just seem sexy than naked,pos,0.033,0.142,0.825
2,"everything about the transaction (price, delivery time, quality of item) was great. I wouldn't hesitate to purchase something again from this seller",everything about the transaction price delivery time quality of item be great would not hesitate to purchase something again from this seller,pos,0.0,0.228,0.772
3,"Not bad for just a shirt. Very durable, and matched my teams colors perfectly. Its just a shirt, but it helped my team and I go on to greatness.....",not bad for just shirt very durable and match my team color perfectly its just shirt but it help my team and go on to greatness,pos,0.0,0.26,0.74
4,"These are truly wrinkle free and longer than the average womans botton down, which I love!! Overall, these are fabulous shirts and you can't beat the price",these be truly wrinkle free and long than the average woman botton down which love overall these be fabulous shirt and you can not beat the price,pos,0.0,0.375,0.625
5,I love naughty monkey! I'm so happy with their shoes! They don't hurt my feet,love naughty monkey be so happy with their shoe they do not hurt my foot,pos,0.0,0.494,0.506
6,I fell in love with this boot when I first saw it. It was on another store's website for 159.95. I found them on amazon for 99.95 and just waited it out. About 2 months later the price dropped to 44.95 and the total purchase was less than 50.00 including shipping. The boots were exactly what I expected and I always get complimented on them. Thanks Amazon,fall in love with this boot when see it it be on another store website for find they on amazon for and just wait it out about month later the price drop to and the total purchase be less than include shipping the boot be exactly what expect and always get compliment on they thank Amazon,pos,0.032,0.202,0.766
7,"These shades are a great buy. Fast shipping, great price, and good quality",these shade be great buy fast shipping great price and good quality,pos,0.0,0.552,0.448
8,"There was no picture of this selection, so I guessed it was a ""ringer"" type t-shirt. Instead it was a white shirt with ugly red contrast stitching. I never wore it, but gave it to Goodwill",there be no picture of this selection so guess it be ringer type shirt instead it be white shirt with ugly red contrast stitching never wear it but give it to Goodwill,neg,0.111,0.0,0.889
9,"This leather briefcase was exactly what I was looking for. The leather is smooth and supple and holds up well. It's big enough to accommodate my 13"" MacBook along with an assortment of other papers, iPod, and so forth. I highly recommend it",this leather briefcase be exactly what be look for the leather be smooth and supple and hold up well it be big enough to accommodate my MacBook along with an assortment of other paper iPod and so forth highly recommend it,pos,0.0,0.117,0.883


# Feature Format Exploration
Here we would like to check out how the TF-IDF feature format compares with the Bag of Words format in terms of resources required and model accuracy \
We utilise the sklearn `CountVectorizer` and `TfidfVectorizer` libraries

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [80]:
%%time
# speed of tfidf based on sklearn library TFidfVectorizer
tfidf_transformer = TfidfVectorizer()
tfidf_reviews = tfidf_transformer.fit_transform(list(df_train['reviews_cleaned']))

CPU times: user 218 ms, sys: 848 µs, total: 219 ms
Wall time: 218 ms


In [84]:
%%time
# speed of bow based on sklearn library CountVectorizer
bow_transformer = CountVectorizer()
bow_reviews = bow_transformer.fit_transform(list(df_train['reviews_cleaned']))

CPU times: user 212 ms, sys: 220 µs, total: 212 ms
Wall time: 239 ms


memory wise the datasets converted has the same amount of memory usage when we check their pandas DataFrame

In [104]:
tfidf_tokens = tfidf_transformer.get_feature_names()
df_tfidfvect = pd.DataFrame(data = tfidf_reviews.toarray(), columns = tfidf_tokens)



In [108]:
df_tfidfvect.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7401 entries, 0 to 7400
Columns: 11312 entries, aa to zyliss
dtypes: float64(11312)
memory usage: 638.7 MB


In [109]:
bow_tokens = bow_transformer.get_feature_names()
df_bowvect = pd.DataFrame(data = bow_reviews.toarray(), columns = bow_tokens)



In [110]:
df_bowvect.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7401 entries, 0 to 7400
Columns: 11312 entries, aa to zyliss
dtypes: int64(11312)
memory usage: 638.7 MB


Here we check the accuracy of using each feature format. We had previously ran the pipeline using Tf-IDF. Let us check out how the BOW feature format stacks up

In [114]:
# create the vectorizer, logistic_regression model
vectorizer = CountVectorizer(stop_words=None, max_features=100000, ngram_range=(1,3))
lr = LogisticRegression(max_iter=200)
# create SMOTE pipeline
smote_pipeline = make_pipeline(vectorizer, SMOTE(random_state=777),lr)

In [115]:
smote_pipeline

We see here that the Tf-IDF format gives us a model with better accuracy

In [116]:
%%time
lr_regressors_smote, predictions_smote = logistic_regressor(n_splits = 5, X = df_train.reviews_cleaned, 
                                                            Y = df_train.sentiments_classified, pipeline=smote_pipeline, 
                                                            average_method='macro')

              negative    positive
precision: [0.59919028 0.94408428]
recall:    [0.68202765 0.92167722]
f1 score:  [0.63793103 0.9327462 ]
--------------------------------------------------
              negative    positive
precision: [0.66972477 0.94453249]
recall:    [0.67592593 0.94303797]
f1 score:  [0.67281106 0.94378464]
--------------------------------------------------
              negative    positive
precision: [0.64125561 0.94192522]
recall:    [0.66203704 0.93670886]
f1 score:  [0.65148064 0.9393098 ]
--------------------------------------------------
              negative    positive
precision: [0.69724771 0.94928685]
recall:    [0.7037037  0.94778481]
f1 score:  [0.70046083 0.94853523]
--------------------------------------------------
              negative    positive
precision: [0.65876777 0.93853428]
recall:    [0.640553   0.94299287]
f1 score:  [0.64953271 0.94075829]
--------------------------------------------------
accuracy: 89.96% (+/- 0.85%)
precision: 79.85

In [None]:
# e) strengths/weakness of model - what are the weaknesses and strengths of logistic regression? link back to cases in test dataset
# f) feature format choice
# g) sentiment classification without labels - unsupervise learning method (use K-Means clustering)
# h) 3 apparoaches to improve noisy data in classification - collect more data, noise filtering, use unsupervise models, use supervise models - decision trees, feature engineering