In [1]:
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score as acc
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
load_df = pd.read_csv("amazon_pillows.csv")

In [3]:
load_df.head()

Unnamed: 0,country,countryCode,date,filterByKeyword,filterByRating,isVerified,position,product/listPrice,product/price,productAsin,...,reviewImages/0,reviewImages/1,reviewImages/2,reviewReaction,reviewTitle,reviewUrl,reviewedIn,totalCategoryRatings,totalCategoryReviews,variant
0,United Kingdom,,2024-03-06,,threeStar,True,1,,,B00IMFWKGW,...,,,,,Disappointed,https://www.amazon.co.uk/gp/customer-reviews/R...,Reviewed in the United Kingdom on 6 March 2024,3743,405,Style Name: Super SupportSize Name: 2 Pack
1,United Kingdom,,2024-03-05,,threeStar,True,2,,,B00IMFWKGW,...,,,,,Nothing revoluntionary,https://www.amazon.co.uk/gp/customer-reviews/R...,Reviewed in the United Kingdom on 5 March 2024,3743,405,Style Name: Super SupportSize Name: 2 Pack
2,United Kingdom,,2024-03-03,,threeStar,True,3,,,B00IMFWKGW,...,,,,,Good firm support...,https://www.amazon.co.uk/gp/customer-reviews/R...,Reviewed in the United Kingdom on 3 March 2024,3743,405,Style Name: Super SupportSize Name: 2 Pack
3,United Kingdom,,2024-03-03,,threeStar,True,4,,,B00IMFWKGW,...,,,,,Slumbetdown pillows,https://www.amazon.co.uk/gp/customer-reviews/R...,Reviewed in the United Kingdom on 3 March 2024,3743,405,Style Name: Super SupportSize Name: 2 Pack
4,United Kingdom,,2024-03-02,,threeStar,True,5,,,B00IMFWKGW,...,,,,,Bouncy pillow,https://www.amazon.co.uk/gp/customer-reviews/R...,Reviewed in the United Kingdom on 2 March 2024,3743,405,Style Name: Super Support - Climate ControlSiz...


In [4]:
load_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   country               500 non-null    object 
 1   countryCode           0 non-null      float64
 2   date                  500 non-null    object 
 3   filterByKeyword       0 non-null      float64
 4   filterByRating        500 non-null    object 
 5   isVerified            500 non-null    bool   
 6   position              500 non-null    int64  
 7   product/listPrice     0 non-null      float64
 8   product/price         0 non-null      float64
 9   productAsin           500 non-null    object 
 10  ratingScore           500 non-null    int64  
 11  reviewCategoryUrl     500 non-null    object 
 12  reviewDescription     499 non-null    object 
 13  reviewId              500 non-null    object 
 14  reviewImages/0        20 non-null     object 
 15  reviewImages/1        6

In [5]:
df = load_df.iloc[:, [10, 12]]

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ratingScore        500 non-null    int64 
 1   reviewDescription  499 non-null    object
dtypes: int64(1), object(1)
memory usage: 7.9+ KB


In [7]:
df["reviewDescription"] = df["reviewDescription"].convert_dtypes(convert_string=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["reviewDescription"] = df["reviewDescription"].convert_dtypes(convert_string=True)


In [8]:
scaler = MinMaxScaler(feature_range=(-1,1))
df["ratingSentiment"] = scaler.fit_transform(df[["ratingScore"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ratingSentiment"] = scaler.fit_transform(df[["ratingScore"]])


In [9]:
def classify_sentiment(value):
    if value > 0:
        return 'positive'
    elif value == 0:
        return 'neutral'
    else:
        return 'negative'
df["ratingClassified"] = df["ratingSentiment"].apply(classify_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ratingClassified"] = df["ratingSentiment"].apply(classify_sentiment)


In [10]:
df.dropna(inplace=True)
df.reset_index(inplace=True)
df.drop(columns = ["index"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns = ["index"], inplace=True)


In [11]:
import re
from nltk.stem import WordNetLemmatizer

In [12]:
def preprocess(textdata):
    processedText = []

    # Create Lemmatizer and Stemmer.
    wordLemm = WordNetLemmatizer()

    # Defining regex patterns.
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    alphaPattern      = "[^a-zA-Z0-9]"
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"

    for review in textdata["reviewDescription"]:
        review = review.lower()

        # Replace all URls with 'URL'
        review = re.sub(urlPattern,' URL',review)
        # Replace all non alphabets.
        review = re.sub(alphaPattern, " ", review)
        # Replace 3 or more consecutive letters by 2 letter.
        review = re.sub(sequencePattern, seqReplacePattern, review)

        reviews = ''
        for word in review.split():
            # Checking if the word is a stopword.
            #if word not in stopwordlist:
            if len(word)>1:
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                reviews += (word+' ')

        processedText.append(reviews)

    return processedText

In [13]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tony\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
cleaned_df = preprocess(df)
cleaned_df = pd.DataFrame(cleaned_df, columns = ["cleanDescription"]).reset_index()

In [15]:
df["cleanDesc"] = cleaned_df["cleanDescription"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cleanDesc"] = cleaned_df["cleanDescription"]


In [16]:
df.tail()

Unnamed: 0,ratingScore,reviewDescription,ratingSentiment,ratingClassified,cleanDesc
494,2,I bought these pillows as a side sleeper but t...,-0.5,negative,bought these pillow a side sleeper but they ar...
495,2,Its looks comfort though.,-0.5,negative,it look comfort though
496,2,Advertised as firm but when you lay your head ...,-0.5,negative,advertised a firm but when you lay your head o...
497,2,"Really disappointed with these pillows,went fl...",-0.5,negative,really disappointed with these pillow went fla...
498,2,I like firm pillows for great head neck suppor...,-0.5,negative,like firm pillow for great head neck support p...


## Word2Vec Conversion

In [17]:
import en_core_web_lg
import spacy
nlp = en_core_web_lg.load()

In [18]:
def get_vec(x):
    doc = nlp(x)
    vec = doc.vector
    return vec

In [19]:
df["vec"] = df["cleanDesc"].apply( lambda x: get_vec(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["vec"] = df["cleanDesc"].apply( lambda x: get_vec(x))


In [20]:
df.head()

Unnamed: 0,ratingScore,reviewDescription,ratingSentiment,ratingClassified,cleanDesc,vec
0,3,Pillows have arrived and they seem very flat c...,0.0,neutral,pillow have arrived and they seem very flat co...,"[-1.4690695, 2.7291524, -3.4343462, -0.1550789..."
1,3,"Seemed to suggest they were ever plump, but so...",0.0,neutral,seemed to suggest they were ever plump but soo...,"[-1.8024529, 2.1519244, -2.025902, -0.2916843,..."
2,3,Feel good...,0.0,neutral,feel good,"[0.06918, -0.38524, -3.9240499, -3.42745, 0.59..."
3,3,Expected them to be firm but they are a little...,0.0,neutral,expected them to be firm but they are little t...,"[-1.7878836, 3.2223725, -5.2726912, 0.61334175..."
4,3,"Push your heads up,isn't really pillowy.Side s...",0.0,neutral,push your head up isn really pillowy side slee...,"[-0.6303085, 2.8425322, -4.204991, 1.0299304, ..."


In [21]:
X = df["vec"].to_numpy()
X = X.reshape(-1,1)

In [22]:
X = np.concatenate(np.concatenate(X, axis = 0), axis = 0).reshape(-1,300)

In [23]:
X.shape

(499, 300)

In [24]:
y = df["ratingClassified"]

In [25]:
LE = LabelEncoder()
y = LE.fit_transform(y)
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify= y)

In [27]:
X_train.shape, X_test.shape

((399, 300), (100, 300))

## Train Models

In [28]:
# logistic
clf = LogisticRegression(solver = "liblinear", max_iter=500)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy:', acc(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.78      0.74        40
           1       0.40      0.30      0.34        20
           2       0.83      0.85      0.84        40

    accuracy                           0.71       100
   macro avg       0.64      0.64      0.64       100
weighted avg       0.69      0.71      0.70       100

Accuracy: 0.71


In [29]:
##SVM
svm = SVC(kernel="linear")
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy:', acc(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.78      0.71        40
           1       0.27      0.20      0.23        20
           2       0.87      0.82      0.85        40

    accuracy                           0.68       100
   macro avg       0.60      0.60      0.60       100
weighted avg       0.66      0.68      0.67       100

Accuracy: 0.68


In [30]:
##Random Forest
rf = RandomForestClassifier(n_estimators=500, criterion="entropy")
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy:', acc(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.75      0.65        40
           1       0.50      0.05      0.09        20
           2       0.73      0.82      0.78        40

    accuracy                           0.64       100
   macro avg       0.60      0.54      0.50       100
weighted avg       0.62      0.64      0.59       100

Accuracy: 0.64


## CNN


In [31]:
df.head()

Unnamed: 0,ratingScore,reviewDescription,ratingSentiment,ratingClassified,cleanDesc,vec
0,3,Pillows have arrived and they seem very flat c...,0.0,neutral,pillow have arrived and they seem very flat co...,"[-1.4690695, 2.7291524, -3.4343462, -0.1550789..."
1,3,"Seemed to suggest they were ever plump, but so...",0.0,neutral,seemed to suggest they were ever plump but soo...,"[-1.8024529, 2.1519244, -2.025902, -0.2916843,..."
2,3,Feel good...,0.0,neutral,feel good,"[0.06918, -0.38524, -3.9240499, -3.42745, 0.59..."
3,3,Expected them to be firm but they are a little...,0.0,neutral,expected them to be firm but they are little t...,"[-1.7878836, 3.2223725, -5.2726912, 0.61334175..."
4,3,"Push your heads up,isn't really pillowy.Side s...",0.0,neutral,push your head up isn really pillowy side slee...,"[-0.6303085, 2.8425322, -4.204991, 1.0299304, ..."


In [32]:
from keras.utils import to_categorical
X = df.cleanDesc
y = df.ratingClassified
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)





In [33]:
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [35]:
from gensim.models import Word2Vec
Embedding_dimensions = 100

In [36]:
y.shape

(499,)

In [37]:
Y_train.shape

(399,)

In [38]:
Word2vec_train_data = list(map(lambda x: x.split(), X_train))
word2vec_model = Word2Vec(Word2vec_train_data,
                 vector_size=Embedding_dimensions,
                 workers=8,
                 min_count=5)

In [39]:
max_words = len(set(" ".join(X_train).split()))
max_len = X_train.apply(lambda x: len(x)).max()

max_words, max_len

(1166, 930)

In [40]:
# Deep Learing Preprocessing - Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

In [41]:
tokenizer = Tokenizer(num_words=max_words)

tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_seq = sequence.pad_sequences(X_train_seq, maxlen=max_len)

In [42]:
embedding_matrix = np.zeros((max_words, Embedding_dimensions))

for word, token in tokenizer.word_index.items():
    if word2vec_model.wv.__contains__(word):
        embedding_matrix[token] = word2vec_model.wv.__getitem__(word)

print("Embedding Matrix Shape:", embedding_matrix.shape)

Embedding Matrix Shape: (1166, 100)


In [43]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.25283343,  0.19370513,  0.13495384, ..., -0.13641578,
         0.18095252, -0.14879036],
       [-0.25103199,  0.18948929,  0.13097177, ..., -0.12709336,
         0.16424578, -0.14530817],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [44]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Dense, LSTM, Conv1D, Embedding, BatchNormalization

In [45]:
def getModel():
    embedding_layer = Embedding(input_dim = max_words,
                                output_dim = Embedding_dimensions,
                                weights=[embedding_matrix],
                                input_length=max_len,
                                )

    model = Sequential([
        embedding_layer,
        Bidirectional(LSTM(100, dropout=0.25, return_sequences=True)),
        Bidirectional(LSTM(100, dropout=0.25, return_sequences=True)),
        BatchNormalization(),
        Conv1D(24, 3,padding = "valid", activation='relu'),
        GlobalMaxPool1D(),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax'),
    ],
    name="Sentiment_Model")
    return model

In [46]:
model1 = getModel()
model1.summary()


Model: "Sentiment_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 930, 100)          116600    
                                                                 
 bidirectional (Bidirection  (None, 930, 200)          160800    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 930, 200)          240800    
 onal)                                                           
                                                                 
 batch_normalization (Batch  (None, 930, 200)          800       
 Normalization)                                                  
                                                                 
 conv1d (Conv1D)             (None, 928, 24)           14424     
                                                  

In [47]:
from keras.optimizers import RMSprop, Adam, SGD
optimizer = Adam(0.001)
model1.compile(loss='sparse_categorical_crossentropy', optimizer= optimizer, metrics=['accuracy'])

In [54]:
from tensorflow.keras.callbacks import EarlyStopping
callbacks = EarlyStopping(
    monitor='accuracy',
    min_delta=0.01,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
    start_from_epoch=0
)

In [55]:
history = model1.fit(
    X_train_seq, Y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.20,
    verbose=1,
    callbacks = callbacks
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
