In [61]:
import pandas as pd
import os
import numpy as np

In [62]:
# Training reviews. header=None makes pandas label the columns 0, 1, ...
# NOTE(review): absolute, user-specific path -- consider a configurable data dir.
train_csv_path = '/Users/marikhomeriki/code/marikhomeriki/product_review_analysis/raw_data/train_data/train.csv'
df = pd.read_csv(train_csv_path, header=None)

In [63]:
# Held-out test reviews, read the same way as the training file (no header row assumed).
# NOTE(review): absolute, user-specific path -- consider a configurable data dir.
test_csv_path = '/Users/marikhomeriki/code/marikhomeriki/product_review_analysis/raw_data/test_data/test.csv'
df_test = pd.read_csv(test_csv_path, header=None)

In [64]:
df_test = df.rename({0: 'label', 1: 'text'}, axis = 1)

In [65]:
df = df.rename({0: 'label', 1: 'text'}, axis = 1)

In [66]:
df.head()

Unnamed: 0,label,text
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [67]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def load_data(data, percentage_of_sentences=None):
    
    train_sentences = data['text']
    y_train = data['label']
    
    
    # Take only a given percentage of the entire data
    if percentage_of_sentences is not None:
        assert(percentage_of_sentences> 0 and percentage_of_sentences<=100)
        
        len_train = int(percentage_of_sentences/100*len(train_sentences))
        train_sentences, y_train = train_sentences[:len_train], y_train[:len_train]
    X_train = [text_to_word_sequence(_) for _ in train_sentences]
    
  
    return X_train, y_train



In [68]:
X_train, y_train = load_data(df, percentage_of_sentences=10)

In [69]:
X_test, y_test = load_data(df_test, percentage_of_sentences=10)

In [70]:
len_train = int(10/100*len(df['text']))
X_test = df['text']
X_test = X_test[:len_train]

In [71]:
# Shift the labels down by one (the data shows labels 1 and 2) so they
# become 0/1 targets for the sigmoid / binary_crossentropy model.
y_test_adj = y_test.sub(1)

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert both splits to sequences of integer word ids with the same tokenizer.
X_train_token, X_test_token = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Same padding settings for both splits: fixed length 50, zeros appended at the end.
pad_options = dict(maxlen=50, dtype='float32', padding="post")
X_train_pad = pad_sequences(X_train_token, **pad_options)
X_test_pad = pad_sequences(X_test_token, **pad_options)

# Number of distinct words the tokenizer indexed.
vocab_size = len(tokenizer.word_index)

In [12]:
from tensorflow.keras import Sequential, layers

def init_cnn_model(vocab_size):
    """Build and compile the small 1-D convolutional binary classifier.

    Architecture: Embedding (20-dim, sequences of length 50) -> Conv1D
    (32 filters, kernel size 3) -> Flatten -> Dense(50, relu) ->
    Dense(1, sigmoid). Compiled with binary cross-entropy, Adam and accuracy.

    vocab_size: number of indexed words; the embedding gets vocab_size + 1
        rows (presumably so id 0 stays free for padding -- confirm against
        the tokenizer's indexing).
    Returns the compiled, untrained model.
    """
    cnn_layers = [
        layers.Embedding(input_dim=vocab_size + 1, output_dim=20, mask_zero=True, input_length=50),
        layers.Conv1D(32, 3),
        layers.Flatten(),
        layers.Dense(50, activation="relu"),
        layers.Dense(1, activation='sigmoid'),
    ]
    model = Sequential(cnn_layers)

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

model_cnn = init_cnn_model(vocab_size)

In [13]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model_cnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 20)            1378040   
                                                                 
 conv1d (Conv1D)             (None, 48, 32)            1952      
                                                                 
 flatten (Flatten)           (None, 1536)              0         
                                                                 
 dense (Dense)               (None, 50)                76850     
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 1,456,893
Trainable params: 1,456,893
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Shift the labels down by one (the data shows labels 1 and 2) so they
# become 0/1 targets for the sigmoid / binary_crossentropy model.
y_train_adj = y_train.sub(1)

In [15]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping with patience 5; the best weights seen are restored at the end.
es = EarlyStopping(restore_best_weights=True, patience=5)

# Train for at most 20 epochs, holding out 30% of the training data for validation.
model_cnn.fit(
    x=X_train_pad,
    y=y_train_adj,
    batch_size=32,
    epochs=20,
    validation_split=0.3,
    callbacks=[es],
)


Epoch 1/20


2022-11-29 21:18:12.287539: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


<keras.callbacks.History at 0x309643f40>

In [16]:
# Score the trained model on the padded test sequences against the 0/1 test labels.
res = model_cnn.evaluate(x=X_test_pad, y=y_test_adj)



In [37]:
# Raw sigmoid outputs of the model, one row per test sequence.
y_pred = model_cnn.predict(x=X_test_pad)



In [38]:
# Inspect the raw sigmoid scores before they are turned into labels.
y_pred

array([[0.0350421 ],
       [0.9829947 ],
       [0.00585087],
       ...,
       [0.9244309 ],
       [0.05504898],
       [0.06372894]], dtype=float32)

In [39]:
# Round each sigmoid score to the nearest integer to get a hard 0/1 prediction.
y_pred = y_pred.round()

In [40]:
# Undo the earlier "-1" shift so predictions use the original 1/2 label values.
# NOTE(review): not idempotent -- re-running this cell shifts the values again.
y_pred = np.add(y_pred, 1)

In [41]:
# Wrap the predictions in a DataFrame (single column, labelled 0).
y_pred = pd.DataFrame(data=y_pred)

In [42]:
# Inspect the predictions after rounding and shifting back to the 1/2 labels.
y_pred

Unnamed: 0,0
0,1.0
1,2.0
2,1.0
3,1.0
4,2.0
...,...
55995,1.0
55996,1.0
55997,2.0
55998,1.0


In [73]:
# Promote the review text to a DataFrame so prediction columns can be added next to it.
X_test = pd.DataFrame(data=X_test)

In [74]:
# Attach the predicted label to each review.
# NOTE(review): y_pred is a one-column DataFrame at this point; this presumably
# relies on both frames sharing the default 0..n-1 index -- confirm.
X_test['label_pred'] = y_pred

In [75]:
# Inspect the review text alongside its predicted label.
X_test

Unnamed: 0,text,label_pred
0,"Unfortunately, the frustration of being Dr. Go...",1.0
1,Been going to Dr. Goldberg for over 10 years. ...,2.0
2,I don't know what Dr. Goldberg was like before...,1.0
3,I'm writing this review to give you a heads up...,1.0
4,All the food is great here. But the best thing...,2.0
...,...,...
55995,I took a party of 6 friends and co-workers the...,1.0
55996,Stop by for lunch based on Yelp reviews. Tasty...,1.0
55997,Best persian food ever.\nWOW!\n\nPossibly the ...,2.0
55998,"Habibbbbbb is the rudest, meanest, uglyist, gu...",1.0


In [76]:
def _sentiment_from_label(label):
    """Translate a predicted label into text: 1.0 -> 'negative', anything else -> 'positive'."""
    if label == 1.0:
        return 'negative'
    return 'positive'

# Human-readable sentiment for every predicted label.
X_test['sentiment'] = X_test['label_pred'].map(_sentiment_from_label)

In [77]:
# Inspect the frame now that the sentiment column has been added.
X_test

Unnamed: 0,text,label_pred,sentiment
0,"Unfortunately, the frustration of being Dr. Go...",1.0,negative
1,Been going to Dr. Goldberg for over 10 years. ...,2.0,positive
2,I don't know what Dr. Goldberg was like before...,1.0,negative
3,I'm writing this review to give you a heads up...,1.0,negative
4,All the food is great here. But the best thing...,2.0,positive
...,...,...,...
55995,I took a party of 6 friends and co-workers the...,1.0,negative
55996,Stop by for lunch based on Yelp reviews. Tasty...,1.0,negative
55997,Best persian food ever.\nWOW!\n\nPossibly the ...,2.0,positive
55998,"Habibbbbbb is the rudest, meanest, uglyist, gu...",1.0,negative


In [80]:
# Display the frame without the numeric label column.
# The result is not assigned, so X_test itself keeps 'label_pred'.
X_test.drop('label_pred', axis=1)

Unnamed: 0,text,sentiment
0,"Unfortunately, the frustration of being Dr. Go...",negative
1,Been going to Dr. Goldberg for over 10 years. ...,positive
2,I don't know what Dr. Goldberg was like before...,negative
3,I'm writing this review to give you a heads up...,negative
4,All the food is great here. But the best thing...,positive
...,...,...
55995,I took a party of 6 friends and co-workers the...,negative
55996,Stop by for lunch based on Yelp reviews. Tasty...,negative
55997,Best persian food ever.\nWOW!\n\nPossibly the ...,positive
55998,"Habibbbbbb is the rudest, meanest, uglyist, gu...",negative


In [86]:
# Share of predicted negative vs. positive reviews (fractions sum to 1).
X_test['sentiment'].value_counts(normalize=True)

negative    0.545875
positive    0.454125
Name: sentiment, dtype: float64