In [1]:
# initial imports
import io
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import joblib

In [2]:
reviews= pd.read_csv("final.csv", index_col=0)
label = reviews["sentiment"]

# Text Representation

### Tf-Idf

In [24]:
text_tfidf = joblib.load('processed_tfidf.save')

### Doc2Vec

In [3]:
text_d2v = joblib.load('processed_d2v.save')

# Text Classification

### Train/Test Splitting

In [4]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder                   
from sklearn.model_selection import train_test_split             #-- Split Dataset
from sklearn.preprocessing import StandardScaler                 #-- Standard

In [7]:
encoder = LabelEncoder()
encoder.fit(label)
labels = encoder.transform(label)

In [25]:
#TF-IDF
X_trn_tfidf, X_tst_tfidf, y_trn, y_tst = train_test_split(text_tfidf,           #-- dataset
                                                                labels,                     #-- class
                                                                test_size = 0.30,          #-- 30% test set
                                                                random_state = 123)
scalar = StandardScaler(with_mean=False)
X_trn_tfidf_std = scalar.fit_transform(X_trn_tfidf)
X_tst_tfidf_std= scalar.transform(X_tst_tfidf)

In [8]:
#DOC2VEC
X_trn_d2v, X_tst_d2v, y_trn, y_tst = train_test_split(text_d2v,           #-- dataset
                                                                labels,                     #-- class
                                                                test_size = 0.30,          #-- 30% test set
                                                                random_state = 123)

### Support Vector Machines (SVM)

In [9]:
from sklearn.svm import LinearSVC

#### Tf- Idf

In [27]:
clf_tfidf = LinearSVC(C=0.001)

In [28]:
clf_tfidf.fit(X_trn_tfidf_std, y_trn)

In [29]:
preds_tfidf = clf_tfidf.predict(X_tst_tfidf_std)
print(classification_report(y_tst, preds_tfidf, digits=3, target_names=['Positive','Negative']))

              precision    recall  f1-score   support

    Positive      0.764     0.751     0.757     15101
    Negative      0.751     0.765     0.758     14864

    accuracy                          0.758     29965
   macro avg      0.758     0.758     0.758     29965
weighted avg      0.758     0.758     0.758     29965



#### Doc2Vec

In [31]:
clf_d2v = LinearSVC(C=0.001)

In [32]:
clf_d2v.fit(X_trn_d2v, y_trn)

In [33]:
preds_d2v = clf_d2v.predict(X_tst_d2v)
print(classification_report(y_tst, preds_d2v, digits=3, target_names=['Positive','Negative']))

              precision    recall  f1-score   support

    Positive      0.752     0.731     0.741     15101
    Negative      0.734     0.755     0.744     14864

    accuracy                          0.743     29965
   macro avg      0.743     0.743     0.743     29965
weighted avg      0.743     0.743     0.743     29965



### Multilayer Perceptron (MLP)

In [13]:
import tensorflow as tf
from tensorflow.keras import layers
import scipy

#### Tf-Idf

In [30]:
inputs = tf.keras.Input(shape=(1000))

x = layers.Dense(256, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(200, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(160, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(120, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(80, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)

prediction = layers.Dense(1, activation="sigmoid")(x)

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=6)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./checkpoint/best_model.h5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)
model = tf.keras.Model(inputs, prediction)
model.compile(loss="binary_crossentropy",
              optimizer="adam", metrics=["accuracy"], )

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(
    X_trn_tfidf_std, y_trn, test_size=0.2)

In [32]:
history = model.fit(X_train, Y_train, epochs=100,
          callbacks=[model_checkpoint_callback, callback], validation_data=(X_val, Y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [33]:
model.save('./mlp_tfidf_save')



INFO:tensorflow:Assets written to: ./mlp_tfidf_save\assets


INFO:tensorflow:Assets written to: ./mlp_tfidf_save\assets


In [34]:
model = tf.keras.models.load_model('./mlp_tfidf_save')
preds = model.predict(X_tst_tfidf_std)
preds = np.round(preds)
predictions = []

for item in preds:
    predictions.append(int(item[0]))

print(classification_report(y_tst, preds, digits=3, target_names=['Positive','Negative']))

              precision    recall  f1-score   support

    Positive      0.757     0.707     0.731     15101
    Negative      0.721     0.769     0.744     14864

    accuracy                          0.738     29965
   macro avg      0.739     0.738     0.737     29965
weighted avg      0.739     0.738     0.737     29965



#### Doc2Vec

In [14]:
# converti i dati del vettore in un array numpy
X_tr_d2v = np.asarray(X_trn_d2v)

# cambia il tipo di dati dell'array in float16
doc2vec_data = X_tr_d2v.astype(np.float16)

In [15]:
# converti i dati del vettore in un array numpy
X_te_d2v = np.asarray(X_tst_d2v)

# cambia il tipo di dati dell'array in float16
doc2vec_test_data = X_te_d2v.astype(np.float16)    

In [16]:
inputs = tf.keras.Input(shape=(doc2vec_data.shape[1]))

x = layers.Dense(256, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(200, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(160, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(120, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(80, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)

prediction = layers.Dense(1, activation="sigmoid")(x)

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./checkpoint/best_model_bow.h5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)
model = tf.keras.Model(inputs, prediction)
model.compile(loss="binary_crossentropy",
              optimizer="adam", metrics=["accuracy"], )

In [17]:
X_train, X_val, Y_train, Y_val = train_test_split(
    doc2vec_data, y_trn, test_size=0.2)

In [18]:
history = model.fit(X_train, Y_train, epochs=100, batch_size=1024,
                    callbacks=[model_checkpoint_callback, callback], validation_data=(X_val, Y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


In [20]:
model.save('./mlp_d2v_save')



INFO:tensorflow:Assets written to: ./mlp_d2v_save\assets


INFO:tensorflow:Assets written to: ./mlp_d2v_save\assets


In [21]:
model = tf.keras.models.load_model('./mlp_d2v_save')
preds = model.predict(doc2vec_test_data)
preds = np.round(preds)
predictions = []

for item in preds:
    predictions.append(int(item[0]))

print(classification_report(y_tst, preds, digits=3, target_names=['Positive','Negative']))

              precision    recall  f1-score   support

    Positive      0.764     0.741     0.753     15101
    Negative      0.745     0.768     0.756     14864

    accuracy                          0.754     29965
   macro avg      0.755     0.755     0.754     29965
weighted avg      0.755     0.754     0.754     29965



### Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression,SGDClassifier

In [43]:
# training the model
lr = LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

#### Tf-Idf

In [44]:
# fitting the model 
lr_tfidf = lr.fit(X_trn_tfidf_std, y_trn)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [45]:
# predicting the model for tfidf features
lr_tfidf_predict = lr.predict(X_tst_tfidf_std)
print(lr_tfidf_predict)

[1 1 1 ... 1 1 0]


In [46]:
# accuracy score for tfidf features
lr_tfidf_score = accuracy_score(y_tst, lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.7583180377106624


In [47]:
# classification report for tfidf features
lr_tfidf_report = classification_report(y_tst, lr_tfidf_predict,
                                        digits=3, target_names=['Positive','Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive      0.764     0.753     0.758     15101
    Negative      0.753     0.764     0.758     14864

    accuracy                          0.758     29965
   macro avg      0.758     0.758     0.758     29965
weighted avg      0.758     0.758     0.758     29965



#### Doc2Vec

In [48]:
# fitting the model
lr_d2v = lr.fit(X_trn_d2v, y_trn)
print(lr_d2v)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [49]:
# predicting the model
lr_d2v_predict = lr.predict(X_tst_d2v)
print(lr_d2v_predict)

[1 1 1 ... 1 0 0]


In [50]:
# accuracy score
lr_d2v_score = accuracy_score(y_tst, lr_d2v_predict)
print("lr_d2v_score :",lr_d2v_score)

lr_d2v_score : 0.7435007508760221


In [51]:
# classification report  
lr_d2v_report = classification_report(y_tst, lr_d2v_predict,
                                      digits=3, target_names=['Positive','Negative'])
print(lr_d2v_report)


              precision    recall  f1-score   support

    Positive      0.747     0.742     0.745     15101
    Negative      0.740     0.745     0.742     14864

    accuracy                          0.744     29965
   macro avg      0.743     0.744     0.743     29965
weighted avg      0.744     0.744     0.744     29965

