In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Fix the global random seed up front so weight initialization, dropout and
# batch shuffling are repeatable across runs.
set_random_seed(seed=2022)

## Data: Fashion Reviews

- 0: Negative sentiment
- 1: Positive sentiment

In [3]:
# Load the labelled training reviews and preview the first few rows.
df = pd.read_parquet(path='train.parquet')
df.head(n=5)

Unnamed: 0,labels,review
0,0,Odd fit: I wanted to love this sweater but the...
1,1,Very comfy dress: The quality and material of ...
2,0,Fits nicely but fabric a bit thin: I ordered t...
3,1,"Great fit: Love these jeans, fit and style... ..."
4,0,"Stretches out, washes poorly. wish i could ret..."


### Baseline

In [4]:
# Count the reviews in each sentiment class to see how imbalanced the data is.
df.groupby(by='labels').count()

Unnamed: 0_level_0,review
labels,Unnamed: 1_level_1
0,4679
1,15698


If you just labeled all the examples with the most prevalent label, you would get this accuracy, expressed as a fraction (this is the baseline):

In [5]:
# Majority-class baseline: the share of rows carrying the most frequent label.
# `labels.mean()` only equals this when labels are exactly 0/1 AND class 1 is
# the majority; taking the largest normalized class frequency is correct for
# any label encoding and either class balance.
df['labels'].value_counts(normalize=True).max()

0.7703783677675811

## Train Model

Preprocessing

In [6]:
# Bag-of-words vectorizer. Float min_df/max_df are document-frequency
# proportions: keep terms that appear in at least 0.5% and at most 75% of the
# reviews, after dropping English stop words and stripping accents.
cv = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
    max_df=0.75,
    min_df=0.005,
)

In [7]:
# Learn the vocabulary from the training reviews and build the sparse
# document-term count matrix in a single pass.
res = cv.fit_transform(raw_documents=df['review'])
# Vocabulary size; this is also the width of the model's input layer.
print(len(cv.vocabulary_))

766


 A Neural Bag-of-Words Model

In [8]:
# Neural bag-of-words classifier:
#   word-count vector -> dropout -> small ReLU hidden layer -> sigmoid output
# The single sigmoid unit yields the probability of the positive class (label 1).
inputs = tf.keras.Input(shape=(len(cv.vocabulary_),), name='Input')
x = layers.Dropout(0.10)(inputs)
x = layers.Dense(15, activation="relu", kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4))(x)
# Named `outputs` rather than `predictions`: `predictions` is reused later in
# the notebook for the array of holdout scores, and one name should not refer
# to both a symbolic tensor and a NumPy array.
outputs = layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs)

# Compile the model with binary crossentropy loss and an adam optimizer.
opt = optimizers.Adam(learning_rate=0.002)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

2022-07-19 00:20:02.592770: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Train on the densified count matrix. `validation_split` makes Keras hold out
# a fraction (20%) of the training rows for per-epoch validation.
model.fit(
    x=res.toarray(),
    y=df['labels'],
    validation_split=0.2,
    epochs=10,
    batch_size=32,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe71823f880>

## Make Predictions

These are toy examples:

In [10]:
review = [
    "poor fit its baggy in places where it isn't supposed to be.",  # expected: negative
    "love it, very high quality and great value",                   # expected: positive
]
# Vectorize with the already-fitted vocabulary (transform, not fit_transform),
# then score with the trained model.
model.predict(cv.transform(review).toarray())

array([[0.02719706],
       [0.9402847 ]], dtype=float32)

## Evaluate Model On Holdout Set

In [11]:
# Load the holdout set and score it, reusing the vocabulary fitted on the
# training reviews so the feature columns line up with the model's input.
hdf = pd.read_parquet(path='predict.parquet')
labels = hdf['labels']
predictions = model.predict(cv.transform(hdf['review']).toarray())

In [12]:
# Threshold the predicted probabilities at 0.5 to get hard class labels.
accuracy_score(y_true=labels, y_pred=predictions > 0.5)

0.8776501766784452

In [13]:
# ROC AUC is computed from the raw probabilities, so no threshold is applied.
roc_auc_score(y_true=labels, y_score=predictions)

0.9141623123957754