In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
# Labeled training reviews; the path is relative to the notebook's working directory.
TRAIN_PATH = 'train.parquet'
df = pd.read_parquet(TRAIN_PATH)

## Data

- 0: Negative sentiment
- 1: Positive sentiment

In [2]:
# Peek at the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,labels,review
0,0,Odd fit: I wanted to love this sweater but the...
1,1,Very comfy dress: The quality and material of ...
2,0,Fits nicely but fabric a bit thin: I ordered t...
3,1,"Great fit: Love these jeans, fit and style... ..."
4,0,"Stretches out, washes poorly. wish i could ret..."


In [3]:
# Non-null row counts per sentiment label, to show how balanced the classes are.
df.groupby(by='labels').count()

Unnamed: 0_level_0,review
labels,Unnamed: 1_level_1
0,4679
1,15698


If you labeled every example with the most prevalent label, you would get this accuracy (this is the baseline):

In [4]:
# Majority-class baseline: the share of rows carrying the most frequent label.
# Taking the largest class fraction (rather than the mean of the 0/1 labels)
# stays correct even when label 0 is the more common class.
df['labels'].value_counts(normalize=True).max()

0.7703783677675811

## Train Model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words featurizer. Float min_df / max_df are document-frequency
# proportions: terms found in fewer than 0.5% or more than 75% of the
# documents are dropped from the vocabulary.
# NOTE(review): the built-in English stop-word list may remove negations
# such as "not" -- confirm that is acceptable for sentiment classification.
cv = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
    min_df=.005,
    max_df=.75,
)

In [6]:
# Learn the vocabulary from the review text and build the document-term
# count matrix in one pass.
review_text = df['review']
res = cv.fit_transform(review_text)

# Number of retained terms; the model's input layer is sized from this.
vocab_size = len(cv.vocabulary_)
print(vocab_size)

766


In [7]:
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers

# Seed the NumPy and TensorFlow RNGs before any layer is created so that
# weight initialisation and dropout masks are repeatable when the notebook
# is re-run from a fresh kernel.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# A Neural-Bag-Of-Words model: term counts -> dropout -> one small hidden
# layer -> a single sigmoid unit.
inputs = tf.keras.Input(shape=(len(cv.vocabulary_),), name='Input')
# Light dropout applied directly to the input term counts.
x = layers.Dropout(0.10)(inputs)
# Small hidden layer with a combined L1/L2 penalty on its kernel weights.
x = layers.Dense(15, activation="relu", kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4))(x)
# Single sigmoid output in [0, 1]; per the Data section, label 1 is positive.
predictions = layers.Dense(1, activation="sigmoid",)(x)
model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an Adam optimizer.
opt = optimizers.Adam(learning_rate=0.002)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

2022-07-18 11:40:29.254391: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Train on the dense term-count matrix. The returned History is kept so the
# per-epoch loss/accuracy can be inspected later (history.history) and so the
# cell does not end with an address-bearing object repr as its output.
# NOTE(review): Keras takes the validation_split rows from the END of x/y
# before shuffling -- if train.parquet is ordered in any way, shuffle it
# first or the validation numbers will be biased.
history = model.fit(x=res.toarray(),
                    y=df['labels'],
                    batch_size=32, epochs=10, validation_split=.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fccb2e47f70>

In [9]:
# NOTE(review): disabled scaffold that would load 'predict.parquet' and
# vectorize its reviews with the fitted CountVectorizer. No visible cell uses
# `pdf` or `pres`; either re-enable and use it, or remove this cell.
# pdf = pd.read_parquet('predict.parquet')
# pres = cv.transform(pdf['review']).toarray()

In [10]:
# Spot check: vectorize one phrase with the fitted vocabulary and score it.
# The output is the sigmoid score (closer to 1 = positive, closer to 0 = negative).
review = ["poor fit"]
review_counts = cv.transform(review).toarray()
model.predict(review_counts)

array([[0.02496627]], dtype=float32)

In [11]:
# Spot check: vectorize one phrase with the fitted vocabulary and score it.
# The output is the sigmoid score (closer to 1 = positive, closer to 0 = negative).
review = ["love it"]
review_counts = cv.transform(review).toarray()
model.predict(review_counts)

array([[0.87660635]], dtype=float32)