In [3]:
# Sentiment Analysis

# After doing some research, we will want to use TensorFlow, probably through Keras
# (a deep learning API written on top of TensorFlow; it is currently being used
# at the LHC (Large Hadron Collider)).

# We will be classifying text with BERT:
# https://www.tensorflow.org/text/tutorials/classify_text_with_bert

# NOTE: as of 10/19/23 TensorFlow will not run on Windows.
# I recommend running this through JupyterLab on a Linux kernel.


In [4]:
# A dependency of the preprocessing for BERT inputs
!pip install -U "tensorflow-text==2.13.*"



In [5]:
# Use use the AdamW optimizer from https://github.com/tensorflow/models.
!pip install "tf-models-official==2.13.*"



In [6]:
# Import necessary libraries

import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text 
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

# Set TensorFlow's Python logger to the ERROR level so lower-severity
# messages (info/warnings) do not clutter the notebook output.
tf.get_logger().setLevel('ERROR')

In [7]:
# Set up dataset directory structure
# Layout on disk:  ../data/amazon_reviews/{train,test}/{1,2,3,4,5}
# One sub-folder per star rating keeps the data easy to organize and access.

dataset_dir = '../data/amazon_reviews'

# Convenient handles on the 'train' and 'test' directories inside the dataset directory
train_dir = os.path.join(dataset_dir, 'train')
test_dir = os.path.join(dataset_dir, 'test')

# Paths of the folders 1, 2, 3, 4, 5 inside both the train and test directories
train_score_dirs = [os.path.join(train_dir, str(score)) for score in range(1, 6)]
test_score_dirs = [os.path.join(test_dir, str(score)) for score in range(1, 6)]

(one_dir_train, two_dir_train, three_dir_train,
 four_dir_train, five_dir_train) = train_score_dirs
(one_dir_test, two_dir_test, three_dir_test,
 four_dir_test, five_dir_test) = test_score_dirs

# Build every directory that has not been built yet; exist_ok=True makes
# re-running this cell harmless.
for folder in (dataset_dir, train_dir, test_dir, *train_score_dirs, *test_score_dirs):
    os.makedirs(folder, exist_ok=True)

In [8]:
# Loading the dataset
# To run this project, drop the .csv data set (called "Reviews") into the "data" folder.
# The data set is available here:
# https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews

import pandas as pd

# Read the complete export first ...
df_extra_params = pd.read_csv("../data/Reviews.csv")

# ... then keep only the two parameters we need:
#   Score - 1-5 stars
#   Text  - an Amazon review
df = df_extra_params.loc[:, ["Score", "Text"]]
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [14]:
# Copying the data into our directory structure
# We will separate the data into the train and test folders
# Inside the train and test folders we have folders 1, 2, 3, 4, 5
# This corresponds to the "Score" of the review
# Split with 50/50 ratio, randomly divided

# THIS IS A WORK IN PROGRESS

import shutil
import random


def split_train_test(score_df, train_frac=0.5, seed=1):
    """Randomly split one score's reviews into disjoint train and test frames.

    Args:
        score_df: DataFrame holding the reviews of a single "Score" value.
        train_frac: Fraction of rows sampled (without replacement) for training.
        seed: Value passed as ``random_state`` to ``DataFrame.sample`` so the
            split is reproducible.

    Returns:
        A ``(train_df, test_df)`` tuple. ``test_df`` contains every row of
        ``score_df`` whose index label was not sampled into ``train_df``.
    """
    train_df = score_df.sample(frac=train_frac, replace=False, random_state=seed)
    # Dropping the sampled index labels yields the complement directly and
    # leaves the column dtypes untouched (no NaN masking round-trip).
    test_df = score_df.drop(train_df.index)
    return train_df, test_df


def write_reviews(reviews_df, out_dir):
    """Write the "Text" of every row to ``<out_dir>/review<index>.txt``.

    Args:
        reviews_df: DataFrame with a "Text" column; the row's index label is
            used in the file name.
        out_dir: Directory receiving the files; created if it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Rows with a missing Text are skipped: there is nothing to write for them.
    for index, txt_in in reviews_df["Text"].dropna().items():
        path = os.path.join(out_dir, f'review{index}.txt')
        # 'w' (not 'a') so re-running this cell overwrites each file instead of
        # appending the same review a second time; UTF-8 is set explicitly so
        # the output does not depend on the platform's default encoding.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(str(txt_in))


# Separate all data into 5 dfs for scores 1-5 respectively
df_1 = df.loc[df["Score"] == 1]
df_2 = df.loc[df["Score"] == 2]
df_3 = df.loc[df["Score"] == 3]
df_4 = df.loc[df["Score"] == 4]
df_5 = df.loc[df["Score"] == 5]

# Splitting data into test and train sets with 50/50 ratio
df_1_train, df_1_test = split_train_test(df_1)
df_2_train, df_2_test = split_train_test(df_2)
df_3_train, df_3_test = split_train_test(df_3)
df_4_train, df_4_test = split_train_test(df_4)
df_5_train, df_5_test = split_train_test(df_5)

# Converting df values into txt files, putting them in the correct folders
# (train_dir / test_dir come from the directory set-up cell above)
splits_by_score = {
    1: (df_1_train, df_1_test),
    2: (df_2_train, df_2_test),
    3: (df_3_train, df_3_test),
    4: (df_4_train, df_4_test),
    5: (df_5_train, df_5_test),
}
for score, (train_part, test_part) in splits_by_score.items():
    write_reviews(test_part, os.path.join(test_dir, str(score)))
    write_reviews(train_part, os.path.join(train_dir, str(score)))
    
    



In [10]:
# AUTOTUNE = tf.data.AUTOTUNE
# batch_size = 32
# seed = 42

# raw_train_ds = tf.keras.utils.text_dataset_from_directory(
#     '../data/amazon_reviews/train',
#     batch_size=batch_size,
#     validation_split=0.2,
#     subset='training',
#     seed=seed)

# class_names = raw_train_ds.class_names
# train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# val_ds = tf.keras.utils.text_dataset_from_directory(
#     '../data/amazon_reviews/train',
#     batch_size=batch_size,
#     validation_split=0.2,
#     subset='validation',
#     seed=seed)

# val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# test_ds = tf.keras.utils.text_dataset_from_directory(
#     '../data/amazon_reviews/test',
#     batch_size=batch_size)

# test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)