In [1]:
# Sentiment Analysis

# After doing some research, we will want to use TensorFlow, probably through
# Keras (a deep learning API written on top of TensorFlow; it is currently
# being used at the LHC (Large Hadron Collider)).

# We will be classifying text with BERT:
# https://www.tensorflow.org/text/tutorials/classify_text_with_bert

# NOTE: as of 10/19/23 TensorFlow will not run on Windows.
# I recommend running this through JupyterLab on a Linux kernel.


In [2]:
# A dependency of the preprocessing for BERT inputs
!pip install -U "tensorflow-text==2.13.*"

Collecting tensorflow-text==2.13.*
  Downloading tensorflow_text-2.13.0-cp38-cp38-macosx_10_9_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 9.5 MB/s eta 0:00:01
[?25hCollecting tensorflow<2.14,>=2.13.0
  Downloading tensorflow-2.13.1-cp38-cp38-macosx_10_15_x86_64.whl (216.2 MB)
[K     |████████████████████████████████| 216.2 MB 5.4 kB/s eta 0:00:011            | 51.9 MB 20.6 MB/s eta 0:00:0819.2 MB/s eta 0:00:08     |████████████████▊               | 112.7 MB 9.8 MB/s eta 0:00:11     |██████████████████▋             | 125.7 MB 12.6 MB/s eta 0:00:08████▌         | 151.7 MB 17.8 MB/s eta 0:00:04     |███████████████████████         | 154.7 MB 17.8 MB/s eta 0:00:04
[?25hCollecting tensorflow-hub>=0.8.0
  Downloading tensorflow_hub-0.15.0-py2.py3-none-any.whl (85 kB)
[K     |████████████████████████████████| 85 kB 6.8 MB/s  eta 0:00:01
[?25hCollecting gast<=0.4.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting absl-py>=1.0.0
  Downloading 

In [3]:
# Use use the AdamW optimizer from https://github.com/tensorflow/models.
!pip install "tf-models-official==2.13.*"

Collecting tf-models-official==2.13.*
  Downloading tf_models_official-2.13.2-py2.py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 6.5 MB/s eta 0:00:01
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp38-cp38-macosx_10_9_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 12.8 MB/s eta 0:00:01
[?25hCollecting google-api-python-client>=1.6.7
  Downloading google_api_python_client-2.104.0-py2.py3-none-any.whl (12.6 MB)
[K     |████████████████████████████████| 12.6 MB 19.7 MB/s eta 0:00:01
Collecting py-cpuinfo>=3.3.0
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Collecting gin-config
  Downloading gin_config-0.5.0-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 10.9 MB/s eta 0:00:01
Collecting opencv-python-headless
  Downloading opencv_python_headless-4.8.1.78-cp37-abi3-macosx_10_16_x86_64.whl (54.7 MB)
[K     |████████████████████████████████| 54.7 MB 7.6 MB/s eta 0:00:011    |████  

In [1]:
# Import necessary libraries

import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

# Only let ERROR-level messages through TensorFlow's logger
tf.get_logger().setLevel('ERROR')

In [4]:
# Lay out the on-disk dataset tree:
#   ../data/amazon_reviews/{train,test}/{1,2,3,4,5}
# One sub-folder per review score keeps the data easy to organize and access.

dataset_dir = '../data/amazon_reviews'

# The two split folders that live directly under the dataset directory
train_dir = os.path.join(dataset_dir, 'train')
test_dir = os.path.join(dataset_dir, 'test')

# Per-score folders '1'..'5' inside the train split
(one_dir_train, two_dir_train, three_dir_train,
 four_dir_train, five_dir_train) = [
    os.path.join(train_dir, score_name) for score_name in '12345'
]

# Per-score folders '1'..'5' inside the test split
(one_dir_test, two_dir_test, three_dir_test,
 four_dir_test, five_dir_test) = [
    os.path.join(test_dir, score_name) for score_name in '12345'
]

# Build whatever is missing; folders that already exist are left alone
for _folder in (
    dataset_dir, train_dir, test_dir,
    one_dir_train, two_dir_train, three_dir_train, four_dir_train, five_dir_train,
    one_dir_test, two_dir_test, three_dir_test, four_dir_test, five_dir_test,
):
    os.makedirs(_folder, exist_ok=True)

In [5]:
# Loading the dataset
# To run this project, drop the .csv data set (called "Reviews") into the "data" folder.
# The data set is available here:
# https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews

import pandas as pd

# Full table exactly as stored in the CSV (all columns)
df_extra_params = pd.read_csv("../data/Reviews.csv")

# Keep just the Score and Text columns
# (Score: 1-5 stars, Text: an Amazon review)
df = df_extra_params.loc[:, ["Score", "Text"]]
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [56]:
# Copying the data into our directory structure
# The reviews are separated into the train and test folders.
# Inside the train and test folders we have folders 1, 2, 3, 4, 5,
# which correspond to the "Score" of the review.

# THIS IS A WORK IN PROGRESS

import os
import shutil
import random


def split_train_test(frame, frac=0.5, seed=1):
    """Split `frame` into two disjoint parts.

    Args:
        frame: DataFrame to split. Its index labels must be unique, because
            the test part is built by dropping the sampled labels.
        frac: Fraction of the rows sampled (without replacement) into the
            train part.
        seed: Passed to `DataFrame.sample` as `random_state` so the split is
            the same on every run.

    Returns:
        A `(train, test)` tuple; every row of `frame` is in exactly one part.
    """
    train = frame.sample(frac=frac, replace=False, random_state=seed)
    # Dropping by index label keeps the column dtypes intact; masking the
    # frame and calling dropna() would turn the integer Score into floats.
    test = frame.drop(train.index)
    return train, test


def write_reviews(frame, target_dir):
    """Write each review in `frame["Text"]` to its own file in `target_dir`.

    Files are named `review_<index label>.txt`, opened in write mode and
    encoded as UTF-8, so re-running the cell overwrites files instead of
    appending to them. Rows whose Text is not a string (e.g. missing values)
    are skipped.

    Args:
        frame: DataFrame with a "Text" column.
        target_dir: Folder to write into; created if it does not exist.

    Returns:
        The number of files written.
    """
    os.makedirs(target_dir, exist_ok=True)
    written = 0
    for row_label, review in frame["Text"].items():
        if not isinstance(review, str):
            continue
        file_path = os.path.join(target_dir, f'review_{row_label}.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(review)
        written += 1
    return written


# Separate all data into 5 dfs for scores 1-5 respectively
df_1 = df.loc[df["Score"] == 1]
df_2 = df.loc[df["Score"] == 2]
df_3 = df.loc[df["Score"] == 3]
df_4 = df.loc[df["Score"] == 4]
df_5 = df.loc[df["Score"] == 5]

# Splitting each score's data into train and test with a 50/50 ratio
df_1_train, df_1_test = split_train_test(df_1)
df_2_train, df_2_test = split_train_test(df_2)
df_3_train, df_3_test = split_train_test(df_3)
df_4_train, df_4_test = split_train_test(df_4)
df_5_train, df_5_test = split_train_test(df_5)

print(len(df_3_test))
print(len(df_3_train))

# Write one text file per review into <root>/<split>/<score>/
# NOTE: a 'my_test.txt' left in test/1 by an earlier version of this cell is
# not removed here and should be deleted by hand.
reviews_root = '../data/amazon_reviews'
splits_by_score = {
    '1': (df_1_train, df_1_test),
    '2': (df_2_train, df_2_test),
    '3': (df_3_train, df_3_test),
    '4': (df_4_train, df_4_test),
    '5': (df_5_train, df_5_test),
}
for score_name, (train_part, test_part) in splits_by_score.items():
    write_reviews(train_part, os.path.join(reviews_root, 'train', score_name))
    write_reviews(test_part, os.path.join(reviews_root, 'test', score_name))
    


    
    



21320
21320


KeyboardInterrupt: 

In [20]:
# NOTE(review): this whole cell is disabled (commented out).
# When it was run it failed with
#   NotFoundError: Could not find directory data/amazon_reviews/train
# The dataset folders are created under '../data/amazon_reviews', so the
# paths below now use that prefix. The validation and test datasets used to
# point at 'aclImdb/...', presumably left over from the tutorial - confirm
# before re-enabling.
# TODO: text_dataset_from_directory reads each file in a class folder as one
# sample, so the score folders must hold one text file per review first.

# AUTOTUNE = tf.data.AUTOTUNE
# batch_size = 32
# seed = 42

# raw_train_ds = tf.keras.utils.text_dataset_from_directory(
#     '../data/amazon_reviews/train',
#     batch_size=batch_size,
#     validation_split=0.2,
#     subset='training',
#     seed=seed)

# class_names = raw_train_ds.class_names
# train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# val_ds = tf.keras.utils.text_dataset_from_directory(
#     '../data/amazon_reviews/train',
#     batch_size=batch_size,
#     validation_split=0.2,
#     subset='validation',
#     seed=seed)

# val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# test_ds = tf.keras.utils.text_dataset_from_directory(
#     '../data/amazon_reviews/test',
#     batch_size=batch_size)

# test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

NotFoundError: Could not find directory data/amazon_reviews/train