In [None]:
# Problem statement for Large Language Model
# The user review data has to be downloaded from the given repository.
# A model has to be built, using basic steps, that can predict the rating for a given review.
# The embeddings can come from any provider, such as Universal Sentence Encoder or OpenAI.
# The review data handling, the embedding processing, and the model building should be done so that:
#     : memory usage is low
#     : the model can be built and run with the basic resources of a Google Colab notebook, without deploying any high-end server
#     : the time taken by the different steps does not overshoot and become unacceptable
#     : iterative batch processing is preferred
#     : using a cache is preferred
#     : the model is saved for future use, so it does not have to be built again
#     : the embeddings are also saved, to avoid downloading them again and to save cost

In [None]:
import pandas as pd
import os
from google.colab import drive

# Mount Google Drive so the raw review file can be read and the split files persisted.
drive.mount('/content/drive')

# File paths
file_path = '/content/drive/My Drive/CloudxLab/amazon_review_small.txt'
file_prefix = '/content/drive/MyDrive/split_file_part_'
file_suffix = '.txt'

# The raw file has no header row: without header=None pandas silently consumes the
# first review as column labels (one record lost, and the columns end up named after
# that review's values). Name the three columns explicitly instead.
column_names = ['rating', 'review_title', 'review_description']

# Splitting the file into smaller chunks; chunksize makes read_csv return an iterator,
# so only one chunk of the raw file is held in memory at a time while splitting.
chunk_size = 20000
chunks = pd.read_csv(
    file_path,
    chunksize=chunk_size,
    encoding='utf-8',
    header=None,
    names=column_names,
)

file_counter = 1
for chunk in chunks:
    split_file_name = f"{file_prefix}{file_counter}{file_suffix}"
    # Each split file carries the column names as its header row, so it can be
    # read back with the default read_csv settings below.
    chunk.to_csv(split_file_name, index=False, encoding='utf-8')
    file_counter += 1

print(f"Split files are created with prefix: {file_prefix}")
print(f"Total files created: {file_counter - 1}")

# Loading the split files into a list of DataFrames and concatenating them
data = []
for i in range(1, file_counter):
    file_name = f"{file_prefix}{i}{file_suffix}"
    df_chunk = pd.read_csv(file_name, encoding='utf-8')
    data.append(df_chunk)
    print(f"Read file: {file_name} with {len(df_chunk)} records")

# Combine all chunks into a single DataFrame with a fresh 0..n-1 index
df = pd.concat(data, ignore_index=True)

print(f"Total records after concatenation: {len(df)}")

Mounted at /content/drive
Split files are created with prefix: /content/drive/MyDrive/split_file_part_
Total files created: 33
Read file: /content/drive/MyDrive/split_file_part_1.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_part_2.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_part_3.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_part_4.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_part_5.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_part_6.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_part_7.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_part_8.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_part_9.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_part_10.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_part_11.txt with 20000 records
Read file: /content/drive/MyDrive/split_file_p

In [None]:
# Preview the first rows of the concatenated review DataFrame.
df.head()

Unnamed: 0,1,mens ultrasheer,"This model may be ok for sedentary types, but I'm active and get around alot in my job - consistently found these stockings rolled up down by my ankles! Not Good!! Solution: go with the standard compression stocking, 20-30, stock #114622. Excellent support, stays up and gives me what I need. Both pair of these also tore as I struggled to pull them up all the time. Good riddance/bad investment!"
0,4,Surprisingly delightful,This is a fast read filled with unexpected hum...
1,2,"Works, but not as advertised",I bought one of these chargers..the instructio...
2,2,Oh dear,I was excited to find a book ostensibly about ...
3,2,Incorrect disc!,"I am a big JVC fan, but I do not like this mod..."
4,2,Incorrect Disc,"I love the style of this, but after a couple y..."


In [None]:
# (rows, columns) of the concatenated review DataFrame.
df.shape

(649999, 3)

In [None]:
# Another option to get the embeddings and handle the review data to build the model:
# process the input file in batches using HDFS and Spark, and keep storing the data in a Parquet table.
# High-level steps to be followed:

# 1) load the libraries
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col
# import pyarrow as pa
# import pyarrow.parquet as pq

# 2) Spark session
# spark = SparkSession.builder \
#     .appName("Generate Embeddings") \
#     .getOrCreate()

# 3) Load the Universal Sentence Encoder
#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# 4) get the embeddings for a batch
# def get_embeddings(reviews):
#     """Get embeddings for a batch of reviews."""
#     return embed(reviews).numpy()

# 5) function for the batches and keep saving


# 6) we need to define the paths for input and output files

# 7) iterate over the file, and keep adding to the Parquet table at the output path

# stop Spark session

In [None]:
# taken from USE Embedding, sentence encoder
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

# Load the Universal Sentence Encoder (version 4) module from TF Hub.
# The resulting `embed` callable is what the embedding helpers in this notebook invoke
# on batches of review text.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Get embeddings
def get_embeddings(reviews):
    """Encode a batch of review texts with the notebook-level `embed` encoder.

    Args:
        reviews: The texts to encode; passed to `embed` unchanged.

    Returns:
        The encoder output converted through its `.numpy()` method.
    """
    encoded = embed(reviews)
    return encoded.numpy()

In [None]:
# Give the three columns descriptive names (assigned positionally, so the list order
# must match the column order of df), then display the resulting column index.
# NOTE(review): `df` is rebound to the embeddings DataFrame further down
# (the batch_process_embeddings_to_df cell), so this cell is only valid on a fresh run.
col = ["rating", "review_title", "review_description"]
df.columns = col
df.columns

Index(['rating', 'review_title', 'review_description'], dtype='object')

In [None]:
# Shape check after assigning the column names.
df.shape

(649999, 3)

In [None]:
# Pull every review text out of the 'review_description' column as a NumPy array,
# then show the container type to confirm what the embedding helpers will receive.
reviews = df['review_description'].to_numpy()
type(reviews)

numpy.ndarray

In [None]:
# Shape of the review-text array.
reviews.shape

(649999,)

In [None]:
# Extract the rating labels as a NumPy array; it comes from the same DataFrame as
# `reviews`, so the two arrays line up row for row.
ratings = df["rating"].to_numpy()
ratings.shape

(649999,)

In [None]:
# Function to batch process embeddings
# def batch_process_embeddings(texts, batch_size=1000):
#     embeddings = []
#     for i in range(0, len(texts), batch_size):
#         batch_texts = texts[i:i + batch_size]
#         batch_embeddings = embed(batch_texts)
#         embeddings.append(batch_embeddings.numpy())
#     return np.vstack(embeddings)



In [None]:
# embeddings = batch_process_embeddings(reviews, batch_size=1000)

In [None]:
# Number of review texts available for embedding.
len(reviews)

649999

In [None]:
import csv
def batch_process_embeddings_to_csv(texts, batch_size=1000, output_file='/content/drive/My Drive/CloudxLab/embeddings.csv', embed_fn=None):
    """Embed `texts` batch by batch and stream the vectors to a CSV file.

    Only one batch of embeddings is held in memory at a time; each batch is
    written to disk as soon as it has been computed.

    Args:
        texts: Sequence (list or 1-D array) of strings to embed.
        batch_size: Number of texts sent to the encoder per call; must be >= 1.
        output_file: Destination CSV path. The file is opened in 'w' mode, so an
            existing file at that path is overwritten.
        embed_fn: Optional callable mapping a batch of texts to a 2-D array-like,
            or to an object exposing ``.numpy()``. Defaults to the notebook-level
            ``embed`` encoder.

    Returns:
        None. The output file holds a header row ``embedding_0 .. embedding_{d-1}``
        followed by one row per input text.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    # Validate before opening the file so a bad argument cannot truncate existing output.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Resolve the default lazily so `embed` is only required when no encoder is passed.
    encoder = embed if embed_fn is None else embed_fn

    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        header_written = False

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            encoded = encoder(batch_texts)
            batch_embeddings = np.asarray(
                encoded.numpy() if hasattr(encoded, "numpy") else encoded
            )

            if not header_written:
                # Size the header from the actual embedding width so it always
                # matches the data rows, whichever encoder produced them.
                writer.writerow([f'embedding_{j}' for j in range(batch_embeddings.shape[1])])
                header_written = True

            # Previously the batch was computed and then discarded; persist it.
            writer.writerows(batch_embeddings)

        if not header_written:
            # No input at all: keep the original behaviour of emitting the
            # 512-column header on its own.
            writer.writerow([f'embedding_{i}' for i in range(512)])

In [None]:
def batch_process_embeddings_to_df(texts, ratings, batch_size=1000, embed_fn=None):
    """Embed `texts` batch by batch and return the vectors together with their ratings.

    Args:
        texts: Sequence (list or 1-D array) of strings to embed.
        ratings: Sequence of labels with the same length and order as `texts`.
        batch_size: Number of texts sent to the encoder per call; must be >= 1.
        embed_fn: Optional callable mapping a batch of texts to a 2-D array-like,
            or to an object exposing ``.numpy()``. Defaults to the notebook-level
            ``embed`` encoder.

    Returns:
        DataFrame with one row per text: integer-labelled embedding columns
        followed by a 'rating' column, indexed 0..n-1. An empty DataFrame is
        returned when `texts` is empty.

    Raises:
        ValueError: if `batch_size` is smaller than 1, or if `texts` and
            `ratings` differ in length.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(texts) != len(ratings):
        raise ValueError(
            f"texts and ratings must have the same length "
            f"(got {len(texts)} and {len(ratings)})"
        )

    # Resolve the default lazily so `embed` is only required when no encoder is passed.
    encoder = embed if embed_fn is None else embed_fn

    # Collect the per-batch frames and concatenate once at the end; concatenating
    # onto a growing DataFrame inside the loop re-copies all earlier rows each time.
    batch_frames = []
    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        encoded = encoder(texts[start:stop])
        batch_embeddings = np.asarray(
            encoded.numpy() if hasattr(encoded, "numpy") else encoded
        )

        # Create a DataFrame for the current batch
        batch_df = pd.DataFrame(batch_embeddings)
        batch_df['rating'] = ratings[start:stop]
        batch_frames.append(batch_df)

    if not batch_frames:
        return pd.DataFrame()
    return pd.concat(batch_frames, ignore_index=True)

In [None]:
# Process in batches and get the DataFrame
# NOTE(review): this rebinds `df` from the raw review table to the frame of embeddings
# plus 'rating', so earlier cells that expect the review columns only work on a fresh run.
df = batch_process_embeddings_to_df(reviews, ratings, batch_size=1000)

In [None]:
# Save the DataFrame to a CSV file on Drive, without the row index.
# At this point `df` holds the embedding columns plus 'rating'.
df.to_csv('/content/drive/My Drive/CloudxLab/embeddings.csv', index=False)

In [None]:
# Alternative path: embed the reviews again and write them through the CSV helper.
# NOTE(review): the helper's default output_file is the same embeddings.csv path that the
# df.to_csv cell writes, and it opens the file in 'w' mode, so running both cells replaces
# the earlier file - confirm which of the two outputs is intended.
batch_process_embeddings_to_csv(reviews, batch_size=1000)

In [None]:
ls -lrt drive/MyDrive/CloudxLab

total 4260181
-rw------- 1 root root  292412008 Aug  3 14:01 amazon_review_small.txt
-rw------- 1 root root 4070011931 Aug  9 16:23 embeddings.csv
