In [2]:
from sagemaker import get_execution_role #this contains the permissions to access the bucket
import boto3 #aws python client
import pandas as pd
import io
# Shared S3 client; readTsvDataset and writeChunk both go through it.
s3 = boto3.client('s3') #connect to S3
# Bucket that every S3 read and write in this notebook targets.
bucketName = "cs539-e2023-149b5d57-81a4-4e31-a476-c81e7e07dc33"
# Key prefix for the chunked review files; the upload cell appends 'full/' or
# 'trimmed/' to it before writing.
live_split_prefix = 'split/tsv/reviews/'
# NOTE(review): not referenced in the visible cells; presumably the matching
# prefix for chunked training reviews -- confirm against the rest of the notebook.
train_split_prefix = 'split/tsv/training_reviews/'

In [3]:
def readTsvDataset(filename):
    """Download a TSV object from the notebook's S3 bucket as a DataFrame.

    Args:
        filename: Object name relative to the 'datasets/tsv/' key prefix.

    Returns:
        pandas.DataFrame parsed from the tab-separated object body.
    """
    s3_object = s3.get_object(Bucket=bucketName, Key='datasets/tsv/' + filename)
    # The response body is a stream, which read_csv consumes directly.
    return pd.read_csv(s3_object.get('Body'), sep='\t')

def writeChunk(chunk_df, prefix, chunk_index):
    """Upload one DataFrame chunk to the notebook's S3 bucket as a TSV object.

    The object key is ``prefix + 'chunk_' + NNN + '.tsv'``, where NNN is
    ``chunk_index`` zero-padded to at least three digits.

    Args:
        chunk_df: DataFrame to serialize; its row index is not written.
        prefix: Key prefix (including any trailing '/') for the object.
        chunk_index: Integer position of the chunk, used in the file name.
    """
    padded_index = "{:03d}".format(chunk_index)
    object_key = prefix + 'chunk_' + padded_index + '.tsv'
    # With no target buffer, to_csv returns the serialized text directly.
    tsv_text = chunk_df.to_csv(index=False, sep='\t')
    s3.put_object(Bucket=bucketName, Key=object_key, Body=tsv_text)

In [4]:
import re
# One or more consecutive whitespace characters (spaces, tabs, newlines, ...).
REGEX_WHITESPACE = re.compile(r"\s+")
# Characters deleted outright from review text: commas and hyphens.
REGEX_INVALID_CHARS = re.compile(r'[,\-]')

def cleanText(text):
    """Return ``text`` as a string with whitespace normalized and , - removed.

    The value is first converted with ``str()``, so non-string inputs are
    accepted (a float NaN becomes the literal text 'nan'). Each run of
    whitespace is then collapsed to a single space, and every comma and
    hyphen is deleted. Leading/trailing spaces are not stripped.
    """
    single_spaced = REGEX_WHITESPACE.sub(' ', str(text))
    return REGEX_INVALID_CHARS.sub('', single_spaced)
    
def getType(val):
    """Return the name of ``val``'s type as a string (e.g. 'int', 'str')."""
    value_type = type(val)
    return value_type.__name__

In [13]:
# Load the full review dataset from S3 (object key 'datasets/tsv/' + file name).
df = readTsvDataset('yelp_academic_dataset_review.tsv')
# Alternative input, currently disabled:
#df = readTsvDataset('training_reviews_clean.tsv')
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, which usually
# means the TSV was saved together with its row index. Nothing drops it, so it
# is carried into the 'full' chunks written later -- confirm that is intended.
df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,If you decide to eat here just be aware it is ...,2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,Wow! Yummy different delicious. Our favorite i...,2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...,...
6990275,H0RIamZu0B0Ei0P4aeh3sQ,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5,1,2,1,Latest addition to services from ICCU is Apple...,2014-12-17 21:45:20
6990276,shTPgbgdwTHSuU67mGCmZQ,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5,2,1,2,This spot offers a great affordable east weeke...,2021-03-31 16:55:10
6990277,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30
6990278,i-I4ZOhoX70Nw5H0FwrQUA,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5,1,0,0,For when I'm feeling like ignoring my caloriec...,2022-01-19 18:59:27


In [14]:
# Replace each review body with its cleaned form: cleanText stringifies the
# value, collapses whitespace runs to one space, and deletes commas and hyphens.
# NOTE(review): presumably done so the text fits on one TSV row -- confirm.
df['text'] = df['text'].map(cleanText)

In [15]:
import uuid
# Overwrite review_id with a freshly generated random UUID (uuid4) per row.
# NOTE(review): the ids loaded from the dataset are discarded, and the new ids
# differ on every run of this cell -- confirm nothing downstream needs them stable.
df['review_id'] = [uuid.uuid4() for _ in range(len(df))]

In [12]:
# Upload the review frame to S3 in fixed-size row chunks. Every chunk is written
# twice: once with all columns (under 'full/') and once reduced to the
# review_id and text columns (under 'trimmed/').
chunk_size = 10000
# NOTE(review): never filled or read in this cell; kept only in case another
# cell references it -- remove if unused.
chunks = []
# Ceiling division. The former int(len(df) / chunk_size) + 1 over-counted by
# one whenever len(df) was an exact multiple of chunk_size (including an empty
# frame), which uploaded a trailing chunk containing nothing but the header row.
num_chunks = (len(df) + chunk_size - 1) // chunk_size
for i in range(num_chunks):
    start = i * chunk_size
    end = start + chunk_size
    # Explicit positional row slice; the final chunk may hold fewer than
    # chunk_size rows. reset_index(drop=True) restarts its row index at 0.
    chunk = df.iloc[start:end].reset_index(drop=True)
    trimmed = chunk[['review_id', 'text']]
    writeChunk(chunk, live_split_prefix + 'full/', i)
    writeChunk(trimmed, live_split_prefix + 'trimmed/', i)