***
# $\mathbf{\text{Amazon Pet Supplies Dataset}}$<br>
***


# Converting data from txt to dataframe

### Data was taken from http://snap.stanford.edu/data/web-Amazon-links.html

### This dataset consists of reviews from Amazon and spans roughly 13 years, from April 2000 to March 2013. Reviews include product and user information, ratings, and a plaintext review.

### The goal of this step is to parse the raw txt file into a pandas DataFrame and save it in csv format for easier analysis and model building.

In [51]:
# Import libraries
import gzip
import pandas as pd

In [52]:
# Load the raw text file, one list entry per line, with surrounding
# whitespace (including the trailing newline) removed from every line.
data = []
with open('Pet_Supplies.txt') as file:
    data.extend(raw_line.strip() for raw_line in file)

In [62]:
# Check the current format of the data.
# Only the first lines are shown: displaying the whole list would flood the
# notebook output (there is one entry per line of the source file).
data[:20]

['product/productId: B000O1CRYW',
 'product/title: Orbee Tuff Ball Orange - SMALL',
 'product/price: 6.95',
 'review/userId: A2FEQ9XL6ML51C',
 'review/profileName: Just an everyday Dad',
 'review/helpfulness: 1/1',
 'review/score: 5.0',
 'review/time: 1286064000',
 'review/summary: Little Ball, for Little Dogs...',
 'review/text: Great Toy, hard to find! We get ours online here or shipped from a friend in America!Our little Papillion-Yorkie mix loves it. Every night I am in the back garden kicking the ball for him! He destroys tennis balls by chewing off the fluff and our wee dog finds them to big to carry/catch. We stumbled across this in our local petstore and our dog was hooked! Sadly, local pet store no longer imports them.Stars all around for this one!',
 '',
 'product/productId: B000O1CRYW',
 'product/title: Orbee Tuff Ball Orange - SMALL',
 'product/price: 6.95',
 'review/userId: A183LI95B2WNUQ',
 'review/profileName: V. J. Mcmillen "vmcmillen"',
 'review/helpfulness: 1/1',
 're

In [53]:
# Drop the empty strings '' that separate consecutive reviews,
# keeping every other line in its original order.
filter_data = [entry for entry in data if entry != ""]

In [54]:
# Show the filtering result.
# Only the first lines are displayed to keep the notebook output readable.
filter_data[:20]

['product/productId: B000O1CRYW',
 'product/title: Orbee Tuff Ball Orange - SMALL',
 'product/price: 6.95',
 'review/userId: A2FEQ9XL6ML51C',
 'review/profileName: Just an everyday Dad',
 'review/helpfulness: 1/1',
 'review/score: 5.0',
 'review/time: 1286064000',
 'review/summary: Little Ball, for Little Dogs...',
 'review/text: Great Toy, hard to find! We get ours online here or shipped from a friend in America!Our little Papillion-Yorkie mix loves it. Every night I am in the back garden kicking the ball for him! He destroys tennis balls by chewing off the fluff and our wee dog finds them to big to carry/catch. We stumbled across this in our local petstore and our dog was hooked! Sadly, local pet store no longer imports them.Stars all around for this one!',
 'product/productId: B000O1CRYW',
 'product/title: Orbee Tuff Ball Orange - SMALL',
 'product/price: 6.95',
 'review/userId: A183LI95B2WNUQ',
 'review/profileName: V. J. Mcmillen "vmcmillen"',
 'review/helpfulness: 1/1',
 'review/

In [55]:
# Split each line into its field name and its value, collecting the values
# of every field into one list per field name (one future DataFrame column).
#
# Every line has the form "<field name>: <value>". Only the FIRST ':' is the
# separator: values such as titles, summaries and review texts may contain
# ':' themselves, so the split is limited to one cut. An unbounded split
# followed by [1] would silently drop everything after a second colon.
dic = {}

for line in filter_data:
    # Raises ValueError (cannot unpack) if a line contains no ':' at all.
    key, value = line.split(':', 1)
    # Keys are inserted in first-seen order, which fixes the column order.
    dic.setdefault(key, []).append(value)

In [56]:
# Build the dataframe: each key of `dic` becomes a column and its list of
# values becomes the rows of that column.
df = pd.DataFrame(dic)
df.head(5)

Unnamed: 0,product/productId,product/title,product/price,review/userId,review/profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,B000O1CRYW,Orbee Tuff Ball Orange - SMALL,6.95,A2FEQ9XL6ML51C,Just an everyday Dad,1/1,5.0,1286064000,"Little Ball, for Little Dogs...","Great Toy, hard to find! We get ours online h..."
1,B000O1CRYW,Orbee Tuff Ball Orange - SMALL,6.95,A183LI95B2WNUQ,"V. J. Mcmillen ""vmcmillen""",1/1,5.0,1230249600,glow ball,I have bought several of these small Orbee Tu...
2,B000O1CRYW,Orbee Tuff Ball Orange - SMALL,6.95,A1LSSENM0XIQQR,gcoronado4,0/0,3.0,1309046400,Too Big,It is a quality ball but the small is still t...
3,B000O1CRYW,Orbee Tuff Ball Orange - SMALL,6.95,A2E5PZE1PZVK38,jerry,0/0,5.0,1308873600,no good,I gave it 5 stars because my little dog had s...
4,B0002ARHAE,Kent Marine Pro-Clear Freshwater Clarifier,3.73,A3PXLJE4OPIQTY,"M. Thomas ""sea_anemone""",0/0,5.0,1356912000,Best clarifier ever,I've used many products to try and help the w...


In [57]:
# Remove leading/trailing whitespace from every value of every column.
for column in df.columns:
    df[column] = df[column].apply(str.strip)

# Map each column name to the text after its '/'
# (e.g. 'product/title' -> 'title'); used to rename the columns.
dic = {column: column.split('/')[1] for column in df.columns}

In [58]:
# Apply the old-name -> new-name mapping held in `dic` to the column labels.
df = df.rename(columns=dic)

In [61]:
# Preview the first rows of the dataframe
df.head()

Unnamed: 0,productId,title,price,userId,profileName,helpfulness,score,time,summary,text
0,B000O1CRYW,Orbee Tuff Ball Orange - SMALL,6.95,A2FEQ9XL6ML51C,Just an everyday Dad,1/1,5.0,1286064000,"Little Ball, for Little Dogs...","Great Toy, hard to find! We get ours online h..."
1,B000O1CRYW,Orbee Tuff Ball Orange - SMALL,6.95,A183LI95B2WNUQ,"V. J. Mcmillen ""vmcmillen""",1/1,5.0,1230249600,glow ball,I have bought several of these small Orbee Tu...
2,B000O1CRYW,Orbee Tuff Ball Orange - SMALL,6.95,A1LSSENM0XIQQR,gcoronado4,0/0,3.0,1309046400,Too Big,It is a quality ball but the small is still t...
3,B000O1CRYW,Orbee Tuff Ball Orange - SMALL,6.95,A2E5PZE1PZVK38,jerry,0/0,5.0,1308873600,no good,I gave it 5 stars because my little dog had s...
4,B0002ARHAE,Kent Marine Pro-Clear Freshwater Clarifier,3.73,A3PXLJE4OPIQTY,"M. Thomas ""sea_anemone""",0/0,5.0,1356912000,Best clarifier ever,I've used many products to try and help the w...


In [60]:
# Check dataset info: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217170 entries, 0 to 217169
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   productId    217170 non-null  object
 1   title        217170 non-null  object
 2   price        217170 non-null  object
 3   userId       217170 non-null  object
 4   profileName  217170 non-null  object
 5   helpfulness  217170 non-null  object
 6   score        217170 non-null  object
 7   time         217170 non-null  object
 8   summary      217170 non-null  object
 9   text         217170 non-null  object
dtypes: object(10)
memory usage: 16.6+ MB


In [67]:
# Save the dataframe to csv.
# index=False leaves the row index out of the file; the parameter is
# documented as a bool, so False is used rather than relying on None being falsy.
df.to_csv('Pet_Supplies.csv', index=False)