In [1]:
import numpy as np
import pandas as pd
import json
import ast

In [2]:
# Product categories to load; each one is read from ../datasets/json/<category>.json.
category_list = [
    'appliances',
    'arts_crafts_and_sewing',
    'automotive',
    'baby',
    'beauty',
    'cell_phones_and_accessories',
    'clothing_shoes_and_jewelry',
    'electronics',
    'grocery_and_gourmet_food',
    'health_and_personal_care',
    'home_and_kitchen',
    'industrial_and_scientific',
    'musical_instruments',
    'office_products',
    'patio_lawn_and_garden',
    'pet_supplies',
    'software',
    'sports_and_outdoors',
    'tools_and_home_improvement',
    'toys_and_games',
    'video_games',
]
# Columns that lead the combined frame, in this order.
columns = [
    'answer',
    'answerTime',
    'answerType',
    'asin',
    'question',
    'questionType',
    'unixTime'
]

# Read every category file once and collect the frames in a list.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it inside a loop re-copies the growing frame on every iteration; a single
# pd.concat over the collected frames avoids both problems.
category_frames = []
for category in category_list:
    tmp_df = pd.read_json(f"../datasets/json/{category}.json")
    tmp_df['category'] = category  # tag each row with the category it came from
    category_frames.append(tmp_df)

# Row labels are kept as read from each file (no ignore_index), as before.
df = pd.concat(category_frames, sort=False)

# Put the expected columns first, followed by anything else the files carry
# (at minimum 'category'). A column from `columns` that is absent from every
# file is still created, filled with NaN.
ordered_columns = columns + [col for col in df.columns if col not in columns]
df = df.reindex(columns=ordered_columns)

df.head()

Unnamed: 0,answer,answerTime,answerType,asin,question,questionType,unixTime,category
0,I replaced my old one with this without a hitch.,"Jun 27, 2014",?,B00004U9JP,I have a 9 year old Badger 1 that needs replac...,yes/no,1403852000.0,appliances
1,This may help InSinkErator Model BADGER-1: Bad...,"Apr 28, 2014",,B00004U9JP,model number,open-ended,1398668000.0,appliances
2,Plumbing connections will vary with different ...,"Aug 25, 2014",?,B00004U9JP,can I replace Badger 1 1/3 with a Badger 5 1/2...,yes/no,1408950000.0,appliances
3,It does not come with a power cord. It does co...,"Nov 3, 2014",?,B00004U9JP,Does this come with power cord and dishwasher ...,yes/no,1415002000.0,appliances
4,Check if you dropped something inside.Usually ...,"Jun 21, 2014",,B00004U9JP,loud noise inside when turned on. sounds like ...,open-ended,1403334000.0,appliances


In [3]:
# Build one text field per row: the question, a single space, then the answer.
df['combined_text'] = df['question'].add(' ').add(df['answer'])

In [4]:
# Keep only the columns used from here on and switch the question-type
# column to snake_case, in one chained expression.
df = (
    df[['category', 'combined_text', 'question', 'answer', 'questionType', 'asin']]
    .rename(columns={'questionType': 'question_type'})
)

In [5]:
# Number of rows that exactly repeat an earlier row (summing booleans counts the True values).
df.duplicated().sum()

13082

In [6]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [7]:
# Replace the row labels with a fresh 0..n-1 range and discard the old ones.
df = df.reset_index(drop=True)

In [8]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(1383814, 6)

In [9]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1383814 entries, 0 to 1383813
Data columns (total 6 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   category       1383814 non-null  object
 1   combined_text  1383814 non-null  object
 2   question       1383814 non-null  object
 3   answer         1383814 non-null  object
 4   question_type  1383814 non-null  object
 5   asin           1383814 non-null  object
dtypes: object(6)
memory usage: 63.3+ MB


In [10]:
# Compute the summary figures first, then report them.
# dropna=False keeps the count identical to len(unique()), which counts NaN as a value.
n_products = df['asin'].nunique(dropna=False)
n_dup_questions = df['question'].duplicated().sum()
n_dup_answers = df['answer'].duplicated().sum()
# Reported below under the label "full_text"; the column itself is combined_text.
n_dup_combined = df['combined_text'].duplicated().sum()

print(f"Total number of products: {n_products}")
print(f"Total number of duplicated questions: {n_dup_questions}")
print(f"Total number of duplicated answers: {n_dup_answers}")
print(f"Total number of duplicated full_text: {n_dup_combined}")

Total number of products: 182022
Total number of duplicated questions: 184439
Total number of duplicated answers: 221756
Total number of duplicated full_text: 112805


In [11]:
# Rows per category, most frequent first.
df['category'].value_counts()

electronics                    313286
home_and_kitchen               184397
sports_and_outdoors            146547
tools_and_home_improvement     101052
automotive                      89497
health_and_personal_care        80465
cell_phones_and_accessories     80020
patio_lawn_and_garden           59576
toys_and_games                  51355
office_products                 43438
beauty                          42417
pet_supplies                    36602
baby                            28929
musical_instruments             23292
clothing_shoes_and_jewelry      21960
arts_crafts_and_sewing          21261
grocery_and_gourmet_food        19533
industrial_and_scientific       12134
video_games                      9998
software                         9047
appliances                       9008
Name: category, dtype: int64

In [12]:
# Distinct values found in the question_type column.
df['question_type'].unique()

array(['yes/no', 'open-ended'], dtype=object)

In [13]:
# Encode the question type as a binary flag: 1 for 'yes/no', 0 for anything else.
# The comparison is vectorized instead of looping over every row in Python;
# missing values compare as False and so map to 0, as in the list-comprehension form.
df['is_yn'] = df['question_type'].eq('yes/no').astype('int64')

# The text label is redundant once the flag exists; rebind rather than mutate in place.
df = df.drop(columns=['question_type'])

In [14]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,category,combined_text,question,answer,asin,is_yn
0,appliances,I have a 9 year old Badger 1 that needs replac...,I have a 9 year old Badger 1 that needs replac...,I replaced my old one with this without a hitch.,B00004U9JP,1
1,appliances,model number This may help InSinkErator Model ...,model number,This may help InSinkErator Model BADGER-1: Bad...,B00004U9JP,0
2,appliances,can I replace Badger 1 1/3 with a Badger 5 1/2...,can I replace Badger 1 1/3 with a Badger 5 1/2...,Plumbing connections will vary with different ...,B00004U9JP,1
3,appliances,Does this come with power cord and dishwasher ...,Does this come with power cord and dishwasher ...,It does not come with a power cord. It does co...,B00004U9JP,1
4,appliances,loud noise inside when turned on. sounds like ...,loud noise inside when turned on. sounds like ...,Check if you dropped something inside.Usually ...,B00004U9JP,0


In [25]:
# Count rows with a missing category.
# NOTE(review): this cell's execution count (25) is out of sequence with its
# neighbours — re-run the notebook top to bottom to confirm the result.
df['category'].isna().sum()

0

In [15]:
# Export the combined question+answer text with its category and product id.
combined_text = df.loc[:, ['combined_text', 'category', 'asin']].copy()
combined_text.to_csv(
    '../datasets/cleaned/combined_text.csv',
    index=False,  # row labels are not written to the file
)

In [16]:
# Export questions and answers as separate columns, with category, product id
# and the yes/no flag.
qa = df.loc[:, ['question', 'answer', 'category', 'asin', 'is_yn']].copy()
qa.to_csv(
    '../datasets/cleaned/question_answer.csv',
    index=False,  # row labels are not written to the file
)

In [17]:
# Write the complete cleaned frame to disk, without the row labels.
df.to_csv(
    '../datasets/cleaned/full_dataset.csv',
    index=False,
)