# Introduction

This notebook creates a pickled dataframe with cleaned data.

Preprocessing:
- Transform raw fields
- Exclude duplicate products

# Setup

In [None]:
import pandas as pd
import numpy as np

## Data import

In [None]:
# Load the raw reviews table from the data directory next to this notebook.
reviews = pd.read_csv(filepath_or_buffer='../data/Reviews.csv')

In [None]:
# The 'Id' column is not used by any later step; remove it in place.
reviews.drop(columns=['Id'], inplace=True)

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
reviews.info()

In [None]:
# Peek at the first two rows.
reviews.iloc[:2]

## Empty fields

In [None]:
# Replace missing summaries with an empty string.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place form is a chained assignment, which newer
# pandas warns about and which leaves `reviews` unchanged under Copy-on-Write.
reviews['Summary'] = reviews['Summary'].fillna('')

## Duplicates

In [None]:
# Some products are duplicated.
# Group the reviews by product and turn each group's underlying numpy array
# (all columns except ProductId) into a string; that string is the key used
# to find duplicated products.
# Note: takes a while
import sys

# numpy abbreviates arrays above `threshold` elements with '...', so large
# groups would be keyed only on their edge rows/columns and different products
# could end up with the same key. Disable summarization while building keys.
with np.printoptions(threshold=sys.maxsize):
    review_groups = reviews.groupby('ProductId').apply(
        # errors='ignore' keeps this working on pandas versions that already
        # exclude the grouping column from the frame passed to apply().
        lambda x: str(x.drop('ProductId', axis=1, errors='ignore').values)
    )

In [None]:
# duplicated() flags every occurrence of a key after the first one, so the
# first product of each identical set is kept and the rest are collected here.
duplicates = review_groups.loc[review_groups.duplicated(keep='first')].index
print(f'Duplicated products: {len(duplicates)}')

In [None]:
# Exclude all reviews that belong to a duplicated product.
# .copy() makes the filtered frame independent of the original, so the column
# assignment performed later does not raise SettingWithCopyWarning.
reviews = reviews[~reviews['ProductId'].isin(duplicates)].copy()

In [None]:
# Number of reviews left after removing duplicated products.
reviews.shape[0]

## Transform fields

In [None]:
# Interpret the numeric 'Time' values as seconds since the Unix epoch and
# store them as datetimes.
reviews['Time'] = pd.to_datetime(reviews['Time'], unit='s', origin='unix')

# Save cleaned data

In [None]:
# Persist the cleaned frame as a pickle alongside the raw data.
reviews.to_pickle('../data/CleanedReviews.pickle')