In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
# Read every raw table into its own DataFrame. The order of the paths below
# matches the order of the names on the left-hand side of the unpacking.
raw_csv_paths = (
    '../data/raw/sales_train.csv',
    '../data/raw/shops.csv',
    '../data/raw/items.csv',
    '../data/raw/item_categories.csv',
    '../data/raw/test.csv',
)
sales, shops, items, item_cat, test = map(pd.read_csv, raw_csv_paths)

### Deduplicating

In [3]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
sales = sales.drop_duplicates(keep='first')

### Dropping non-positive prices and counts

In [4]:
# Keep only rows where both the price and the daily count are strictly
# positive (zeros are dropped along with negatives).
has_positive_price = sales['item_price'] > 0
has_positive_count = sales['item_cnt_day'] > 0
sales = sales[has_positive_price & has_positive_count]

### Dealing with outliers

Some of them have an explanation, some do not. I decided against removing outliers, because 
1) some *are* real samples and we may need to predict targets that are out of the ordinary

2) the percentage of outliers is so small that I doubt it would influence our model's quality anyway

### Possible repetitions in shops:

Жуковский ул. Чкалова 39м? (*id 10*) - Жуковский ул. Чкалова 39м² (*id 11*)

РостовНаДону ТРК "Мегацентр Горизонт" (*id 39*) - РостовНаДону ТРК "Мегацентр Горизонт" Островной (*id 40*)

!Якутск Орджоникидзе, 56 фран (*id 0*) - Якутск Орджоникидзе, 56 (*id 57*)

!Якутск ТЦ "Центральный" фран (*id 1*) - Якутск ТЦ "Центральный" (*id 58*)

I suggest remapping `shop_id` in `sales` so that, for each pair above, all samples reference only one shop of the pair.

In [5]:
# Maps each redundant shop id to the id that is kept for that shop.
shops_mapping = {10: 11, 40: 39, 0: 57, 1: 58}

# Assign the replaced column back instead of calling
# `sales['shop_id'].replace(..., inplace=True)`: an in-place method on a column
# selection is chained assignment, which pandas warns about and which leaves
# `sales` untouched when Copy-on-Write is enabled. `assign` returns a new frame,
# so it also avoids setting a column on the filtered `sales` slice.
sales = sales.assign(shop_id=sales['shop_id'].replace(shops_mapping))

### Special characters in item names

In [6]:
# Preview the first five rows (the default for `head`) to inspect the raw item names.
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [7]:
# Remove any run of special characters from the START of each item name.
# The pattern is a raw string so the backslashes reach the regex engine as
# written; the original non-raw literal relied on invalid escape sequences
# (`\/`, `\[`, `\]`, `\:`), which newer Python versions warn about.
# The character set itself is unchanged. Whitespace is not in the set, so a
# name like "! X" still starts with a space after cleaning.
leading_special_chars = r'^[\\/^.*\[\]~!@#$%^&()_+={}|:;“’<,>?฿]+'

# Vectorised `.str.replace` leaves missing names as NaN instead of raising,
# which the per-row `re.sub` call would do.
items['item_name'] = items['item_name'].str.replace(leading_special_chars, '', regex=True)

In [8]:
# Preview the first five rows (the default for `head`) to check the cleaned item names.
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,ABBYY FineReader 12 Professional Edition Full ...,1,76
2,В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,КОРОБКА (СТЕКЛО) D,4,40


In [9]:
# True if any item name occurs more than once.
items['item_name'].duplicated().any()

True

In [10]:
# Maps a redundant item id to the id that is kept for that item.
items_mapping = {12: 14690}

# Assign the replaced column back instead of calling
# `sales['item_id'].replace(..., inplace=True)`: an in-place method on a column
# selection is chained assignment, which pandas warns about and which leaves
# `sales` untouched when Copy-on-Write is enabled.
sales = sales.assign(item_id=sales['item_id'].replace(items_mapping))

In [11]:
# Apply the same item and shop id remappings to the test set so its ids stay
# consistent with `sales`. The replaced columns are assigned back rather than
# modified through `test[col].replace(..., inplace=True)`, which is chained
# assignment and leaves `test` untouched when Copy-on-Write is enabled.
test = test.assign(
    item_id=test['item_id'].replace(items_mapping),
    shop_id=test['shop_id'].replace(shops_mapping),
)

In [12]:
# True if any (item_id, shop_id) pair occurs more than once in the test set.
test[['item_id', 'shop_id']].duplicated().any()

False

`item_categories` is clean.

### Saving data after ETL to intermediate storage

In [14]:
# Parse the sale dates so they are written back in a single, explicit format.
# NOTE(review): the format string assumes the `date` column is already
# `YYYY-MM-DD`; `pd.to_datetime` raises on values that do not match, so confirm
# this against the raw sales file.
# `assign` is used so the column is not set on the filtered `sales` slice.
sales = sales.assign(date=pd.to_datetime(sales['date'], format='%Y-%m-%d'))

# Persist every cleaned table without the DataFrame index.
sales.to_csv('../data/interim/sales_train_etl.csv', index=False, date_format='%Y-%m-%d')
shops.to_csv('../data/interim/shops_etl.csv', index=False)
items.to_csv('../data/interim/items_etl.csv', index=False)
item_cat.to_csv('../data/interim/item_categories_etl.csv', index=False)
# index=False added for consistency with the other tables; without it the
# file gains an extra unnamed index column.
test.to_csv('../data/interim/test_etl.csv', index=False)