In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Column names and dtypes applied to the raw data right after loading.
# Key order is significant: the keys are assigned positionally as the column
# names of the loaded frame, so they must follow the column order of the file.
raw_dtypes = {
    'fk_customer': 'object',
    'channel': 'object',
    'partner': 'object',
    'device': 'object',
    'first_sale_number': 'object',
    'age': 'int',
    'gender': 'object',
    'state': 'object',
    'expected_delivery_date': 'object',
    'delivered_date': 'object',
    'first_sale_date': 'object',
    'second_sale_date': 'object',
    'has_marketplace': 'object',
    'has_crossdocking': 'object',
    'has_private_label': 'object',
    'has_brands': 'object',
    'gmv': 'float',
}

# Columns that are parsed into datetimes further down (format '%Y%m%d').
__dates_columns = [
    'first_sale_date',
    'second_sale_date',
    'delivered_date',
    'expected_delivery_date',
]

# Expected schema of the training data: the customer key, the model inputs,
# and the two target columns (waiting_time, has_second_purchase).
__training_dtypes = {
    'fk_customer': 'int',
    'channel': 'object',
    'partner': 'object',
    'device': 'object',
    'age': 'int',
    'gender': 'object',
    'state': 'object',
    'has_marketplace': 'bool',
    'has_crossdocking': 'bool',
    'has_private_label': 'bool',
    'has_brands': 'bool',
    'gmv': 'float',
    'days_since_last_bf': 'int',
    'waiting_time': 'int',
    'has_second_purchase': 'bool',
}

# Columns of __training_dtypes that are not model inputs. Excluding them by
# name (instead of slicing by position) keeps feature_columns correct even if
# the schema above is reordered or extended.
__non_feature_columns = {'fk_customer', 'waiting_time', 'has_second_purchase'}

feature_columns = [
    column for column in __training_dtypes if column not in __non_feature_columns
]

# Read the raw parquet file through dask, materialise the result, label the
# columns positionally with the raw_dtypes keys, then cast every column to its
# configured dtype.
df = (
    dd.read_parquet('../ml/input/data/raw/data.parquet', engine='pyarrow')
    .compute()
    .set_axis(list(raw_dtypes), axis=1)
    .astype(raw_dtypes)
)

In [None]:
# Preview the first rows of the freshly loaded, renamed and dtype-cast frame.
df.head()

In [None]:
# Summary statistics of the loaded frame, before any cleaning.
df.describe()

In [None]:
from datetime import datetime, timedelta

# Remember the row count before cleaning so later cells can report how many
# rows were dropped.
n_rows_original = len(df)
print("Number of rows", n_rows_original)

# Missing-value count per column, printed on its own lines below the label.
print("Nan in each columns", df.isna().sum(), sep='\n')

In [None]:
from pandas import to_datetime

n_rows_original = len(df)

# Rows without a second sale would be removed by dropna() below, so their
# second_sale_date is replaced by a placeholder: today's date plus 365 days,
# written in the same 'YYYYMMDD' text form the date parsing below expects.
df['second_sale_date'] = df['second_sale_date'].fillna(
    (datetime.today() + timedelta(days=365)).strftime("%Y%m%d")
)
df = df.dropna()

print("Number of rows removed", n_rows_original - len(df))

# Parse every known date column that is present in the frame.
# NOTE: errors='coerce' turns unparseable values into NaT, and because this
# runs after dropna() such rows remain in df.
for col in (name for name in __dates_columns if name in df.columns):
    df[col] = to_datetime(df[col], format='%Y%m%d', errors='coerce')

In [None]:
from unidecode import unidecode
import pandas as pd
from pandas import to_datetime


# When True, the target frame `y` is built alongside the feature frame `X`.
training = True

# Number of seconds in one day (24 * 60 * 60).
SECONDS_PER_DAY = 86400.0
# Historical, misleading name kept as an alias for backward compatibility:
# the value is seconds per *day*, so dividing a duration in seconds by it
# yields days, not years.
seconds_per_year = SECONDS_PER_DAY

def fix_state_information(state_series):
    """Normalise the text values of a state column.

    Each value is transliterated with ``unidecode``; every occurrence of
    ``b'`` and of ``'`` is then removed and spaces are replaced by
    underscores.

    Parameters
    ----------
    state_series : pandas.Series
        Series of strings (``unidecode`` is applied to every element).

    Returns
    -------
    pandas.Series
        The cleaned series.
    """
    # regex=False pins literal matching: the default of ``regex`` differs
    # between pandas versions, and these patterns are meant as plain text.
    return (
        state_series.apply(unidecode)
        .str.replace("b'", "", regex=False)
        .str.replace("'", "", regex=False)
        .str.replace(" ", "_", regex=False)
    )

def _black_friday_of_year(year):
    """Return Black Friday of ``year``: the day after November's fourth Thursday."""
    november_first = pd.Timestamp(year, 11, 1)
    # Timestamp.weekday(): Monday == 0, ..., Thursday == 3.
    days_to_first_thursday = (3 - november_first.weekday()) % 7
    # First Thursday + 21 days is the fourth Thursday; + 1 more day is the Friday.
    return november_first + pd.Timedelta(days=days_to_first_thursday + 22)


def get_last_black_friday_date(date):
    """Return the most recent Black Friday strictly before ``date``.

    The previous implementation looked dates up in a table covering 2014-2019
    and raised ``KeyError`` for any date in 2014 (no 2013 entry), for any year
    after 2019, and for ``NaT``. Black Friday is now computed for any year;
    results for the years the table covered are unchanged.

    Parameters
    ----------
    date : pandas.Timestamp
        The reference date. ``NaT`` is accepted.

    Returns
    -------
    pandas.Timestamp
        This year's Black Friday when ``date`` is later than it, otherwise the
        previous year's (a date that falls exactly on Black Friday maps to the
        previous year's). ``NaT`` when ``date`` is missing.
    """
    if pd.isna(date):
        return pd.NaT

    current_year_bf = _black_friday_of_year(date.year)
    if date > current_year_bf:
        return current_year_bf
    return _black_friday_of_year(date.year - 1)

def get_days_since_black_friday(date_series):
    """Return, for each date, the days elapsed since the preceding Black Friday.

    Parameters
    ----------
    date_series : pandas.Series
        Series of datetimes.

    Returns
    -------
    pandas.Series
        Float number of days between each date and the value returned for it
        by ``get_last_black_friday_date``.
    """
    last_bf_date = date_series.apply(get_last_black_friday_date)
    # Dividing by a one-day Timedelta converts the difference to days directly,
    # without relying on a separately defined seconds-per-day constant.
    return (date_series - last_bf_date) / pd.Timedelta(days=1)

# Feature frame: clean the state text, add the days elapsed since the last
# Black Friday before the first sale, then keep only the feature columns.
X = df.assign(
    state=lambda frame: fix_state_information(frame.state),
    days_since_last_bf=lambda frame: get_days_since_black_friday(frame.first_sale_date),
).loc[:, feature_columns]

if training:
    # Time between the first and the second sale, in (fractional) days.
    waiting_time = (df.second_sale_date - df.first_sale_date) / pd.Timedelta(days=1)
    # Missing second_sale_date values were replaced earlier in the notebook by
    # a placeholder roughly one year in the future, so a non-null waiting_time
    # alone does not show that a second purchase happened. Only a second sale
    # dated no later than now counts as a real purchase.
    has_second_purchase = waiting_time.notnull() & (
        df.second_sale_date <= pd.Timestamp.today()
    )
    y = pd.DataFrame({"waiting_time": waiting_time, "has_second_purchase": has_second_purchase})
else:
    y = None
    

In [None]:
# Display the feature frame built in the previous cell.
X

In [None]:
# Preview df again, now after the fill / dropna / date-parsing cell.
df.head()

In [None]:
# Summary statistics of df after cleaning.
df.describe()

In [None]:
# Distribution of customer age (the trailing ';' hides the returned object's repr).
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot; confirm the installed seaborn version before migrating.
sns.distplot(df.age);

In [None]:
# Distribution of gmv (the trailing ';' hides the returned object's repr).
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot; confirm the installed seaborn version before migrating.
sns.distplot(df.gmv);

In [None]:
# df has no 'class' column (its columns are the raw_dtypes keys), so grouping
# on it raised a KeyError. Group on the training label from `y` instead and
# plot the two numeric columns. Requires `training` to be True so `y` exists.
(
    df.assign(has_second_purchase=y.has_second_purchase)
    .groupby('has_second_purchase')[['age', 'gmv']]
    .hist()
)

In [None]:
# `data` is not defined in this notebook and neither 'class' nor 'plas' is a
# column of df, so this cell could not run. Overlay the gmv histograms of the
# two label groups instead. Requires `training` to be True so `y` exists.
(
    df.assign(has_second_purchase=y.has_second_purchase)
    .groupby('has_second_purchase')
    .gmv.hist(alpha=0.4)
)

In [None]:
from pandas.plotting import scatter_matrix
# `data` is not defined in this notebook; plot the numeric columns of df.
scatter_matrix(df[['age', 'gmv']], alpha=0.2, figsize=(6, 6), diagonal='kde')