In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import sys
import multiprocessing as mp

from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.externals import joblib
from sklearn import metrics
import lightgbm as lgb

sys.path.append('../')
from pipelines import main
import pipeline_config as cfg

DATA_DIR = ''

In [None]:
# Read only the leading rows of each CSV (100 from train.csv, 10000 from
# periods_train.csv); both paths are resolved relative to DATA_DIR.
# NOTE(review): the row caps look like a development sample - confirm before a full run.
main_train = pd.read_csv(os.path.join(DATA_DIR,'train.csv'), nrows=100)
periods_train = pd.read_csv(os.path.join(DATA_DIR,'periods_train.csv'),nrows=10000)

In [None]:
# Preview the first rows of the train sample.
main_train.head()

In [None]:
# Preview the first rows of the periods sample.
periods_train.head()

In [None]:
# Columns of periods_train that hold dates written as YYYY-MM-DD strings.
TIMESTAMP_COLUMNS = ['activation_date','date_from','date_to']

# Parse all of them in one column-wise pass and write the result back in place.
periods_train[TIMESTAMP_COLUMNS] = periods_train[TIMESTAMP_COLUMNS].apply(
    pd.to_datetime, format='%Y-%m-%d'
)

In [None]:
# Preview the periods sample again, after the date-column conversion in the cell above.
periods_train.head()

In [None]:
def extract_period_features(df):
    """Summarise the offer periods in ``df`` into a single feature row.

    The rows are ordered by ``activation_date`` (newest first) and three
    per-period quantities, all in whole days, are derived:

    * ``duration``: ``date_to - date_from``.
    * ``activation_to_start_delta``: ``date_from - activation_date``.
    * ``time_from_last_offer_ended``: ``date_from`` minus the ``date_to`` of
      the next row in that ordering (missing for the last row).

    Each quantity is aggregated with mean, median, max, min and std.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``item_id``, ``activation_date``, ``date_from`` and
        ``date_to``; the three date columns must already be datetime-typed.
        Presumably all rows belong to one item (``item_id`` is read from the
        first row only). The frame is NOT modified.

    Returns
    -------
    pandas.Series
        ``<quantity>_<aggregate>`` entries (ordered aggregate by aggregate),
        followed by ``offer_count``, ``last_offer_activation_date`` and
        ``item_id``. Missing values are replaced with 0.
    """
    measures = ['duration', 'activation_to_start_delta', 'time_from_last_offer_ended']
    aggs = ['mean', 'median', 'max', 'min', 'std']

    # sort_values without inplace returns a new frame, so the caller's data
    # (e.g. the group handed over by groupby.apply) is left untouched.
    periods = df.sort_values('activation_date', ascending=False)
    periods = periods.assign(
        duration=(periods['date_to'] - periods['date_from']).dt.days,
        activation_to_start_delta=(periods['date_from'] - periods['activation_date']).dt.days,
        # shift(-1) pairs every row with the date_to of the row below it,
        # i.e. the offer with the next-older activation date.
        time_from_last_offer_ended=(periods['date_from'] - periods['date_to'].shift(-1)).dt.days,
    )

    stats = periods[measures].agg(aggs)
    # Look every statistic up by label so that names and values cannot drift
    # apart, and so that the lists above can grow without touching a shape.
    row = {
        '{}_{}'.format(col, agg): stats.loc[agg, col]
        for agg in aggs
        for col in measures
    }

    df_feat = pd.DataFrame([row])
    df_feat['offer_count'] = periods.shape[0]
    # NOTE(review): the feature is named after the activation date but is
    # computed from the latest date_to - confirm which one is intended.
    df_feat['last_offer_activation_date'] = pd.to_datetime(periods['date_to'].max(), format='%Y-%m-%d')
    df_feat['item_id'] = periods['item_id'].iloc[0]
    # std of a single offer and the gap of the oldest offer are undefined; report them as 0.
    df_feat = df_feat.fillna(0)
    return df_feat.iloc[0]

In [None]:
%%time

# Build one feature row per item. Every returned row already carries its own
# `item_id` entry, so the group-key index is dropped rather than re-inserted:
# a plain reset_index() would try to add a second `item_id` column and fail.
feat = (
    periods_train
    .groupby('item_id')
    .apply(extract_period_features)
    .reset_index(drop=True)
)

In [None]:
%%time

# Group the period rows by item; the groups are consumed by parallel_apply below.
groups = periods_train.groupby('item_id')

def chunk_groups(groupby_object, chunk_size):
    """Yield the group frames of ``groupby_object`` in lists of ``chunk_size``.

    Groups are emitted in iteration order. Every list holds exactly
    ``chunk_size`` frames except possibly the last one, which holds whatever
    remains. Nothing is yielded when there are no groups.
    """
    total = groupby_object.ngroups
    pending = []
    for position, (_, group_df) in enumerate(groupby_object, start=1):
        pending.append(group_df)
        chunk_is_full = position % chunk_size == 0
        is_last_group = position == total
        if chunk_is_full or is_last_group:
            # Hand the finished batch over and start collecting a fresh one.
            ready, pending = pending, []
            yield ready
            
def parallel_apply(groups, func, num_workers, chunk_size=100):
    """Apply ``func`` to every group in worker processes and stack the results.

    Parameters
    ----------
    groups : pandas GroupBy object
        Source of the per-group frames; split into batches by ``chunk_groups``.
    func : callable
        Called with one group frame at a time. It runs in worker processes, so
        it must be picklable (e.g. defined at module level).
    num_workers : int
        Number of worker processes in the pool.
    chunk_size : int, default 100
        Number of groups submitted to the pool per batch; also the granularity
        of the progress bar.

    Returns
    -------
    pandas.DataFrame
        One row per group, built from the values returned by ``func``, in
        group iteration order.
    """
    # tqdm expects an integer total; np.ceil alone returns a float.
    n_chunks = int(np.ceil(1.0 * groups.ngroups / chunk_size))
    features = []
    # One pool, sized by num_workers, serves every batch: spawning and tearing
    # down a pool per batch only adds process start-up overhead.
    with mp.Pool(num_workers) as executor:
        for groups_chunk in tqdm(chunk_groups(groups, chunk_size), total=n_chunks):
            # map() blocks until the whole batch is done and preserves order.
            features.extend(executor.map(func, groups_chunk))

    return pd.DataFrame(features)

# Compute the per-item period features in parallel; the positional arguments
# 10 and 10000 are parallel_apply's num_workers and chunk_size respectively.
features = parallel_apply(groups, extract_period_features, 10, 10000)

In [None]:
# (rows, columns) of the per-item feature table.
features.shape

In [None]:
# Preview the first rows of the per-item feature table.
features.head()

In [None]:
# Keep only the join key and the activation date from the train sample.
main_features = main_train[['item_id','activation_date']]

In [None]:
# Compare the sizes of the two tables that are merged further down.
features.shape, main_features.shape

In [None]:
# Distinct item ids present in the period features and in the train subset.
f_set, main_set = (
    set(frame['item_id'].unique())
    for frame in (features, main_features)
)

In [None]:
# Number of item ids that occur in both tables.
len(main_set & f_set)

In [None]:
# Attach the period features to the train rows. The join is an inner join
# (the pandas default, spelled out here), so only items present in both
# tables are kept.
full_features = main_features.merge(features, on='item_id', how='inner')
full_features.head()