In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import path
import re
from sklearn.preprocessing import OrdinalEncoder
import pickle

In [2]:
# Input location: every data file is resolved relative to `root_folder`.
root_folder = "./data"
train_file = path.join(
    root_folder,
    "train_meta.pkl",
)

In [3]:
# Load the pickled training frame from `train_file` and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
original_dataset = pd.read_pickle(train_file)
original_dataset.head()

Unnamed: 0,name,lang,access,agent,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,...,2017-09-01,2017-09-02,2017-09-03,2017-09-04,2017-09-05,2017-09-06,2017-09-07,2017-09-08,2017-09-09,2017-09-10
0,588.0,7.0,0.0,1.0,18.0,11.0,5.0,13.0,14.0,9.0,...,19.0,33.0,33.0,18.0,16.0,27.0,29.0,23.0,54.0,38.0
1,589.0,7.0,0.0,1.0,11.0,14.0,15.0,18.0,11.0,13.0,...,32.0,30.0,11.0,19.0,54.0,25.0,26.0,23.0,13.0,81.0
2,643.0,7.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,4.0,...,6.0,6.0,7.0,2.0,4.0,7.0,3.0,4.0,7.0,6.0
3,683.0,7.0,0.0,1.0,35.0,13.0,10.0,94.0,4.0,26.0,...,7.0,19.0,19.0,9.0,6.0,16.0,19.0,30.0,38.0,4.0
4,699.0,7.0,0.0,1.0,12.0,7.0,4.0,5.0,20.0,8.0,...,13.0,13.0,45.0,4.0,13.0,20.0,18.0,17.0,14.0,11.0


In [4]:
# Per-row median over the daily hit columns.
# The date columns are selected by name (everything that is not a metadata
# column) rather than by position with iloc[:, 4:]. On the first run the result
# is the same, but the cell is now idempotent: when it is re-run, the 'median'
# column added by the previous run is no longer folded into the new median.
non_date_columns = ['name', 'lang', 'access', 'agent', 'median']
date_columns = [col for col in original_dataset.columns if col not in non_date_columns]
original_dataset['median'] = original_dataset[date_columns].median(axis=1)
original_dataset.head()

Unnamed: 0,name,lang,access,agent,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,...,2017-09-02,2017-09-03,2017-09-04,2017-09-05,2017-09-06,2017-09-07,2017-09-08,2017-09-09,2017-09-10,median
0,588.0,7.0,0.0,1.0,18.0,11.0,5.0,13.0,14.0,9.0,...,33.0,33.0,18.0,16.0,27.0,29.0,23.0,54.0,38.0,18.0
1,589.0,7.0,0.0,1.0,11.0,14.0,15.0,18.0,11.0,13.0,...,30.0,11.0,19.0,54.0,25.0,26.0,23.0,13.0,81.0,19.0
2,643.0,7.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,4.0,...,6.0,7.0,2.0,4.0,7.0,3.0,4.0,7.0,6.0,3.0
3,683.0,7.0,0.0,1.0,35.0,13.0,10.0,94.0,4.0,26.0,...,19.0,19.0,9.0,6.0,16.0,19.0,30.0,38.0,4.0,14.0
4,699.0,7.0,0.0,1.0,12.0,7.0,4.0,5.0,20.0,8.0,...,13.0,45.0,4.0,13.0,20.0,18.0,17.0,14.0,11.0,15.0


In [5]:
# Reshape wide -> long: the listed identifier columns are repeated on every
# row, each remaining column label goes into 'date' and its value into 'hits'.
melted = original_dataset.melt(
    id_vars=['name', 'lang', 'access', 'agent', 'median'],
    var_name='date',
    value_name='hits',
)
melted.head()

Unnamed: 0,name,lang,access,agent,median,date,hits
0,588.0,7.0,0.0,1.0,18.0,2015-07-01,18.0
1,589.0,7.0,0.0,1.0,19.0,2015-07-01,11.0
2,643.0,7.0,0.0,1.0,3.0,2015-07-01,1.0
3,683.0,7.0,0.0,1.0,14.0,2015-07-01,35.0
4,699.0,7.0,0.0,1.0,15.0,2015-07-01,12.0


In [6]:
# Convert the 'date' column (the former column labels) to datetimes so the
# `.dt` accessor used by extract_date_features works on it.
melted['date'] = pd.to_datetime(melted['date'])
melted.head()

Unnamed: 0,name,lang,access,agent,median,date,hits
0,588.0,7.0,0.0,1.0,18.0,2015-07-01,18.0
1,589.0,7.0,0.0,1.0,19.0,2015-07-01,11.0
2,643.0,7.0,0.0,1.0,3.0,2015-07-01,1.0
3,683.0,7.0,0.0,1.0,14.0,2015-07-01,35.0
4,699.0,7.0,0.0,1.0,15.0,2015-07-01,12.0


In [7]:
def extract_date_features(df):
    """Add calendar features derived from the ``date`` column, in place.

    Columns added: ``day_of_week`` (Monday=0), ``month``, ``day_of_year``,
    ``quarter`` and ``week_of_year`` (ISO 8601 week number).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime64 ``date`` column. It is modified in place.

    Returns
    -------
    None
    """
    dates = df['date'].dt
    df['day_of_week'] = dates.dayofweek
    df['month'] = dates.month
    df['day_of_year'] = dates.dayofyear
    df['quarter'] = dates.quarter
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement and yields the same ISO week number.
    iso_week = dates.isocalendar().week
    # isocalendar() returns a nullable UInt32 column. Cast back to a plain numpy
    # dtype like the sibling features: int64 normally, float64 (NaN) when the
    # column contains NaT, which is what the old accessor produced.
    df['week_of_year'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

In [8]:
%%time
# Adds the calendar feature columns to `melted` in place; the function returns
# nothing, so this cell has no displayed value besides the timing.
extract_date_features(melted)

CPU times: user 18.4 s, sys: 4.9 s, total: 23.3 s
Wall time: 23.3 s


In [9]:
def extract_lag_features(df):
    """Add per-series lag features to ``df`` in place and drop incomplete rows.

    Rows are grouped by (``name``, ``lang``, ``access``, ``agent``) and two
    columns are added:

    * ``last_hits`` - the ``hits`` value of the previous row in the same group.
    * ``last_diff`` - the change between the two previous rows of the group,
      i.e. ``last_hits`` minus the ``last_hits`` of the row before.

    No sorting is done here: "previous" means the previous row of the group in
    the frame's current row order. Afterwards every row that contains a NaN in
    any column is removed, which includes the leading rows of each group whose
    lag values are undefined.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the four key columns and a ``hits`` column; modified in place.

    Returns
    -------
    None
    """
    page_keys = ['name', 'lang', 'access', 'agent']
    hits_by_page = df.groupby(page_keys)['hits']

    previous_hits = hits_by_page.shift(1)
    df['last_hits'] = previous_hits
    # The group-wise diff of last_hits is hits[t-1] - hits[t-2], which is
    # the one-step lag minus the two-step lag of `hits` within the same group.
    df['last_diff'] = previous_hits - hits_by_page.shift(2)

    df.dropna(inplace=True)

In [10]:
%%time
# Adds last_hits / last_diff to `melted` in place and drops every row that
# contains a NaN. Not idempotent: re-running this cell on the already processed
# frame recomputes the lags on the surviving rows and drops further rows.
extract_lag_features(melted)

CPU times: user 2min 52s, sys: 1min, total: 3min 52s
Wall time: 3min 52s


In [11]:
# Preview the feature frame after the in-place lag extraction; the index is no
# longer contiguous because rows containing NaN were dropped.
melted.head()

Unnamed: 0,name,lang,access,agent,median,date,hits,day_of_week,month,day_of_year,quarter,week_of_year,last_hits,last_diff
126253,4052.0,5.0,0.0,1.0,9.0,2015-07-02,6.0,3,7,183,3,27,4.0,0.0
126737,14413.0,5.0,0.0,1.0,14275.0,2015-07-02,1121.0,3,7,183,3,27,19.0,-1311.0
126752,20018.0,5.0,0.0,1.0,498.0,2015-07-02,385.0,3,7,183,3,27,451.0,250.0
126754,20021.0,5.0,0.0,1.0,38.0,2015-07-02,71.0,3,7,183,3,27,80.0,37.0
126755,20024.0,5.0,0.0,1.0,50.0,2015-07-02,55.0,3,7,183,3,27,38.0,23.0


In [12]:
# Persist the engineered feature frame next to the input data.
features_file = path.join(root_folder, "train_features.pkl")
melted.to_pickle(features_file)