---
### Ref. http://francescopochetti.com/pytorch-for-tabular-data-predicting-nyc-taxi-fares/
---

### Load Library

In [1]:
%matplotlib inline
import pathlib
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 8, 6
import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', 500)
from collections import defaultdict

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

pd.options.mode.chained_assignment = None

from torch.nn import init
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data
from torch.optim import lr_scheduler

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')
import os

### Load Data Set

In [132]:
names = ['key','fare_amount','pickup_datetime','pickup_longitude',
         'pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count']

curr_path = os.getcwd()
curr_path

'/home/colin/work/ny_taxi'

In [133]:
# data Set is on curr_path/data directory
df = pd.read_csv(os.path.join(curr_path,'data','train.csv'), header=None, names = names)
df = df.iloc[1:,]

df.shape
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


(14012936, 8)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
1,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
2,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
3,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
4,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
5,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [135]:
# To handle dataset and modeling, sampling of total data set
df_sample = df.sample(frac = 0.05, replace = False, random_state=777).reset_index(drop=True)
df_sample.shape
df_sample.head()

df_sample.to_csv(os.path.join(curr_path,'data','train_sample.csv'))

(700647, 8)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-02-27 00:40:00.000000183,35.3,2011-02-27 00:40:00 UTC,-73.9811,40.7743,-74.0022,40.6219,1
1,2010-04-10 18:57:00.000000184,10.1,2010-04-10 18:57:00 UTC,-73.9559,40.7141,-73.9854,40.6889,2
2,2013-10-06 17:13:30.0000001,4.0,2013-10-06 17:13:30 UTC,-73.9766,40.7857,-73.971,40.7932,1
3,2009-01-21 19:53:00.000000116,5.7,2009-01-21 19:53:00 UTC,-73.999,40.724,-74.0105,40.7187,5
4,2010-10-16 00:39:00.000000188,18.1,2010-10-16 00:39:00 UTC,-73.9827,40.7387,-73.9705,40.7884,2


### Data Preprocessing

In [136]:
df = pd.read_csv(os.path.join(curr_path,'data','train_sample.csv'), header=0)
df = df.iloc[:,1:]

df.shape
df.head()

(700647, 8)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-02-27 00:40:00.000000183,35.3,2011-02-27 00:40:00 UTC,-73.981075,40.774302,-74.002175,40.62195,1.0
1,2010-04-10 18:57:00.000000184,10.1,2010-04-10 18:57:00 UTC,-73.955883,40.71409,-73.985445,40.688862,2.0
2,2013-10-06 17:13:30.0000001,4.0,2013-10-06 17:13:30 UTC,-73.976565,40.785653,-73.97099,40.793163,1.0
3,2009-01-21 19:53:00.000000116,5.7,2009-01-21 19:53:00 UTC,-73.999023,40.723962,-74.01055,40.718733,5.0
4,2010-10-16 00:39:00.000000188,18.1,2010-10-16 00:39:00 UTC,-73.98274,40.738695,-73.970475,40.788387,2.0


In [137]:
df.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      float64
dtype: object

In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700647 entries, 0 to 700646
Data columns (total 8 columns):
key                  700647 non-null object
fare_amount          700647 non-null float64
pickup_datetime      700647 non-null object
pickup_longitude     700647 non-null float64
pickup_latitude      700647 non-null float64
dropoff_longitude    700646 non-null float64
dropoff_latitude     700646 non-null float64
passenger_count      700647 non-null float64
dtypes: float64(6), object(2)
memory usage: 42.8+ MB


In [139]:
df.passenger_count.describe()
df.passenger_count.quantile([.85, .99])

count    700647.000000
mean          1.685271
std           1.332619
min           0.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         208.000000
Name: passenger_count, dtype: float64

0.85    3.0
0.99    6.0
Name: passenger_count, dtype: float64

In [140]:
df.fare_amount.describe()
df.fare_amount.quantile([.85, .99])

count    700647.000000
mean         11.342687
std           9.836628
min         -52.000000
25%           6.000000
50%           8.500000
75%          12.500000
max         500.000000
Name: fare_amount, dtype: float64

0.85    16.5
0.99    53.0
Name: fare_amount, dtype: float64

#### 1) split pickup_datetime into small parts

In [141]:
col = 'pickup_datetime'
df[col] = pd.to_datetime(df.pickup_datetime, infer_datetime_format=True)
prefix = 'pickup'

In [142]:
attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
        'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
attr = attr + ['Hour', 'Minute', 'Second']

In [143]:
for n in attr: df[prefix + n] = getattr(df[col].dt, n.lower())
df[prefix + 'Elapsed'] = df[col].astype(np.int64) // 10 ** 9
df.drop(col, axis=1, inplace=True)

In [144]:
df.head()

Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickupYear,pickupMonth,pickupWeek,pickupDay,pickupDayofweek,pickupDayofyear,pickupIs_month_end,pickupIs_month_start,pickupIs_quarter_end,pickupIs_quarter_start,pickupIs_year_end,pickupIs_year_start,pickupHour,pickupMinute,pickupSecond,pickupElapsed
0,2011-02-27 00:40:00.000000183,35.3,-73.981075,40.774302,-74.002175,40.62195,1.0,2011,2,8,27,6,58,False,False,False,False,False,False,0,40,0,1298767200
1,2010-04-10 18:57:00.000000184,10.1,-73.955883,40.71409,-73.985445,40.688862,2.0,2010,4,14,10,5,100,False,False,False,False,False,False,18,57,0,1270925820
2,2013-10-06 17:13:30.0000001,4.0,-73.976565,40.785653,-73.97099,40.793163,1.0,2013,10,40,6,6,279,False,False,False,False,False,False,17,13,30,1381079610
3,2009-01-21 19:53:00.000000116,5.7,-73.999023,40.723962,-74.01055,40.718733,5.0,2009,1,4,21,2,21,False,False,False,False,False,False,19,53,0,1232567580
4,2010-10-16 00:39:00.000000188,18.1,-73.98274,40.738695,-73.970475,40.788387,2.0,2010,10,41,16,5,289,False,False,False,False,False,False,0,39,0,1287189540


#### 2) caclucate haversine_distance

In [145]:
def haversine_distance(df, start_lat, end_lat, start_lng, end_lng, prefix):
    """
    calculates haversine distance between 2 sets of GPS coordinates in df
    """
    R = 6371  #radius of earth in kilometers
       
    phi1 = np.radians(df[start_lat])
    phi2 = np.radians(df[end_lat])
    
    delta_phi = np.radians(df[end_lat]-df[start_lat])
    delta_lambda = np.radians(df[end_lng]-df[start_lng])
    
        
    a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (R * c) #in kilometers
    df[prefix+'distance_km'] = d

In [146]:
haversine_distance(df, 'pickup_latitude', 'dropoff_latitude', 'pickup_longitude', 'dropoff_longitude', '')

In [147]:
df.head()

Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickupYear,pickupMonth,pickupWeek,pickupDay,pickupDayofweek,pickupDayofyear,pickupIs_month_end,pickupIs_month_start,pickupIs_quarter_end,pickupIs_quarter_start,pickupIs_year_end,pickupIs_year_start,pickupHour,pickupMinute,pickupSecond,pickupElapsed,distance_km
0,2011-02-27 00:40:00.000000183,35.3,-73.981075,40.774302,-74.002175,40.62195,1.0,2011,2,8,27,6,58,False,False,False,False,False,False,0,40,0,1298767200,17.033901
1,2010-04-10 18:57:00.000000184,10.1,-73.955883,40.71409,-73.985445,40.688862,2.0,2010,4,14,10,5,100,False,False,False,False,False,False,18,57,0,1270925820,3.752274
2,2013-10-06 17:13:30.0000001,4.0,-73.976565,40.785653,-73.97099,40.793163,1.0,2013,10,40,6,6,279,False,False,False,False,False,False,17,13,30,1381079610,0.957932
3,2009-01-21 19:53:00.000000116,5.7,-73.999023,40.723962,-74.01055,40.718733,5.0,2009,1,4,21,2,21,False,False,False,False,False,False,19,53,0,1232567580,1.132136
4,2010-10-16 00:39:00.000000188,18.1,-73.98274,40.738695,-73.970475,40.788387,2.0,2010,10,41,16,5,289,False,False,False,False,False,False,0,39,0,1287189540,5.621222


#### 3) parse gps

In [148]:
def parse_gps(df, prefix):
    lat = prefix + '_latitude'
    lon = prefix + '_longitude'
    df[prefix + '_x'] = np.cos(df[lat]) * np.cos(df[lon])
    df[prefix + '_y'] = np.cos(df[lat]) * np.sin(df[lon]) 
    df[prefix + '_z'] = np.sin(df[lat])
    df.drop([lat, lon], axis=1, inplace=True)

In [149]:
parse_gps(df, 'pickup')
parse_gps(df, 'dropoff')

#### 4) drop na values

In [150]:
df.dropna(inplace=True)
df.isnull().sum()

# drop rows with negative fare_amount
df = df[df['fare_amount'] > 0]

key                       0
fare_amount               0
passenger_count           0
pickupYear                0
pickupMonth               0
pickupWeek                0
pickupDay                 0
pickupDayofweek           0
pickupDayofyear           0
pickupIs_month_end        0
pickupIs_month_start      0
pickupIs_quarter_end      0
pickupIs_quarter_start    0
pickupIs_year_end         0
pickupIs_year_start       0
pickupHour                0
pickupMinute              0
pickupSecond              0
pickupElapsed             0
distance_km               0
pickup_x                  0
pickup_y                  0
pickup_z                  0
dropoff_x                 0
dropoff_y                 0
dropoff_z                 0
dtype: int64

#### 5) get fare with log

In [151]:
y = np.log(df.fare_amount)

In [152]:
y.head()
y.isnull().sum()

0    3.563883
1    2.312535
2    1.386294
3    1.740466
4    2.895912
Name: fare_amount, dtype: float64

0

In [153]:
df.drop(['key', 'fare_amount'], axis=1, inplace=True)

#### 6) split features into categorical and numerical

In [154]:
catf = ['pickupYear', 'pickupMonth', 'pickupWeek', 'pickupDay', 'pickupDayofweek', 
            'pickupDayofyear', 'pickupHour', 'pickupMinute', 'pickupSecond', 'pickupIs_month_end',
            'pickupIs_month_start', 'pickupIs_quarter_end', 'pickupIs_quarter_start',
            'pickupIs_year_end', 'pickupIs_year_start']
numf = [col for col in df.columns if col not in catf]
cat_levels = {}
for c in catf: 
    df[c] = df[c].astype('category').cat.as_ordered()
    # df[c].cat.categories
    cat_levels[c] = dict(zip(df[c].cat.codes + 1, df[c])) #dict(enumerate(df[c].cat.categories)), map function ->  key(category level) : value(category name)
    df[c] = df[c].cat.codes+1
df.head()

Unnamed: 0,passenger_count,pickupYear,pickupMonth,pickupWeek,pickupDay,pickupDayofweek,pickupDayofyear,pickupIs_month_end,pickupIs_month_start,pickupIs_quarter_end,pickupIs_quarter_start,pickupIs_year_end,pickupIs_year_start,pickupHour,pickupMinute,pickupSecond,pickupElapsed,distance_km,pickup_x,pickup_y,pickup_z,dropoff_x,dropoff_y,dropoff_z
0,1.0,3,2,8,27,7,58,1,1,1,1,1,1,1,41,1,1298767200,17.033901,-0.152707,-0.986042,0.066354,-0.169716,-0.961302,0.217014
1,2.0,2,4,14,10,6,100,1,1,1,1,1,1,19,58,1,1270925820,3.752274,-0.127077,-0.983822,0.126276,-0.15555,-0.976179,0.15126
2,1.0,5,10,40,6,7,279,1,1,1,1,1,1,18,14,31,1381079610,0.957932,-0.14836,-0.987401,0.055024,-0.142908,-0.988594,0.047524
3,5.0,1,1,4,21,3,21,1,1,1,1,1,1,20,54,1,1232567580,1.132136,-0.169593,-0.978607,0.116478,-0.180748,-0.975975,0.121669
4,2.0,2,10,41,16,6,289,1,1,1,1,1,1,1,40,1,1287189540,5.621222,-0.153885,-0.982827,0.101833,-0.142365,-0.988432,0.052294


In [155]:
len(catf)
catf

len(numf)
numf

15

['pickupYear',
 'pickupMonth',
 'pickupWeek',
 'pickupDay',
 'pickupDayofweek',
 'pickupDayofyear',
 'pickupHour',
 'pickupMinute',
 'pickupSecond',
 'pickupIs_month_end',
 'pickupIs_month_start',
 'pickupIs_quarter_end',
 'pickupIs_quarter_start',
 'pickupIs_year_end',
 'pickupIs_year_start']

9

['passenger_count',
 'pickupElapsed',
 'distance_km',
 'pickup_x',
 'pickup_y',
 'pickup_z',
 'dropoff_x',
 'dropoff_y',
 'dropoff_z']

### Split Dataset

In [156]:
def split_dataset(df, y): return train_test_split(df, y, test_size=0.25, random_state=42)

In [157]:
X_train, X_test, y_train, y_test = split_dataset(df, y)

X_train.shape
X_test.shape

(525450, 24)

(175151, 24)

In [158]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# scaler.fit(data1)
# data2 = scaler.transform(data1)

# data1.std(), data2.std()


# fit() : estimate mean/var using method and training data set
def get_numf_scaler(train): return preprocessing.StandardScaler().fit(train)

# trainsform() : convert the training data set unsing method
def scale_numf(df, numf, scaler):
    cols = numf
    index = df.index
    scaled = scaler.transform(df[numf])
    scaled = pd.DataFrame(scaled, columns=cols, index=index)
    return pd.concat([scaled, df.drop(numf, axis=1)], axis=1)

In [159]:
scaler = get_numf_scaler(X_train[numf])
print(scaler.mean_)
print(scaler.var_)

X_train_sc = scale_numf(X_train, numf, scaler)
X_train_sc.std(axis=0)

X_test_sc = scale_numf(X_test, numf, scaler)
X_test_sc.std(axis=0)

[ 1.68633552e+00  1.33235745e+09  1.97288967e+01 -1.24580812e-01
 -9.64163420e-01  8.83457557e-02 -1.23905929e-01 -9.64369392e-01
  8.80739664e-02]
[1.79703186e+00 3.39653183e+15 1.36477865e+05 2.62337215e-02
 1.92821308e-02 1.54769615e-03 2.59504100e-02 1.91584858e-02
 1.77307735e-03]


passenger_count             1.000001
pickupElapsed               1.000001
distance_km                 1.000001
pickup_x                    1.000001
pickup_y                    1.000001
pickup_z                    1.000001
dropoff_x                   1.000001
dropoff_y                   1.000001
dropoff_z                   1.000001
pickupYear                  1.858894
pickupMonth                 3.434707
pickupWeek                 14.936537
pickupDay                   8.687571
pickupDayofweek             1.947710
pickupDayofyear           104.730323
pickupIs_month_end          0.178508
pickupIs_month_start        0.173699
pickupIs_quarter_end        0.103784
pickupIs_quarter_start      0.097457
pickupIs_year_end           0.049257
pickupIs_year_start         0.045913
pickupHour                  6.518405
pickupMinute               17.334048
pickupSecond               19.427601
dtype: float64

passenger_count             0.976197
pickupElapsed               1.002186
distance_km                 1.019741
pickup_x                    0.993961
pickup_y                    0.998975
pickup_z                    1.010682
dropoff_x                   0.997654
dropoff_y                   1.000235
dropoff_z                   0.998402
pickupYear                  1.862622
pickupMonth                 3.435877
pickupWeek                 14.946657
pickupDay                   8.691328
pickupDayofweek             1.948226
pickupDayofyear           104.792820
pickupIs_month_end          0.178084
pickupIs_month_start        0.173924
pickupIs_quarter_end        0.103640
pickupIs_quarter_start      0.097725
pickupIs_year_end           0.047615
pickupIs_year_start         0.047315
pickupHour                  6.523568
pickupMinute               17.309792
pickupSecond               19.416418
dtype: float64

---
## Modeling with PyTorch
---

### Defining pytorch dataset and dataloaders

In [163]:
X_train_sc.head()

Unnamed: 0,passenger_count,pickupElapsed,distance_km,pickup_x,pickup_y,pickup_z,dropoff_x,dropoff_y,dropoff_z,pickupYear,pickupMonth,pickupWeek,pickupDay,pickupDayofweek,pickupDayofyear,pickupIs_month_end,pickupIs_month_start,pickupIs_quarter_end,pickupIs_quarter_start,pickupIs_year_end,pickupIs_year_start,pickupHour,pickupMinute,pickupSecond
589188,0.233985,-0.474207,-0.049888,-0.124584,-0.153987,-0.008826,-0.190866,-0.136201,0.20571,3,5,18,6,5,126,1,1,1,1,1,1,23,27,24
148536,-0.511987,-0.439536,-0.042465,-0.048265,-0.180697,-0.669434,-0.203682,-0.136492,0.115329,3,5,22,30,1,150,1,1,1,1,1,1,8,44,20
353394,-0.511987,-0.575693,-0.050348,-0.1824,-0.131328,0.436821,-0.266783,-0.113376,0.469281,3,2,8,27,7,58,1,1,1,1,1,1,12,30,30
209270,-0.511987,-1.664777,-0.048261,-0.335047,-0.103139,0.368917,-0.245041,-0.111626,0.643027,1,2,8,22,7,53,1,1,1,1,1,1,21,31,23
328273,-0.511987,-0.867024,-0.048209,-0.171314,-0.146198,-0.025918,-0.151058,-0.137789,0.38415,2,8,32,14,6,226,1,1,1,1,1,1,24,12,1


In [164]:
X_train_sc.shape

(525450, 24)

In [165]:
import torch.utils.data as data_utils
# # split test and train data set (100,000 for test vs remaining for training)
# train_dataset = data_utils.TensorDataset(torch.tensor(users[batch_size:], dtype = torch.long), torch.tensor(items[batch_size:], dtype = torch.long), torch.tensor(ratings[batch_size:], dtype = torch.float32))
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle=True, num_workers=30)

# train
X_train_cat = X_train_sc[catf]
X_train_num = X_train_sc.drop(catf, axis = 1)

X_train_cat = torch.tensor(np.array(X_train_cat), dtype = torch.long)
X_train_num = torch.tensor(np.array(X_train_num), dtype = torch.float32)
Y_train = torch.tensor(np.array(y_train), dtype = torch.float32)

print(X_train_cat.size())
print(X_train_num.size())

# vaild
X_test_cat = X_test_sc[catf]
X_test_num = X_test_sc.drop(catf, axis = 1)

X_test_cat = torch.tensor(np.array(X_test_cat), dtype = torch.long)
X_test_num = torch.tensor(np.array(X_test_num), dtype = torch.float32)
Y_test = torch.tensor(np.array(y_test), dtype = torch.float32)

print(X_test_cat.size())
print(X_test_num.size())

train_dataset = data_utils.TensorDataset(X_train_cat, X_train_num, Y_train)
test_dataset = data_utils.TensorDataset(X_test_cat, X_test_num, Y_test)

torch.Size([525450, 15])
torch.Size([525450, 9])
torch.Size([175151, 15])
torch.Size([175151, 9])


In [166]:
params = {'batch_size':500,
         'shuffle':True,
         'num_workers':8}

# traindl = data.DataLoader(trainds, **params)
# valdl = data.DataLoader(valds, **params)

In [167]:
train_dataloader = data.DataLoader(train_dataset, **params)
test_dataloader = data.DataLoader(test_dataset, **params)

In [168]:
print(iter(train_dataloader).next()[0].size())
print(iter(train_dataloader).next()[1].size())
print(iter(train_dataloader).next()[2].size())

torch.Size([500, 15])
torch.Size([500, 9])
torch.Size([500])


### Defining model and related variables

In [169]:
y_range = (0, y_train.max()*1.2)
y_range

cat_sz = [(c, df[c].max()+1) for c in catf]
cat_sz

emb_szs = [(c,min(50, (c+1)//2)) for _, c in cat_sz]
emb_szs

(0, 7.457529718106629)

[('pickupYear', 8),
 ('pickupMonth', 13),
 ('pickupWeek', 54),
 ('pickupDay', 32),
 ('pickupDayofweek', 8),
 ('pickupDayofyear', 367),
 ('pickupHour', 25),
 ('pickupMinute', 61),
 ('pickupSecond', 61),
 ('pickupIs_month_end', 3),
 ('pickupIs_month_start', 3),
 ('pickupIs_quarter_end', 3),
 ('pickupIs_quarter_start', 3),
 ('pickupIs_year_end', 3),
 ('pickupIs_year_start', 3)]

[(8, 4),
 (13, 7),
 (54, 27),
 (32, 16),
 (8, 4),
 (367, 50),
 (25, 13),
 (61, 31),
 (61, 31),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2)]

In [170]:
class MixedInputModel(nn.Module):
    def __init__(self, emb_szs, n_cont, out_sz, szs, emb_drop, drops, y_range, use_bn=True):
        super().__init__()

        # categorical variable을 위한 embedding matrix 생성 -> ModuleList로 묶기
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        # initialize embeddings
#         for emb in self.embs:
#             emb_init(emb)
        # embedding size합
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb = n_emb
        self.n_cont = n_cont
        
        szs = [n_emb+n_cont] + szs # Linear의 input dim(embedding + cont) + Linear dim 더하기
        # Linear Layer
        self.lins = nn.ModuleList([nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        # Batch Normalization
        self.bns = nn.ModuleList([nn.BatchNorm1d(sz) for sz in szs[1:]]) # 첫번째 Linear out부터
        
        #for o in self.lins: nn.init.kaiming_normal_(o.weight.data)
        
        # Output Layer
        self.outp = nn.Linear(szs[-1], out_sz)
        #nn.init.kaiming_normal_(self.outp.weight.data)

        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])
        
        # Numeric input을 위한 BatchNorm
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn,self.y_range = use_bn,y_range

    def forward(self, x_cat, x_cont):      
        # categorical input
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embs)]
            x = torch.cat(x, 1)
            #x = self.emb_drop(x)
        
        # numeric input
        if self.n_cont != 0:
            x2 = self.bn(x_cont)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2
        
        # 
        for l,b,d in zip(self.lins, self.bns, self.drops):
            x = F.relu(l(x))
            if self.use_bn:
                x = b(x)
            x = d(x)
        x = self.outp(x)
        if self.y_range:
            x = torch.sigmoid(x)
            x = x*(self.y_range[1] - self.y_range[0])
            x = x+self.y_range[0]
            
        return x.squeeze()

def rmse(targ, y_pred):
    return np.sqrt(mean_squared_error(inv_y(y_pred), inv_y(targ))) #.detach().numpy()

def emb_init(x):
    x = x.weight.data
    sc = 2/(x.size(1)+1)
    x.uniform_(-sc,sc)
    
def inv_y(y): return np.exp(y)

In [171]:
# set device type
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:0


In [172]:
# create model instance and initialize
m = MixedInputModel(emb_szs=emb_szs, 
                    n_cont=len(df.columns)-len(catf), 
                    emb_drop=0.04, 
                    out_sz=1, 
#                     szs=[1000,500,250], 
                    szs=[100,50,30], 
                    drops=[0.001,0.01,0.01], 
                    y_range=y_range).to(device)

opt = optim.Adam(m.parameters(), 1e-2)
lr_cosine = lr_scheduler.CosineAnnealingLR(opt, 1000)

lr = defaultdict(list)
tloss = defaultdict(list)
vloss = defaultdict(list)

for param in m.parameters():
    param.requires_grad = True

m

MixedInputModel(
  (embs): ModuleList(
    (0): Embedding(8, 4)
    (1): Embedding(13, 7)
    (2): Embedding(54, 27)
    (3): Embedding(32, 16)
    (4): Embedding(8, 4)
    (5): Embedding(367, 50)
    (6): Embedding(25, 13)
    (7): Embedding(61, 31)
    (8): Embedding(61, 31)
    (9): Embedding(3, 2)
    (10): Embedding(3, 2)
    (11): Embedding(3, 2)
    (12): Embedding(3, 2)
    (13): Embedding(3, 2)
    (14): Embedding(3, 2)
  )
  (lins): ModuleList(
    (0): Linear(in_features=204, out_features=100, bias=True)
    (1): Linear(in_features=100, out_features=50, bias=True)
    (2): Linear(in_features=50, out_features=30, bias=True)
  )
  (bns): ModuleList(
    (0): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (outp): Linear(in_features=30, out_features=1, bias=True)
 

In [173]:
def model_train(model, train_dl, val_dl, loss_fn, opt, scheduler, epochs=3):
    # total batch number
    num_batch = len(train_dl)

    # epoch 별 train / validatoin loss 적재
    train_loss = [] # 
    val_loss = []

    for epoch in tnrange(epochs):      
        y_true_train = list()
        y_pred_train = list()
        t = tqdm_notebook(iter(train_dl), leave=False, total=num_batch) # t.total -> 총 iteration

        total_loss_train = 0.0
        tot_batch = 0
        for cat, cont, y in t:
            size_minibatch = cat.size()[0]
            cat = cat.to(device)
            cont = cont.to(device)
            y = y.to(device)

    #             t.set_description(f'Epoch {epoch}')

            opt.zero_grad()
            pred = model(cat, cont)
            loss = loss_fn(pred, y)
    #         if np.isnan(loss.to('cpu').detach().numpy().item()):
    #             break
            loss.backward()
            lr[epoch].append(opt.param_groups[0]['lr'])
            tloss[epoch].append(loss.item())
            scheduler.step()
            opt.step()

            t.set_postfix(loss=loss.item())

            y_true_train += list(y.to('cpu').data.numpy())
            y_pred_train += list(pred.to('cpu').data.numpy())
            total_loss_train += loss.item()*size_minibatch
            tot_batch += size_minibatch

        epoch_loss = total_loss_train / tot_batch     
        train_loss.append((epoch, epoch_loss, 'train_loss'))        
        train_acc = rmse(y_true_train, y_pred_train)



        if val_dl:
            y_true_val = list()
            y_pred_val = list()

            total_loss_val = 0.0
            total_val_batch = 0
            for cat, cont, y in tqdm_notebook(val_dl, leave=False):
                val_size_minibatch = cat.size()[0]
                cat = cat.to(device)
                cont = cont.to(device)
                y = y.to(device)
                pred = model(cat, cont)
                loss = loss_fn(pred, y)

                vloss[epoch].append(loss.item())

                y_true_val += list(y.to('cpu').data.numpy())
                y_pred_val += list(pred.to('cpu').data.numpy())      
                total_loss_val += loss.item()*val_size_minibatch
                total_val_batch += val_size_minibatch


            epoch_val_loss = total_loss_val / total_val_batch   
            val_loss.append((epoch, epoch_val_loss, 'val_loss'))
            val_acc = rmse(y_true_val, y_pred_val)


        if val_dl:
            print('epoch {} / train_loss {:.4f} / val loss {:.4f} / train_acc {:.4f} / val_acc {:.4f}'.format(epoch, epoch_loss, epoch_val_loss, train_acc, val_acc))
        else:
            print('epoch {} / train_loss {:.4f} / train_acc {:.4f}'.format(epoch, epoch_loss, train_acc))

    return train_loss, val_loss

In [174]:
res = model_train(m, train_dataloader, test_dataloader, F.mse_loss, opt, lr_cosine, 10)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))

HBox(children=(IntProgress(value=0, max=351), HTML(value='')))

epoch 0 / train_loss 0.1901 / val loss 0.1210 / train_acc 7.5116 / val_acc 5.7821


HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))

HBox(children=(IntProgress(value=0, max=351), HTML(value='')))

epoch 1 / train_loss 0.1110 / val loss 0.0890 / train_acc 5.7710 / val_acc 5.1384


HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))

HBox(children=(IntProgress(value=0, max=351), HTML(value='')))

epoch 2 / train_loss 0.0833 / val loss 0.0797 / train_acc 5.1606 / val_acc 4.7942


HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))

HBox(children=(IntProgress(value=0, max=351), HTML(value='')))

epoch 3 / train_loss 0.0834 / val loss 0.0832 / train_acc 5.1682 / val_acc 4.9772


HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))

HBox(children=(IntProgress(value=0, max=351), HTML(value='')))

epoch 4 / train_loss 0.0785 / val loss 0.0769 / train_acc 5.0566 / val_acc 4.7655


HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))

HBox(children=(IntProgress(value=0, max=351), HTML(value='')))

epoch 5 / train_loss 0.0800 / val loss 0.0809 / train_acc 5.0838 / val_acc 4.8193


HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))

HBox(children=(IntProgress(value=0, max=351), HTML(value='')))

epoch 6 / train_loss 0.0754 / val loss 0.0759 / train_acc 4.9552 / val_acc 4.7884


HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))

HBox(children=(IntProgress(value=0, max=351), HTML(value='')))

epoch 7 / train_loss 0.0778 / val loss 0.0765 / train_acc 5.0348 / val_acc 4.7489


HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))

HBox(children=(IntProgress(value=0, max=351), HTML(value='')))

epoch 8 / train_loss 0.0735 / val loss 0.0767 / train_acc 4.8912 / val_acc 4.7849


HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))

HBox(children=(IntProgress(value=0, max=351), HTML(value='')))

epoch 9 / train_loss 0.0769 / val loss 0.0748 / train_acc 5.0099 / val_acc 4.7028
