### Notes
- I had tried building a new dataset in "2020-06-24" but kernel kept on exploding. 
- So here, first I tried iterating through chunks of preprocessing at a time. That still exploded the kernel
- But then [midway](#2020-06-27) I tried out a really cool numpy feature where you can save an array to a file in append mode. 
- And the preprocessing step (using encoders to build transformed data and saving that to disk) , on the `843,416` rows here, took about `39` seconds. 

In [1]:
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
import datetime; import pytz
import matplotlib.pyplot as plt
from scipy.special import softmax
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split # (*arrays, **options)
import numpy as np
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from joblib import dump, load
import joblib
import os
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import fresh.utils as fu

from importlib import reload
from collections import Counter
from tqdm.notebook import tqdm
import fresh.preproc.v1 as pv1
import fresh.preproc.v2 as pv2

In [2]:
datadir = '/opt/data'
localdir = '/opt/program'


tripsdf = pd.read_csv(f'{datadir}/2013-07 - Citi Bike trip data.csv'
                     )#.sample(frac=0.017, random_state=42)
stationsdf = pd.read_csv(f'{localdir}/datas/stations/stations-2018-12-04-c.csv',
                        index_col=0)


In [3]:

X, y, neighborhoods = fu.prepare_data(tripsdf, stationsdf)
# 
# future thinking here...
# ..disk approach => return the location of X, y on disk instead 

In [4]:
workdir = fu.make_work_dir(); print(workdir)
fu.log(workdir, 'Starting')

/opt/program/artifacts/2020-06-26T142154Z


In [7]:
# place a small section of the pre-processed data into a target file..
x_outfile = f'{workdir}/x_transformed.csv'
y_outfile = f'{workdir}/y_enc.csv'

X_transformed, y_enc, proc_dict = pv2.preprocess(
        X[:1000], y[:1000], neighborhoods, labeled=True)

100%|██████████| 10/10 [00:00<00:00, 359.14it/s]


In [11]:
np.savetxt(x_outfile, X_transformed, delimiter=',')

In [16]:
(X_transformed.shape, y_enc.shape, 
 np.hstack((np.resize(y_enc, (1000, 1)), X_transformed)).shape)

((1000, 85), (1000,), (1000, 86))

In [18]:
both_outfile = f'{workdir}/minidata.csv'
yx_data = np.hstack((np.resize(y_enc, (1000, 1)), X_transformed))
np.savetxt(both_outfile, yx_data, delimiter=',', fmt='%u')

### 2020-06-27

#### trying a looped numpy append technique
* Read [here](https://stackoverflow.com/questions/27786868/python3-numpy-appending-to-a-file-using-numpy-savetxt#27980725) that you can pass a file description to `np.savetxt` to accomplish appending. 

In [28]:
# just writing same data a few times first..
both_outfile = f'{workdir}/minidata.csv'
yx_data = np.hstack((np.resize(y_enc, (1000, 1)), X_transformed))
with open(both_outfile, 'ab') as fd:
    np.savetxt(fd, yx_data, delimiter=',', fmt='%u')
    np.savetxt(fd, yx_data, delimiter=',', fmt='%u')
    

In [29]:
array = np.loadtxt(both_outfile, delimiter=',')
array.shape

(3000, 86)

In [31]:
# nice!!! that worked .
dataset_name = None#'train'
outfile = f'{workdir}/{dataset_name or "data"}.csv'
outfile

'/opt/program/artifacts/2020-06-26T142154Z/data.csv'

In [82]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y)
proc_bundle, train_loc = pv2.preprocess(
        X_train, y_train, neighborhoods, workdir=workdir,
        dataset_name='train')
print(train_loc)
bundle_loc = f'{workdir}/proc_bundle.joblib'
joblib.dump({'notebook': '2020-06-26.ipynb',
            'proc_bundle': proc_bundle,
            },
           f'{workdir}/proc_bundle.joblib')
print('Done ', bundle_loc)
test_loc = pv2.preprocess(
        X_test, y_test, neighborhoods, proc_bundle=proc_bundle,
        workdir=workdir,
        dataset_name='test')
print('Done ', test_loc)

100%|██████████| 11/11 [00:46<00:00,  4.25s/it]
  0%|          | 0/11 [00:00<?, ?it/s]

/opt/program/artifacts/2020-06-26T142154Z/train.libsvm
Done  /opt/program/artifacts/2020-06-26T142154Z/proc_bundle.joblib


100%|██████████| 11/11 [00:15<00:00,  1.45s/it]

Done  /opt/program/artifacts/2020-06-26T142154Z/test.libsvm
CPU times: user 29.6 s, sys: 690 ms, total: 30.3 s
Wall time: 1min 3s





Ok cool that appears to have worked 

In [83]:
proc_bundle, outfile

({'enc': OneHotEncoder(categories=[['Alphabet City', 'Battery Park City',
                             'Bedford-Stuyvesant', 'Bloomingdale', 'Boerum Hill',
                             'Bowery', 'Broadway Triangle', 'Brooklyn Heights',
                             'Brooklyn Navy Yard', 'Carnegie Hill',
                             'Carroll Gardens', 'Central Park', 'Chelsea',
                             'Chinatown', 'Civic Center', 'Clinton Hill',
                             'Cobble Hill', 'Columbia Street Waterfront District',
                             'Downtown Brooklyn', 'Dumbo', 'East Harlem',
                             'East Village', 'East Williamsburg',
                             'Financial District', 'Flatiron District',
                             'Fort Greene', 'Fulton Ferry District',
                             'Garment District', 'Governors Island', 'Gowanus', ...],
                            [0, 1, 2], [0, 1, 2, 3, 4]],
                drop=None, dtype=<class 

In [84]:
!ls -lh '/opt/program/artifacts/2020-06-26T142154Z/train.csv'

-rw-r--r-- 1 root root 139M Jun 27 18:38 /opt/program/artifacts/2020-06-26T142154Z/train.csv


In [85]:
# Check if class distribution for split was good...

records = [
    {k:v/x[1] for (k,v) in x[0]} for x in 
[
[[list(a.items()), sum(a.values())] 
                           for a in [dict(Counter(dd))]][0]    

 for dd in [y, y_train, y_test]
]
]


In [86]:
# nice. randomness for the win... class distribution looking good here.
proportionsdf = pd.DataFrame.from_records(records).T
proportionsdf

Unnamed: 0,0,1,2
Stuyvesant Town,0.013765,0.01372,0.013901
Gramercy Park,0.016522,0.016575,0.016362
Theater District,0.025667,0.025719,0.025511
East Village,0.016433,0.016373,0.016613
Chelsea,0.109935,0.109766,0.110441
Union Square,0.013583,0.013524,0.013758
Rose Hill,0.007548,0.007501,0.007688
Midtown West,0.036301,0.036243,0.036475
Midtown East,0.048963,0.049088,0.048588
Murray Hill,0.021489,0.021377,0.021826


In [20]:
# quick test one more time, can I use a DMatrix on xgb.XGBClassifier ? 
# no not really... 
dmatrix = xgb.DMatrix(
    f'{both_outfile}?format=csv&label_column=0&delimiter=,')


[15:28:46] 1000x85 matrix with 85000 entries loaded from /opt/program/artifacts/2020-06-26T142154Z/minidata.csv?format=csv&label_column=0&delimiter=,


In [21]:
xgb_model = xgb.XGBClassifier(objective='multi:softprob')
xgb_model.fit(dmatrix,  verbose=True)

TypeError: fit() missing 1 required positional argument: 'y'

##### thoughts on params to mess with..
num_round , make sure at least 100
gamma, 0
max_delta_step , 1
n_estimators , >100..
min_child_weight, 30
max_depth , 3,4,5,6...
colsample_bytree, 0.4..1.0 
subsample, 0.5..1.0


#### references
- [this](https://github.com/aws/sagemaker-xgboost-container/blob/master/src/sagemaker_xgboost_container/data_utils.py) is a handy reference around xgb utils

In [35]:
os.getpid()

1342