In [1]:
%run preamble.ipynb

The preamble handles a few imports and defines the following functions:

printLoss(y_tr,p_tr,y_te,p_te,loss=log_loss)
createDataSet(predictions, group_encoder, device_ids)
getBestPrediction(data,var='device_id')


A few Debug functions


#### Model variables
We'll build our model based on:
* `device_model`
* `phone_brand`
* `usageDay`
* `hour`
* `nEvts`

In [2]:
data = pd.read_csv("files/finalSets/evts_noApp_phone.csv")
data = data.drop(["event_id","longitude","latitude","day","time"],axis=1)

In [3]:
data.columns

Index(['device_id', 'hour', 'usageDay', 'isTrain', 'group', 'phone_brand',
       'device_model', 'nEvts'],
      dtype='object')

In [4]:
train_drop = ["group","device_id"]

In [5]:
enc_brand = LabelEncoder()
enc_device = LabelEncoder()
enc_group = LabelEncoder()
data["phone_brand"] = enc_brand.fit_transform(data.phone_brand)
data["device_model"] = enc_device.fit_transform(data.device_model)
data["group"] = enc_group.fit_transform(data.group)

#scaler_long = RobustScaler()
#scaler_lat = RobustScaler()
#scaler_nevts = RobustScaler()
#data["latitude"] = scaler_lat.fit_transform(data.latitude.reshape(-1,1))
#data["longitude"] = scaler_long.fit_transform(data.longitude.reshape(-1,1))
#data["nEvts"] = scaler_nevts.fit_transform(data.nEvts.reshape(-1,1))

lr_enc = OneHotEncoder()#categorical_features=[0,1,2,3,4])
lr_enc.fit(data.drop(train_drop+["isTrain"],axis=1))

OneHotEncoder(categorical_features='all', dtype=<class 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [6]:
data.head(2)

Unnamed: 0,device_id,hour,usageDay,isTrain,group,phone_brand,device_model,nEvts
0,29182687948017175,0,3,1,11,0,143,256
1,-4833982096941402721,0,3,1,11,0,426,248


### Split in train and test samples

In [7]:
drop_list = ["isTrain"]
train = data[data.isTrain==1].drop( drop_list, axis=1)
true_classes = train.group
test = data[data.isTrain==0].drop( drop_list+["group"], axis=1)

In [8]:
x_train, x_val, y_train, y_val = train_test_split(train
                                                  , true_classes
                                                  , test_size=0.3
                                                  , random_state=999)

In [9]:
x_train.columns

Index(['device_id', 'hour', 'usageDay', 'group', 'phone_brand', 'device_model',
       'nEvts'],
      dtype='object')

# Baseline model: linear logistic regression

In [7]:
lr = joblib.load("trainedModels/lr_evts_nEvtsWoLatLong.pkl")

In [41]:
lr = LogisticRegression(penalty='l2'
                        , C=0.4 # was 0.05
                        , tol=0.001 # was 0.0001
                        , solver='lbfgs'
                        , max_iter=300
                        #, warm_start=True
                        , multi_class='multinomial'
                        , verbose =1
                        , n_jobs=3)

In [42]:
s=time.time()
lr.fit(lr_enc.transform(x_train.drop(train_drop,axis=1)),y_train)
print((time.time()-s)/60.0, " minutes")

3.9555771946907043  minutes


[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:  3.9min finished


In [43]:
probs_lr_train = lr.predict_proba(lr_enc.transform(x_train.drop(train_drop,axis=1)))
probs_lr_val = lr.predict_proba(lr_enc.transform(x_val.drop(train_drop,axis=1)))

printLoss(y_train, probs_lr_train, y_val, probs_lr_val)

Test MVA predictions on test and training set:

Log loss on training set:  0.977170403555
Log loss on test set:  0.987642916307


In [44]:
joblib.dump(lr, "trainedModels/lr_evts_nEvtsWoLatLong_LargeC04.pkl", compress=3)

['trainedModels/lr_evts_nEvtsWoLatLong_LargeC04.pkl']

* c04_t0.001: 0.987 test (vs 0.977)

In [45]:
probs_lr_train = createDataSet(probs_lr_train, enc_group, x_train.device_id.values)

probs_lr_val = createDataSet(probs_lr_val, enc_group, x_val.device_id.values)

### Now we take the maximum probability of highest mean class prediction

In [46]:
probs_lr_train = getBestPrediction(probs_lr_train)
probs_lr_val = getBestPrediction(probs_lr_val)

3001 / 11002  groups processed...
6001 / 11002  groups processed...
9001 / 11002  groups processed...
3001 / 10070  groups processed...
6001 / 10070  groups processed...
9001 / 10070  groups processed...


In [47]:
printLoss(y_train.iloc[probs_lr_train.index.values]
          , probs_lr_train.drop("device_id",axis=1).as_matrix()
          , y_val.iloc[probs_lr_val.index.values]
          , probs_lr_val.drop("device_id",axis=1).as_matrix())

Test MVA predictions on test and training set:

Log loss on training set:  2.07851596885
Log loss on test set:  2.02997305798


* 2.030 (test vs train 2.078) C0.4t0l0.001

### Predict actual test devices

In [48]:
probs_test = lr.predict_proba(lr_enc.transform(test.drop(["device_id"],axis=1)))

In [24]:
#av_probs_test = averagePredictions(probs_test
#, test.device_id.values)

In [49]:
probs_test = pd.DataFrame(probs_test)
probs_test.columns = enc_group.inverse_transform(probs_test.columns.values)
probs_test["device_id"] = test.device_id.values
probs_test.to_csv("finalOutputs/lr_Evts_woLatLong.csv", index=False)

# Random forest

In [9]:
rf = RandomForestClassifier(n_estimators=270
                            , criterion='entropy'
                            , min_samples_split=5
                            , min_samples_leaf=1
                            , max_leaf_nodes=None
                            , n_jobs=2
                            , random_state=666
                            , verbose=1
                            #, warm_start=True
                           )

In [11]:
x_train.drop(train_drop,axis=1).columns

Index(['hour', 'usageDay', 'phone_brand', 'device_model', 'nEvts'], dtype='object')

In [10]:
s=time.time()/60.0
rf.fit(x_train.drop(train_drop,axis=1),y_train)
print("rf done in ", (time.time()/60.0)-s, " minutes.")

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   40.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  2.9min


rf done in  3.939246840775013  minutes.


[Parallel(n_jobs=2)]: Done 270 out of 270 | elapsed:  3.9min finished


In [11]:
probs_rf_train = rf.predict_proba(x_train.drop(train_drop,axis=1))
probs_rf_val = rf.predict_proba(x_val.drop(train_drop,axis=1))

printLoss(y_train, probs_rf_train, y_val, probs_rf_val)

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   37.3s
[Parallel(n_jobs=2)]: Done 270 out of 270 | elapsed:   55.9s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    3.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   14.3s
[Parallel(n_jobs=2)]: Done 270 out of 270 | elapsed:   22.7s finished


Test MVA predictions on test and training set:

Log loss on training set:  0.0834510049296
Log loss on test set:  0.182870837665


In [12]:
probs_rf_train = createDataSet(probs_rf_train, enc_group, x_train.device_id.values)
probs_rf_val = createDataSet(probs_rf_val, enc_group, x_val.device_id.values)

In [13]:
probs_rf_train = getBestPrediction(probs_rf_train)
probs_rf_val = getBestPrediction(probs_rf_val)
printLoss(y_train.iloc[probs_rf_train.index.values]
          , probs_rf_train.drop("device_id",axis=1).as_matrix()
          , y_val.iloc[probs_rf_val.index.values]
          , probs_rf_val.drop("device_id",axis=1).as_matrix())

3001 / 11002  groups processed...
6001 / 11002  groups processed...
9001 / 11002  groups processed...
3001 / 10070  groups processed...
6001 / 10070  groups processed...
9001 / 10070  groups processed...
Test MVA predictions on test and training set:

Log loss on training set:  0.20624902705
Log loss on test set:  0.446713947223


* RF (n_estimators=100, criterion='entropy'
     , min_samples_split=2, min_samples_leaf=1)
     * test score: 0.624 (0.070 train)
* RF(... same, but criterion='gini')
    * test score: 0.77 (0.070 train)
* min_sample_split=100
    * test: 1.53 (train 1.53)
* min_sample_split=20, n_est=300
    * test: 0.83 (train 0.76)
* min_sample_split=4, n_est=444 (mem issues)
    * test: 0.389 (train=0.156)
* min_sample split=6, n=350 (prediction problem (memory), try just loading it)
    * test: 0.442 (train 0.258)
* min_sample 8, n_est 320
    * test: 0.504 (train:0.353 )
* min sample 5, n_est 270
    * test: 0.447 (train 0.206)

In [14]:
#joblib.dump(rf, "trainedModels/rf_350_split6_entropy.pkl", compress=3)
joblib.dump(rf, "trainedModels/rf_270_split5_entropy.pkl", compress=3)

['trainedModels/rf_270_split5_entropy.pkl']

### predict test dataset

In [11]:
#rf=joblib.load("trainedModels/rf_")

In [15]:
probs_rf_test = rf.predict_proba(test.drop("device_id",axis=1))

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   14.9s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done 270 out of 270 | elapsed:  2.2min finished


In [17]:
help(createDataSet)

Help on function createDataSet in module __main__:

createDataSet(predictions, group_encoder, device_ids)
    Creates prediction dataset to save into csv from 
    multiclass predictions (ndarray).
    
    Arguments:
    predictions   - (ndarray) predictions from the MVA.
    group_encoder - (LabelEncoder) to transform column names.
    device_ids    - (pd.Series, array...) respective device_ids.



In [19]:
probs_rf_test = createDataSet(probs_rf_test, enc_group, test.device_id.values)

In [21]:
probs_rf_test.to_csv("finalOutputs/rf_evts_270_split5_entropy.csv", index=False)

# Gradient boosting classifier

In [26]:
gbdt = GradientBoostingClassifier(loss='deviance'
                                  ,max_features=None
                                  , min_samples_leaf=800
                                  , learning_rate=0.01
                                  , n_estimators=300);

In [10]:
gbdt = joblib.load("trainedModels/gbdt_evts.pkl")

In [28]:
s=time.time()/60.0
gbdt.fit(x_train.drop(train_drop,axis=1),y_train)
print("gradient boosting done in ", (time.time()/60.0)-s, " minutes.")

gradient boosting done in  62.727684278041124  minutes.


In [34]:
p_gbdt_test = gbdt.predict_proba(test.drop(["device_id"],axis=1))

In [35]:
p_gbdt_val = gbdt.predict_proba(x_val.drop(train_drop,axis=1))
p_gbdt_train = gbdt.predict_proba(x_train.drop(train_drop,axis=1))

In [36]:
printLoss(y_val,p_gbdt_val, y_train, p_gbdt_train)

Test MVA predictions on test and training set:

Log loss on training set:  1.9056459881
Log loss on test set:  1.90565042031


In [38]:
av_val = averagePredictions(p_gbdt_val, x_val.device_id.values, x_val.group.values)
av_train = averagePredictions(p_gbdt_train, x_train.device_id.values, x_train.group.values)



In [39]:
printLoss(av_train.group, av_train.drop(train_drop,axis=1).as_matrix()
          , av_val.group, av_val.drop(train_drop,axis=1).as_matrix())

Test MVA predictions on test and training set:

Log loss on training set:  2.29086334479
Log loss on test set:  2.28474007692


#### This screams for optimisation... for now, we can go with the logreg solution.