In [18]:
import pandas as pd
from pandas import DataFrame as df
import seaborn as sns
import sys
import numpy as np
import time

%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Use seaborn's 'ticks' style for the figures drawn in this notebook.
sns.set_style('ticks')
# NOTE(review): machine-specific absolute path; none of the visible cells
# import anything from it — confirm it is still needed before sharing.
sys.path.append("/home/mschlupp/pythonTools")

In [120]:
# Read only the first two rows (nrows=2) to inspect the available columns
# before loading the complete file.
evts = pd.read_csv("files/finalSets/evts_noApp_phone.csv", nrows=2)

In [121]:
# Show the two-row preview loaded above.
evts

Unnamed: 0,event_id,device_id,longitude,latitude,day,time,hour,usageDay,isTrain,group,phone_brand,device_model,nEvts
0,1,29182687948017175,121.38,31.24,Sun,00:55:25,0,3,1,M39+,小米,红米note,256
1,3,-4833982096941402721,106.6,29.7,Sun,00:08:05,0,3,1,M39+,魅族,MX4 Pro,248


#### Model variables
We'll build our model based on:
* `device_model`
* `phone_brand`
* `usageDay`
* `hour`
* `nEvts`
* `longitude` and `latitude`

In [122]:
# Load the complete event table and discard the columns that are not used
# further in this notebook.
unused_columns = ["event_id", "day", "time"]
data = pd.read_csv("files/finalSets/evts_noApp_phone.csv").drop(unused_columns, axis=1)

In [184]:
# Integer-encode the string-valued columns. Each column gets its own encoder
# so the mapping can be inverted later (enc_group is used further down to
# restore the class names of the predicted probabilities).
enc_brand = LabelEncoder()
enc_device = LabelEncoder()
enc_group = LabelEncoder()
data["phone_brand"] = enc_brand.fit_transform(data.phone_brand)
data["device_model"] = enc_device.fit_transform(data.device_model)
data["group"] = enc_group.fit_transform(data.group)

# Scale the coordinates. The scalers need a 2-D (n_samples, 1) input, so the
# underlying NumPy array is reshaped: Series.reshape is deprecated in pandas,
# while .values.reshape works on every version. ravel() turns the (n, 1)
# result back into a plain 1-D column.
scaler_long = RobustScaler()
scaler_lat = RobustScaler()
data["latitude"] = scaler_lat.fit_transform(data.latitude.values.reshape(-1,1)).ravel()
data["longitude"] = scaler_long.fit_transform(data.longitude.values.reshape(-1,1)).ravel()

# One-hot encoder for the model input. After dropping isTrain, device_id and
# group the remaining columns are, in order: longitude, latitude, hour,
# usageDay, phone_brand, device_model, nEvts — so indices 2..6 select
# everything except the two scaled coordinates.
# NOTE(review): index 6 is nEvts, an event count, which is therefore treated
# as a categorical value — confirm this is intended.
lr_enc = OneHotEncoder(categorical_features=[2,3,4,5,6])
lr_enc.fit(data.drop(["isTrain","device_id","group"],axis=1))

OneHotEncoder(categorical_features=[2, 3, 4, 5, 6], dtype=<class 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [53]:
# Inspect the first two rows of the encoded and scaled table.
data.head(2)

Unnamed: 0,device_id,longitude,latitude,hour,usageDay,isTrain,group,phone_brand,device_model,nEvts
0,29182687948017175,0.9,0.003012,0,3,1,11,24,710,256
1,-4833982096941402721,-1.273529,-0.151606,0,3,1,11,64,340,248


### Split in train and test samples

In [185]:
# Separate the rows flagged isTrain == 1 from those flagged isTrain == 0.
# The flag itself is dropped from both frames; the test frame additionally
# loses the "group" column.
drop_list = ["isTrain"]
train_rows = data.isTrain == 1
test_rows = data.isTrain == 0

train = data[train_rows].drop(drop_list, axis=1)
true_classes = train.group
test = data[test_rows].drop(drop_list + ["group"], axis=1)

In [186]:
# Hold out 30% of the training rows for validation, with a fixed seed so the
# split is repeatable.
# NOTE(review): rows are split individually, not grouped by device_id, so
# events of one device can end up on both sides — confirm this is acceptable
# for the per-device averaged score computed later.
x_train, x_val, y_train, y_val = train_test_split(
    train,
    true_classes,
    test_size=0.3,
    random_state=999,
)

In [187]:
# Columns removed from a frame before it is passed to the encoder/model:
# the target ("group") and the device identifier.
train_drop = ["group","device_id"]

In [188]:
# List the columns of the training split (still includes device_id and group).
x_train.columns

Index(['device_id', 'longitude', 'latitude', 'hour', 'usageDay', 'group',
       'phone_brand', 'device_model', 'nEvts'],
      dtype='object')

# Baseline model: linear logistic regression

In [203]:
# Multinomial logistic regression with an L2 penalty, fitted with L-BFGS.
lr = LogisticRegression(
    penalty='l2',
    C=0.1,        # was 0.05
    tol=0.00001,  # was 0.0001
    solver='lbfgs',
    max_iter=600,
    multi_class='multinomial',
)

In [204]:
# One-hot encode the training features (target and device id removed) and
# fit the baseline model on them.
x_train_encoded = lr_enc.transform(x_train.drop(train_drop, axis=1))
lr.fit(x_train_encoded, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=600, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=1e-05, verbose=0, warm_start=False)

In [191]:
def printLoss(y_tr,p_tr,y_te,p_te,loss=log_loss):
    """Print the value of ``loss`` for a training and a held-out sample.

    y_tr, p_tr -- true labels and predictions handed to ``loss`` for the
                  training sample.
    y_te, p_te -- the same pair for the held-out sample.
    loss       -- callable invoked as ``loss(y_true, y_pred)``; defaults to
                  ``log_loss``.

    Nothing is returned; the two values are written to stdout.
    """
    print("Test MVA predictions on test and training set:\n")
    samples = (
        ("Log loss on training set: ", y_tr, p_tr),
        ("Log loss on test set: ", y_te, p_te),
    )
    # Evaluate and print one sample at a time, training sample first.
    for caption, truth, predicted in samples:
        print(caption, loss(truth, predicted))

In [205]:
# Per-row class probabilities of the fitted model on the training and the
# validation split (same preprocessing as used for fitting).
train_features = lr_enc.transform(x_train.drop(train_drop, axis=1))
probs_lr_train = lr.predict_proba(train_features)

val_features = lr_enc.transform(x_val.drop(train_drop, axis=1))
probs_lr_val = lr.predict_proba(val_features)

In [206]:
# Row-level log loss; the line labelled "test set" is the validation split.
printLoss(y_train, probs_lr_train, y_val, probs_lr_val)

Test MVA predictions on test and training set:

Log loss on training set:  1.03844756295
Log loss on test set:  1.04832763126


# Average the predictions in groups of `device_id`

In [194]:
def averagePredictions(preds, ids, label=None, encoder=None):
    """Average per-row class probabilities over all rows of each device.

    Parameters
    ----------
    preds : 2-D array-like
        One row per event and one column per encoded class, e.g. the output
        of ``predict_proba``.
    ids : array-like
        ``device_id`` belonging to each row of ``preds``.
    label : array-like, optional
        Encoded true class of each row. When given it is stored in a
        ``group`` column and averaged together with the probabilities.
    encoder : object with ``inverse_transform``, optional
        Translates the column positions of ``preds`` into class names.
        Defaults to the notebook-level ``enc_group``.

    Returns
    -------
    pandas.DataFrame
        One row per device, in order of first appearance, with the columns
        ``device_id``, one column per class name and, if ``label`` was
        given, ``group``.
    """
    if encoder is None:
        encoder = enc_group

    # Local name deliberately differs from the module-level alias
    # ``df`` (pandas.DataFrame) so that alias is not shadowed here.
    averaged = pd.DataFrame(preds)
    averaged.columns = encoder.inverse_transform(averaged.columns)
    averaged["device_id"] = ids
    # Identity check: ``label == None`` would be evaluated elementwise for a
    # NumPy array and its truth value is ambiguous.
    if label is not None:
        averaged["group"] = label
    return averaged.groupby("device_id", sort=False, as_index=False).mean()

## Because the predictions for a device are not required to sum to 1, we don't have to worry about normalization.

In [207]:
# Collapse the row-level probabilities to one averaged row per device, for
# the validation and the training split; the encoded true class is carried
# along in the "group" column.
av_pred_val = averagePredictions(
    probs_lr_val,
    ids=x_val.device_id.values,
    label=x_val.group.values,
)
av_pred_train = averagePredictions(
    probs_lr_train,
    ids=x_train.device_id.values,
    label=x_train.group.values,
)



In [208]:
# Device-level log loss: the "group" column holds the true class, the
# remaining columns (after removing group and device_id) the averaged class
# probabilities. `.values` replaces the deprecated DataFrame.as_matrix() and
# yields the same NumPy array.
printLoss(av_pred_train.group, av_pred_train.drop(train_drop,axis=1).values
          , av_pred_val.group, av_pred_val.drop(train_drop,axis=1).values)

Test MVA predictions on test and training set:

Log loss on training set:  2.06649951822
Log loss on test set:  2.02609505029


# Save final logistic regression model

In [209]:
# Persist the fitted logistic regression to disk (compression level 3).
joblib.dump(lr, "trainedModels/lr_evts.pkl", compress=3)

['trainedModels/lr_evts.pkl']

# Predict actual test devices

In [198]:
# Row-level class probabilities for the test rows; only device_id has to be
# removed here because the test frame carries no "group" column.
test_features = lr_enc.transform(test.drop("device_id", axis=1))
probs_test = lr.predict_proba(test_features)

In [199]:
# One averaged probability row per test device (no labels available).
test_device_ids = test.device_id.values
av_probs_test = averagePredictions(probs_test, test_device_ids)

In [201]:
# Write the per-device averaged probabilities; index=False keeps the row
# index out of the file.
av_probs_test.to_csv("finalOutputs/lr_Evts.csv", index=False)

# Gradient boosting classifier

In [210]:
# Gradient-boosted trees: 700 boosting stages with a small learning rate,
# at least 800 samples per leaf, and all features considered at each split.
gbdt_settings = dict(
    loss='deviance',
    max_features=None,
    min_samples_leaf=800,
    learning_rate=0.001,
    n_estimators=700,
)
gbdt = GradientBoostingClassifier(**gbdt_settings)

In [None]:
# Fit the gradient-boosting model on the same encoded features as the
# baseline and report the wall-clock duration. `s` is the start time in
# minutes; the elapsed time is end minus start (the reverse order would
# print a negative duration).
s=time.time()/60.0
gbdt.fit(lr_enc.transform(x_train.drop(train_drop,axis=1)),y_train)
print("gradient boosting done in ", (time.time()/60.0)-s)