In [1]:
import pandas as pd
from pandas import DataFrame as df
import seaborn as sns
import sys
import numpy as np
import time

%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

sns.set_style('ticks')
sys.path.append("/home/mschlupp/pythonTools")

In [2]:
# check what we have to do first
evts = pd.read_csv("files/finalSets/evts_noApp_phone.csv", nrows=2)

In [3]:
evts

Unnamed: 0,event_id,device_id,longitude,latitude,day,time,hour,usageDay,isTrain,group,phone_brand,device_model,nEvts
0,1,29182687948017175,121.38,31.24,Sun,00:55:25,0,3,1,M39+,小米,红米note,256
1,3,-4833982096941402721,106.6,29.7,Sun,00:08:05,0,3,1,M39+,魅族,MX4 Pro,248


#### Model variables
We'll build our model based on:
* `device_model`
* `phone_brand`
* `usageDay`
* `hour`
* `nEvts`
* `longitude` and `latitude`

In [4]:
data = pd.read_csv("files/finalSets/evts_noApp_phone.csv")
data = data.drop(["event_id","day","time"],axis=1)

In [11]:
enc_brand = LabelEncoder()
enc_device = LabelEncoder()
enc_group = LabelEncoder()
data["phone_brand"] = enc_brand.fit_transform(data.phone_brand)
data["device_model"] = enc_device.fit_transform(data.device_model)
data["group"] = enc_group.fit_transform(data.group)

scaler_long = RobustScaler()
scaler_lat = RobustScaler()
scaler_nevts = RobustScaler()
data["latitude"] = scaler_lat.fit_transform(data.latitude.reshape(-1,1))
data["longitude"] = scaler_long.fit_transform(data.longitude.reshape(-1,1))
#data["nEvts"] = scaler_nevts.fit_transform(data.nEvts.reshape(-1,1))

lr_enc = OneHotEncoder(categorical_features=[0,1,2,3,4])
lr_enc.fit(data.drop(["isTrain","device_id","group","longitude","latitude"],axis=1))

OneHotEncoder(categorical_features=[0, 1, 2, 3, 4], dtype=<class 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [6]:
data.head(2)

Unnamed: 0,device_id,longitude,latitude,hour,usageDay,isTrain,group,phone_brand,device_model,nEvts
0,29182687948017175,0.9,0.003012,0,3,1,11,24,710,256
1,-4833982096941402721,-1.273529,-0.151606,0,3,1,11,64,340,248


### Split in train and test samples

In [6]:
drop_list = ["isTrain"]
train = data[data.isTrain==1].drop( drop_list, axis=1)
true_classes = train.group
test = data[data.isTrain==0].drop( drop_list+["group"], axis=1)

In [7]:
x_train, x_val, y_train, y_val = train_test_split(train
                                                  , true_classes, test_size=0.3
                                                  ,random_state=999)

In [8]:
train_drop = ["group","device_id"]

In [188]:
x_train.columns

Index(['device_id', 'longitude', 'latitude', 'hour', 'usageDay', 'group',
       'phone_brand', 'device_model', 'nEvts'],
      dtype='object')

# Baseline model: linear logistic regression

In [12]:
lr = LogisticRegression(penalty='l2'
                        , C=0.1 # was 0.05
                        , tol=0.00001 # was 0.0001
                        , solver='lbfgs'
                        , max_iter=600
                        , multi_class='multinomial')

In [13]:
lr.fit(lr_enc.transform(x_train.drop(train_drop+["latitude","longitude"],axis=1)),y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=600, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=1e-05, verbose=0, warm_start=False)

In [14]:
def printLoss(y_tr,p_tr,y_te,p_te,loss=log_loss):
    print("Test MVA predictions on test and training set:\n")
    print("Log loss on training set: ", loss(y_tr,p_tr))
    print("Log loss on test set: ", loss(y_te,p_te))

In [16]:
probs_lr_train = lr.predict_proba(lr_enc.transform(x_train.drop(train_drop+["latitude","longitude"],axis=1)))
probs_lr_val = lr.predict_proba(lr_enc.transform(x_val.drop(train_drop+["latitude","longitude"],axis=1)))

In [17]:
printLoss(y_train, probs_lr_train, y_val, probs_lr_val)

Test MVA predictions on test and training set:

Log loss on training set:  1.04151752526
Log loss on test set:  1.05119874793


# Average the predictions in groups of `device_id`

In [18]:
def averagePredictions(preds, ids, label=None):
    df = pd.DataFrame(preds)
    df.columns = enc_group.inverse_transform(df.columns)
    df["device_id"] = ids
    if not label == None:
        df["group"] = label
    df = df.groupby("device_id", sort=False, as_index=False).agg(np.mean)
    return df

## As the total sum of our predictions do not need to add up to 1, we don't have to worry about normalization.

In [19]:
av_pred_val = averagePredictions(probs_lr_val,x_val.device_id.values, x_val.group.values)
av_pred_train = averagePredictions(probs_lr_train,x_train.device_id.values, x_train.group.values)



In [21]:
printLoss(av_pred_train.group, av_pred_train.drop(train_drop,axis=1).as_matrix()
          , av_pred_val.group, av_pred_val.drop(train_drop,axis=1).as_matrix())

Test MVA predictions on test and training set:

Log loss on training set:  2.07042975619
Log loss on test set:  2.03019174585


# Save final logistic regression mode

In [22]:
joblib.dump(lr, "trainedModels/lr_evts_nEvtsWoLatLong.pkl", compress=3)

['trainedModels/lr_evts_nEvtsWoLatLong.pkl']

# Predict actual test devices

In [23]:
probs_test = lr.predict_proba(lr_enc.transform(test.drop(["device_id","latitude","longitude"],axis=1)))

In [24]:
av_probs_test = averagePredictions(probs_test
                                   , test.device_id.values)

In [25]:
av_probs_test.to_csv("finalOutputs/lr_Evts_woLatLong.csv", index=False)

# Gradient boosting classifier

In [26]:
gbdt = GradientBoostingClassifier(loss='deviance'
                                  ,max_features=None
                                  , min_samples_leaf=800
                                  , learning_rate=0.01
                                  , n_estimators=300);

In [10]:
gbdt = joblib.load("trainedModels/gbdt_evts.pkl")

In [28]:
s=time.time()/60.0
gbdt.fit(x_train.drop(train_drop,axis=1),y_train)
print("gradient boosting done in ", (time.time()/60.0)-s, " minutes.")

gradient boosting done in  62.727684278041124  minutes.


In [34]:
p_gbdt_test = gbdt.predict_proba(test.drop(["device_id"],axis=1))

In [35]:
p_gbdt_val = gbdt.predict_proba(x_val.drop(train_drop,axis=1))
p_gbdt_train = gbdt.predict_proba(x_train.drop(train_drop,axis=1))

In [36]:
printLoss(y_val,p_gbdt_val, y_train, p_gbdt_train)

Test MVA predictions on test and training set:

Log loss on training set:  1.9056459881
Log loss on test set:  1.90565042031


In [38]:
av_val = averagePredictions(p_gbdt_val, x_val.device_id.values, x_val.group.values)
av_train = averagePredictions(p_gbdt_train, x_train.device_id.values, x_train.group.values)



In [39]:
printLoss(av_train.group, av_train.drop(train_drop,axis=1).as_matrix()
          , av_val.group, av_val.drop(train_drop,axis=1).as_matrix())

Test MVA predictions on test and training set:

Log loss on training set:  2.29086334479
Log loss on test set:  2.28474007692


#### This screams for optimisation... for now, we can go with the logreg solution.