In [1]:
import pandas as pd
from pandas import DataFrame as df
import seaborn as sns
import sys
import numpy as np
import time

%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import (LabelEncoder, OneHotEncoder
                                   , RobustScaler, StandardScaler)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

sns.set_style('ticks')
sys.path.append("/home/mschlupp/pythonTools")

In [3]:
# Read only the first 2 rows of the CSV (nrows=2) for a quick look at the file.
data = pd.read_csv("files/finalSets/apps_full.csv", nrows=2)

In [8]:
# Display the first rows of the frame.
# NOTE(review): the stored output below has 5 rows, so it was not produced
# from the 2-row read above -- presumably the cells were run out of order.
data.head()

Unnamed: 0,UniqueLabel,device_id,hour,usageDay,isTrain,group,nActiveApps,nInstallApps,device_model,nEvts
0,6,-6401643145415154744,-2.075058,3,0,12,-1.018037,-0.293883,367,-0.139568
1,5,-6401643145415154744,-2.075058,3,0,12,-1.018037,-0.293883,367,-0.139568
2,6,-6401643145415154744,-2.075058,3,0,12,-1.018037,-0.293883,367,-0.139568
3,13,-6401643145415154744,-2.075058,3,0,12,-1.018037,-0.293883,367,-0.139568
4,18,-6401643145415154744,-2.075058,3,0,12,-1.018037,-0.293883,367,-0.139568


## Variables to use in the model
* categories: [`event_id`, `UniqueLabel`, `usageDay`, `phone_brand`, `device_model`, `nCats`]
   * using all of them results in memory issues (too many dimensions)
   * therefore drop `event_id`, `nCats`, `phone_brand`
* `hour`
* `nInstallApps`
* `nActiveApps`
* `nEvts`


In [2]:
# Columns to load from the CSV: device id, train/test flag, target (group)
# and the feature columns used by the models below.
cols = [
    "UniqueLabel",
    "device_id",
    "hour",
    "usageDay",
    "isTrain",
    "group",
    "nActiveApps",
    "nInstallApps",
    "device_model",
    "nEvts",
]

In [3]:
# Load the full file, restricted to the columns listed in `cols`.
data = pd.read_csv("files/finalSets/apps_full.csv", usecols=cols)

In [4]:
# Standardise the numeric features, one StandardScaler per column.
scaler_nAct = StandardScaler()
scaler_nInst = StandardScaler()
scaler_hour = StandardScaler()
scaler_nevts = StandardScaler()

# Series.reshape was deprecated and later removed from pandas, so reshape the
# underlying NumPy array instead. StandardScaler expects 2-D input of shape
# (n_samples, 1); ravel() flattens the (n_samples, 1) result back to 1-D
# before it is written into the column.
data["nActiveApps"] = scaler_nAct.fit_transform(
    data["nActiveApps"].values.reshape(-1, 1)).ravel()
data["nEvts"] = scaler_nevts.fit_transform(
    data["nEvts"].values.reshape(-1, 1)).ravel()
data["nInstallApps"] = scaler_nInst.fit_transform(
    data["nInstallApps"].values.reshape(-1, 1)).ravel()
data["hour"] = scaler_hour.fit_transform(
    data["hour"].values.reshape(-1, 1)).ravel()



In [5]:
# Integer-encode the categorical columns. Each column gets its own encoder;
# enc_group is used further down to translate class indices back to group names.
# phone_brand is not encoded because it is not among the loaded columns.
enc_device = LabelEncoder()
data["device_model"] = enc_device.fit_transform(data["device_model"])
print("model done.")

enc_group = LabelEncoder()
data["group"] = enc_group.fit_transform(data["group"])
print("group done.")

enc_label = LabelEncoder()
data["UniqueLabel"] = enc_label.fit_transform(data["UniqueLabel"])
print("label done.")

model done.
group done.
label done.


In [7]:
# Columns that remain once the split flag, the device id and the target are
# removed; the displayed order gives the positional indices of the features.
data.drop(["isTrain","device_id","group"], axis=1).columns

Index(['UniqueLabel', 'hour', 'usageDay', 'nActiveApps', 'nInstallApps',
       'device_model', 'nEvts'],
      dtype='object')

In [11]:
# One-hot encoder for the categorical feature columns. The indices refer to
# the feature frame without isTrain/device_id/group:
# 0 = UniqueLabel, 2 = usageDay, 5 = device_model.
try:
    # Reuse the previously fitted encoder when the cached pickle exists.
    # NOTE(review): joblib.load unpickles arbitrary objects -- only load
    # files that were written by this notebook.
    lr_enc = joblib.load("OneHotEncoder.pkl")
except FileNotFoundError:
    # No cache yet (e.g. fresh checkout): fit the encoder from the data so the
    # notebook does not depend on an artifact produced by an earlier session.
    lr_enc = OneHotEncoder(categorical_features=[0,2,5])
    lr_enc.fit(data.drop(["isTrain","device_id","group"],axis=1))

In [None]:
# Write the encoder to disk (compression level 3) under the same file name
# that the cell above loads.
joblib.dump(lr_enc, "OneHotEncoder.pkl", compress=3)

In [6]:
# Separate labelled (isTrain == 1) from unlabelled (isTrain == 0) rows.
# The flag column is removed from both parts; the unlabelled part also loses
# the group column.
drop_list = ["isTrain"]

train = data.loc[data["isTrain"] == 1].drop(drop_list, axis=1)
true_classes = train["group"]

test = data.loc[data["isTrain"] == 0].drop(drop_list + ["group"], axis=1)

In [7]:
# 50/50 hold-out split of the labelled rows. x_train/x_test still contain the
# group and device_id columns; they are removed via train_drop before fitting.
# A fixed random_state makes the split (and every number derived from it)
# reproducible across kernel restarts.
# NOTE(review): this is a row-level split, so rows of one device_id can
# presumably end up in both halves -- confirm whether a per-device split is wanted.
x_train, x_test, y_train, y_test = train_test_split(
    train, true_classes, test_size=0.5, random_state=42)

In [8]:
# Columns that must not be fed to a model: the target and the device id.
train_drop = [
    "group",
    "device_id",
]

In [14]:
# L1-regularised logistic regression using the liblinear solver.
lr = LogisticRegression(
    penalty="l1",
    C=0.5,
    tol=1e-5,             # was 0.0001
    solver="liblinear",   # alternative tried: 'lbfgs'
    max_iter=900,
    warm_start=True,
    # multi_class='multinomial' is left disabled
)

In [None]:
# Fit on the one-hot encoded training features; the target and device id
# columns (train_drop) are removed before encoding.
lr.fit(lr_enc.transform(x_train.drop(train_drop,axis=1)),y_train)

In [22]:
# Persist the fitted model to disk (compression level 3).
joblib.dump(lr, "trainedModels/lr_App_first.pkl", compress=3)

['trainedModels/lr_App_first.pkl']

In [23]:
def printLoss(y_tr,p_tr,y_te,p_te,loss=log_loss):
    """Print the loss of the predictions on the training and the test sample.

    y_tr, p_tr -- true labels and predictions for the training sample
    y_te, p_te -- true labels and predictions for the test sample
    loss       -- callable invoked as loss(y_true, y_pred); defaults to log_loss

    Nothing is returned; the two values are only printed.
    """
    print("Test MVA predictions on test and training set:\n")
    for caption, truth, predicted in (
        ("Log loss on training set: ", y_tr, p_tr),
        ("Log loss on test set: ", y_te, p_te),
    ):
        print(caption, loss(truth, predicted))

In [26]:
# Row-level class probabilities for both halves of the split, using the same
# drop-then-encode preparation as for fitting.
probs_lr_train, probs_lr_test = (
    lr.predict_proba(lr_enc.transform(part.drop(train_drop, axis=1)))
    for part in (x_train, x_test)
)

In [28]:
# Row-level loss on both halves of the split.
printLoss(y_train, probs_lr_train, y_test, probs_lr_test)

Test MVA predictions on test and training set:

Log loss on training set:  1.96650679674
Log loss on test set:  1.96661161514


In [29]:
def averagePredictions(preds, ids, label=None):
    """Average row-level class probabilities per device.

    Parameters
    ----------
    preds : 2-D array-like
        One row of class probabilities per input row; column i is renamed via
        the module-level ``enc_group.inverse_transform``.
    ids : array-like
        Device id belonging to each row of ``preds``.
    label : array-like, optional
        Per-row label. When given it is stored in a ``group`` column and
        averaged together with the probabilities.

    Returns
    -------
    pandas.DataFrame
        One row per ``device_id`` (in order of first appearance) holding the
        mean of every other column.
    """
    # Named `averaged` rather than `df` so the DataFrame alias imported at the
    # top of the notebook is not shadowed.
    averaged = pd.DataFrame(preds)
    averaged.columns = enc_group.inverse_transform(averaged.columns)
    averaged["device_id"] = ids
    # Identity test on purpose: `label == None` is evaluated element-wise for
    # NumPy arrays, and negating that array raises a ValueError.
    if label is not None:
        averaged["group"] = label
    return averaged.groupby("device_id", sort=False, as_index=False).mean()

In [31]:
# Per-device averages of the row-level probabilities (validation half first,
# then training half), carrying the encoded group along as the label.
av_pred_val, av_pred_train = (
    averagePredictions(probs, part.device_id.values, part.group.values)
    for probs, part in ((probs_lr_test, x_test), (probs_lr_train, x_train))
)



In [32]:
# Device-level loss: averaged probabilities against the per-device group.
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
# and returns the same NumPy array.
printLoss(av_pred_train.group, av_pred_train.drop(train_drop,axis=1).values
          , av_pred_val.group, av_pred_val.drop(train_drop,axis=1).values)

Test MVA predictions on test and training set:

Log loss on training set:  2.36184578646
Log loss on test set:  2.36182222885


# Try other models without much encoding

In [None]:
# Fit directly on the feature columns, without the one-hot encoding step.
lr.fit(x_train.drop(train_drop,axis=1),y_train)

In [42]:
# Display the estimator's parameters.
# NOTE(review): the stored output shows C=0.001, penalty='l2', solver='lbfgs',
# which differs from the constructor cell earlier in the notebook --
# presumably lr was redefined in a cell that is no longer present.
lr

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=600, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=1e-05, verbose=0, warm_start=False)

In [43]:
# Row-level probabilities from the model fitted on the un-encoded features,
# followed by the loss on both halves of the split.
probs_lr_train, probs_lr_test = (
    lr.predict_proba(part.drop(train_drop, axis=1))
    for part in (x_train, x_test)
)
printLoss(y_train, probs_lr_train, y_test, probs_lr_test)

Test MVA predictions on test and training set:

Log loss on training set:  2.31551493378
Log loss on test set:  2.31622721104


In [44]:
# Per-device averages of the row-level probabilities, then the device-level loss.
av_pred_val = averagePredictions(probs_lr_test,x_test.device_id.values, x_test.group.values)
av_pred_train = averagePredictions(probs_lr_train,x_train.device_id.values, x_train.group.values)
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
# and returns the same NumPy array.
printLoss(av_pred_train.group, av_pred_train.drop(train_drop,axis=1).values
          , av_pred_val.group, av_pred_val.drop(train_drop,axis=1).values)



Test MVA predictions on test and training set:

Log loss on training set:  2.4042400584
Log loss on test set:  2.40374309803


# Neural Net

In [45]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

Using Theano backend.


In [63]:
# Targets for the network: the group column of the training half as a NumPy array.
true_classes_nn = x_train.group.values

In [64]:
# We would like to use categorical_crossentropy (multiclass log loss),
# so convert the integer class labels to categories.
from keras.utils.np_utils import to_categorical
true_classes_nn = to_categorical(true_classes_nn)
# true_classes_nn is now a binary (one-hot) matrix

In [51]:
# Number of feature columns left after removing train_drop; this is the
# input dimension the network has to accept.
len(x_train.drop(train_drop,axis=1).columns)

7

In [52]:
# Keras models are assembled from layers; here a plain feed-forward stack is
# passed to Sequential as a list, which adds the layers in the given order.
model = Sequential([
    # Input layer: 7 features in, 30 units out, tanh applied as a separate layer
    # (no real motivation for relu here).
    Dense(output_dim=30, input_dim=7),
    Activation("tanh"),
    Dropout(0.4),
    # Hidden layers of decreasing width, each followed by 40% dropout.
    Dense(output_dim=30, activation="tanh"),
    Dropout(0.4),
    Dense(output_dim=23, activation="tanh"),
    Dropout(0.4),
    Dense(output_dim=17, activation="tanh"),
    Dropout(0.4),
    # Output layer: 12 dimensions, turned into probabilities by the softmax.
    Dense(output_dim=12),
    Activation("softmax"),
])

In [53]:
# now we need to configure the learning process
model.compile(loss='categorical_crossentropy'
              ,optimizer = "adadelta" 
              #optimizer='adam'
              , metrics=['accuracy'])

In [None]:
# Train for 20 epochs on the training half and report the wall-clock time.
s = time.time()
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
# and returns the same NumPy array.
model.fit(x_train.drop(train_drop,axis=1).values, true_classes_nn
         , verbose=1, nb_epoch=20)
print("NN trained in ", (time.time()-s)/60.0, " minutes")

In [None]:
# Evaluate on the rows the targets were built from. The original referenced
# `in_data`, which is never defined in this notebook; true_classes_nn is
# derived from x_train, so the matching features are x_train without train_drop.
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
score = model.evaluate(x_train.drop(train_drop,axis=1).values, true_classes_nn)