In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import xgboost

In [46]:
from sklearn.utils import shuffle
from numpy import sort
from matplotlib import pyplot
from xgboost import plot_importance
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, classification_report, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve

In [47]:
# Load the four featured device datasets; one DataFrame per CSV, in file order.
devices0, devices1, devices2, devices3 = [
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "devices_featured0_final.csv",
        "devices_featured1_final.csv",
        "devices_featured2_final.csv",
        "devices_featured3_final.csv",
    )
]

In [48]:
# Print the column names, non-null counts and dtypes of the first dataset.
devices0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166029 entries, 0 to 166028
Data columns (total 52 columns):
ref_hash                         166029 non-null int64
seconds_to_conversion            166029 non-null float64
installs_not_implicit_percent    166029 non-null float64
installs_implicit_percent        166029 non-null float64
installs_day_1                   166029 non-null float64
installs_day_2                   166029 non-null float64
installs_day_3                   166029 non-null float64
total_installs                   166029 non-null float64
installs_day_1_percent           166029 non-null float64
installs_day_2_percent           166029 non-null float64
installs_day_3_percent           166029 non-null float64
total_installs_normalized        166029 non-null float64
events_day_1                     166029 non-null int64
events_day_2                     166029 non-null int64
events_day_3                     166029 non-null int64
total_events                     166029 no

In [49]:
# Regression targets taken from the `seconds_to_conversion` column of each dataset.
sec_to_conv0, sec_to_conv1, sec_to_conv2, sec_to_conv3 = (
    np.array(frame['seconds_to_conversion'])
    for frame in (devices0, devices1, devices2, devices3)
)

# Regression targets taken from the `seconds_to_auction` column of each dataset.
sec_to_auct0, sec_to_auct1, sec_to_auct2, sec_to_auct3 = (
    np.array(frame['seconds_to_auction'])
    for frame in (devices0, devices1, devices2, devices3)
)

# Feature matrices: every remaining column once both targets and the
# `ref_hash` identifier column have been dropped.
data_array0, data_array1, data_array2, data_array3 = (
    np.array(frame.drop(columns=['seconds_to_conversion', 'ref_hash', 'seconds_to_auction']))
    for frame in (devices0, devices1, devices2, devices3)
)

In [50]:
def _split_70_30(features, target, seed):
    """Split features/target with 30% held out for testing, using `seed` as random_state."""
    return train_test_split(features, target, test_size=0.30, random_state=seed)


# Splits 0-3: the target is seconds_to_conversion, one split per dataset.
x_train0, x_test0, y_train0, y_test0 = _split_70_30(data_array0, sec_to_conv0, 1)
x_train1, x_test1, y_train1, y_test1 = _split_70_30(data_array1, sec_to_conv1, 2)
x_train2, x_test2, y_train2, y_test2 = _split_70_30(data_array2, sec_to_conv2, 3)
x_train3, x_test3, y_train3, y_test3 = _split_70_30(data_array3, sec_to_conv3, 4)

# Splits 4-7: same feature matrices, but the target is seconds_to_auction.
x_train4, x_test4, y_train4, y_test4 = _split_70_30(data_array0, sec_to_auct0, 5)
x_train5, x_test5, y_train5, y_test5 = _split_70_30(data_array1, sec_to_auct1, 6)
x_train6, x_test6, y_train6, y_test6 = _split_70_30(data_array2, sec_to_auct2, 7)
x_train7, x_test7, y_train7, y_test7 = _split_70_30(data_array3, sec_to_auct3, 8)

In [51]:
def logisticReg(x_train, x_test, y_train, y_test):
    """Fit a logistic regression (SAG solver) and return its RMSE on the test set.

    Parameters
    ----------
    x_train, y_train
        Features and targets the model is fitted on.
    x_test, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    The square root of ``mean_squared_error(y_test, predictions)``.

    NOTE(review): ``LogisticRegression`` is a classifier, while the targets
    visible in this notebook (e.g. ``seconds_to_conversion``) are float
    columns -- confirm this helper is only called with discrete labels.
    """
    classifier = LogisticRegression(random_state=1, solver='sag')
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    mse = mean_squared_error(y_test, predictions)
    return np.sqrt(mse)

In [52]:
def knn(k, x_train, x_test, y_train, y_test):
    """Fit a k-nearest-neighbours regressor (Manhattan metric) and return its test RMSE.

    Parameters
    ----------
    k
        Number of neighbours, forwarded as ``n_neighbors``.
    x_train, y_train
        Features and targets the model is fitted on.
    x_test, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    The square root of ``mean_squared_error(y_test, predictions)``.
    """
    regressor = KNeighborsRegressor(metric='manhattan', n_neighbors=k)
    regressor.fit(x_train, y_train)
    squared_error = mean_squared_error(y_test, regressor.predict(x_test))
    return np.sqrt(squared_error)

In [53]:
def decisionTree(max_depth, max_features, x_train, x_test, y_train, y_test):
    """Fit a decision-tree regressor and return its RMSE on the test set.

    Parameters
    ----------
    max_depth, max_features
        Forwarded unchanged to ``DecisionTreeRegressor``.
    x_train, y_train
        Features and targets the tree is fitted on.
    x_test, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    The square root of ``mean_squared_error(y_test, predictions)``.
    """
    tree_settings = {'max_depth': max_depth, 'max_features': max_features}
    tree = DecisionTreeRegressor(**tree_settings)
    tree.fit(x_train, y_train)
    predictions = tree.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, predictions))

In [54]:
def xgboost(max_depth, n_estimators, x_train, x_test, y_train, y_test):
    """Fit an ``XGBRegressor`` and return its RMSE on the test set.

    Parameters
    ----------
    max_depth, n_estimators
        Forwarded unchanged to ``XGBRegressor``.
    x_train, y_train
        Features and targets the booster is fitted on.
    x_test, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    The square root of ``mean_squared_error(y_test, predictions)``.

    NOTE(review): defining this function rebinds the name ``xgboost``, which
    the notebook's first cell imported as a module.
    """
    booster = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth)
    booster.fit(x_train, y_train)
    predictions = booster.predict(x_test)
    mse = mean_squared_error(y_test, predictions)
    return np.sqrt(mse)

In [55]:
def adaboost(nEstimators, learningRate, x_train, x_test, y_train, y_test):
    """Fit an ``AdaBoostRegressor`` and return its RMSE on the test set.

    Parameters
    ----------
    nEstimators
        Forwarded as ``n_estimators``.
    learningRate
        Forwarded as ``learning_rate``.
    x_train, y_train
        Features and targets the ensemble is fitted on.
    x_test, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    The square root of ``mean_squared_error(y_test, predictions)``.
    """
    ada = AdaBoostRegressor(n_estimators=nEstimators, learning_rate=learningRate, random_state=0)
    ada.fit(x_train, y_train)
    # Predict with the AdaBoost model fitted just above. The name `xgb` only
    # exists as a local inside the `xgboost` helper, so it must not be used here.
    y_pred = ada.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [56]:
def bagging(baseEstimator, nEstimators, learningRate, max_feature, bootstrap, bootstrap_feature, x_train, x_test, y_train, y_test, seed=0):
    """Fit a ``BaggingRegressor`` and return its RMSE on the test set.

    Parameters
    ----------
    baseEstimator
        Estimator to bag; passed as the first positional argument of
        ``BaggingRegressor``.
    nEstimators
        Forwarded as ``n_estimators``.
    learningRate
        Unused. Kept only so existing positional calls keep working.
    max_feature
        Forwarded as ``max_features``.
    bootstrap
        Forwarded as ``bootstrap``.
    bootstrap_feature
        Forwarded as ``bootstrap_features``.
    x_train, y_train
        Features and targets the ensemble is fitted on.
    x_test, y_test
        Held-out features and targets used for scoring.
    seed
        Forwarded as ``random_state``; defaults to 0.

    Returns
    -------
    The square root of ``mean_squared_error(y_test, predictions)``.
    """
    # The estimator is passed positionally because its keyword name differs
    # between scikit-learn releases (`base_estimator` vs. `estimator`).
    bag = BaggingRegressor(
        baseEstimator,
        n_estimators=nEstimators,
        max_features=max_feature,
        bootstrap=bootstrap,
        bootstrap_features=bootstrap_feature,
        random_state=seed,
    )
    bag.fit(x_train, y_train)
    y_pred = bag.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [57]:
def randomForest(n_estimators, max_depth, x_train, x_test, y_train, y_test):
    """Fit a random-forest regressor and return its RMSE on the test set.

    Parameters
    ----------
    n_estimators, max_depth
        Forwarded unchanged to ``RandomForestRegressor``.
    x_train, y_train
        Features and targets the forest is fitted on.
    x_test, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    The square root of ``mean_squared_error(y_test, predictions)``.
    """
    forest = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators)
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    mse = mean_squared_error(y_test, predictions)
    return np.sqrt(mse)

In [58]:
# Candidate hyper-parameter values for a random-forest search.
# NOTE(review): 'auto' is not accepted for max_features by every scikit-learn
# release -- confirm against the installed version before running a search.
random_forest_grid_param = dict(
    n_estimators=[3, 10, 20, 40, 50, 70, 80, 100, 120, 160, 200, 250, 300],
    max_features=['auto', 'log2'],
    max_depth=[3, 5, 7, 9, 15, 30, 40, 50, 60, 70, 80, 90, 100],
    min_samples_split=[2, 5, 8, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [59]:
# Candidate hyper-parameter values for an XGBoost search. Each trailing comment
# records the wider range that was considered before the grid was trimmed.
xgb_grid_param = dict(
    max_depth=[3, 5, 7, 9],            # was [3,4,5,6,7,8,9]; 5 is good but takes too long in kaggle env
    subsample=[0.6, 0.8],              # was [0.4,0.5,0.6,0.7,0.8,0.9,1.0]
    colsample_bytree=[0.6, 0.8],       # was [0.5,0.6,0.7,0.8]
    n_estimators=[100, 300, 500, 700], # was [1000,2000,3000]
    learning_rate=[0.05, 0.1],
)

In [65]:
# RMSE of a 3-neighbour KNN regressor on split 0 (dataset 0, seconds_to_conversion).
resultknn = knn(
    k=3,
    x_train=x_train0,
    x_test=x_test0,
    y_train=y_train0,
    y_test=y_test0,
)

In [67]:
# RMSE of a depth-10 decision tree on split 0 (dataset 0, seconds_to_conversion).
resultDT = decisionTree(
    max_depth=10,
    max_features='auto',
    x_train=x_train0,
    x_test=x_test0,
    y_train=y_train0,
    y_test=y_test0,
)

In [68]:
# Display the decision-tree RMSE computed in the previous cell.
resultDT

40685.070109240674