In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.grid_search import RandomizedSearchCV
import xgboost

In [2]:
# Load the training split from the working directory (the file name suggests
# the features were already filtered with Boruta -- confirm upstream).
df = pd.read_csv("./work_dir/boruta_filtered_train_split.csv")

In [3]:
# Separate the feature matrix from the target column 'y'.
X = df.drop('y', axis=1)
y = df['y']

In [4]:
# Sanity check: (rows, columns) of the feature matrix.
X.shape

(120000, 20)

In [5]:
# Sanity check: the target should have one entry per feature row.
y.shape

(120000,)

In [6]:
# Modelling pipeline: robust feature scaling followed by logistic regression.
scaler = RobustScaler()
logit = LogisticRegression(n_jobs=4)
pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('logit', logit),
])

In [7]:
# Randomized search over C (inverse regularisation strength) of the 'logit'
# pipeline step. Candidates are 0.01, 0.11, ..., 9.91; the search samples a
# subset of them (n_iter is left at the library default) and scores each one
# with 5-fold cross-validated ROC AUC.
hyperparameters = {'logit__C': np.arange(.01, 10, .1)}
# A fixed random_state makes the sampled candidates -- and therefore
# best_params_ / best_score_ -- reproducible across kernel restarts.
search = RandomizedSearchCV(pipe, hyperparameters, cv=5, scoring='roc_auc',
                            random_state=42)

In [8]:
# Fit the search on the full training data; with the default refit behaviour
# the best pipeline is retrained and exposed as search.best_estimator_.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# results shown in later cells presumably come from a subsequent re-run -- confirm.
search.fit(X,y)

KeyboardInterrupt: 

In [12]:
# Best sampled value of C found by the search.
search.best_params_

{'logit__C': 3.6099999999999999}

In [13]:
# Mean cross-validated ROC AUC of the best candidate.
search.best_score_

0.70414830209037904

In [15]:
## Make predictions on the Kaggle test set to get on the leaderboard

In [18]:
# Load the Kaggle test set and the feature-support file; column '0' of the
# latter is used further down as the list of column labels to keep.
kaggle_test = pd.read_csv("./work_dir/my_midterm_kaggle_test.csv")
selected_features = pd.read_csv("./work_dir/feature_support.csv")

In [22]:
# Keep only the selected feature columns of the test set. `.ix` is deprecated
# and was removed in pandas 1.0; the selection here is purely label-based, so
# `.loc` is the direct replacement. Recent pandas versions raise a KeyError if
# a listed feature is missing from the test file.
kaggle_test_selected = kaggle_test.loc[:, selected_features['0'].values]

In [26]:
# Score the test set with the refit best pipeline and keep column index 1 of
# predict_proba (presumably the probability of the positive class -- confirm
# against the estimator's classes_).
prediction = pd.DataFrame(
    search.best_estimator_.predict_proba(kaggle_test_selected)[:, 1]
)

In [27]:
# Rename the single probability column to 'y'.
prediction.columns=['y']

In [29]:
# Write the predictions to CSV; the row index is emitted as a column labelled "Id".
prediction.to_csv("logit_model_prediction.csv", index_label="Id")

In [12]:
import pandas as pd
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import SGD


def create_model(dropout=0.5, lr=0.1, input_dim=20):
    """
    Builds and compiles a small feed-forward binary classifier.

    Architecture: Dense(20, relu) -> Dense(10, relu) -> Dropout ->
    Dense(10, relu) -> Dense(1, sigmoid), all with Glorot-uniform
    initialisation.

    :param dropout: dropout rate applied after the second dense layer
    :param lr: learning rate of the SGD optimizer
    :param input_dim: number of input features; defaults to 20, the value
        that was previously hard-coded
    :return: a compiled Sequential model (binary cross-entropy loss,
        accuracy metric)
    """
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=input_dim, init='glorot_uniform', activation='relu'))
    model.add(Dense(10, init='glorot_uniform', activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(10, init='glorot_uniform', activation='relu'))
    # Single sigmoid unit: the output is a probability for the binary target.
    model.add(Dense(1, init='glorot_uniform', activation='sigmoid'))
    # SGD with Nesterov momentum and a small per-update learning-rate decay.
    sgd = SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model

In [13]:
# Instantiate the network with the default dropout rate and learning rate.
model = create_model()

In [23]:
from sklearn.ensemble import ExtraTreesClassifier
def get_classifiers():
    """
    Creates the level 1 learners.

    :return: a dict mapping a short name to an unfitted level 1 learner
    """
    # min_samples_split=1 is rejected by scikit-learn >= 0.18 (an integer value
    # must be at least 2). Older releases effectively treated 1 as 2, because a
    # node needs two samples to be split, so 2 keeps the same trees everywhere.
    etc = ExtraTreesClassifier(n_jobs=6, n_estimators=500, max_features='log2', min_samples_split=2,
                         max_depth=None, criterion='entropy')

    return {'etc':etc} 

In [25]:
# Build the level 1 learners and print the name (dict key) of each one.
a = get_classifiers()
for k,v in a.items():
    print(k)

etc


In [20]:
# Stand-alone ExtraTrees instance for interactive inspection.
# min_samples_split=1 is rejected by scikit-learn >= 0.18 (an integer value
# must be at least 2); older releases effectively treated 1 as 2, so using 2
# keeps the same behaviour while remaining valid on newer versions.
etc = ExtraTreesClassifier(n_jobs=6, n_estimators=500, max_features='log2', min_samples_split=2,
                         max_depth=None, criterion='entropy')

In [21]:
# The cell was saved as the bare tab-completion stub `etc._`, which raises an
# AttributeError when re-run. The recorded output 'classifier' matches the
# estimator-type tag, so the attribute is spelled out here.
etc._estimator_type

'classifier'

In [29]:
# Column-wise mean of the pairwise correlation matrix: the average correlation
# of each feature with all features (including its self-correlation of 1).
X.corr().mean()

x2             0.053643
x6             0.056862
x7             0.048127
x12            0.048631
x20            0.048343
x23            0.048493
x27            0.049172
x28            0.048259
x32            0.049799
x37            0.063701
x38            0.053389
x40            0.050131
x41            0.052818
x42            0.051128
x46            0.053175
x48            0.049615
x49            0.049944
x24_america    0.013189
x24_asia      -0.028924
x24_euorpe     0.025066
dtype: float64

In [32]:
# Preview the candidate values produced by the expression used for 'logit__C'.
np.arange(.01, 10, .1)

array([ 0.01,  0.11,  0.21,  0.31,  0.41,  0.51,  0.61,  0.71,  0.81,
        0.91,  1.01,  1.11,  1.21,  1.31,  1.41,  1.51,  1.61,  1.71,
        1.81,  1.91,  2.01,  2.11,  2.21,  2.31,  2.41,  2.51,  2.61,
        2.71,  2.81,  2.91,  3.01,  3.11,  3.21,  3.31,  3.41,  3.51,
        3.61,  3.71,  3.81,  3.91,  4.01,  4.11,  4.21,  4.31,  4.41,
        4.51,  4.61,  4.71,  4.81,  4.91,  5.01,  5.11,  5.21,  5.31,
        5.41,  5.51,  5.61,  5.71,  5.81,  5.91,  6.01,  6.11,  6.21,
        6.31,  6.41,  6.51,  6.61,  6.71,  6.81,  6.91,  7.01,  7.11,
        7.21,  7.31,  7.41,  7.51,  7.61,  7.71,  7.81,  7.91,  8.01,
        8.11,  8.21,  8.31,  8.41,  8.51,  8.61,  8.71,  8.81,  8.91,
        9.01,  9.11,  9.21,  9.31,  9.41,  9.51,  9.61,  9.71,  9.81,  9.91])

In [34]:
# Row-wise mean across all feature columns (one value per sample).
X.mean(axis=1)

0           4.377173
1          -6.318174
2         -10.624184
3          -3.513195
4          -5.388608
5         124.307649
6         -28.434299
7         -40.065201
8          -2.987872
9           3.673339
10         35.060630
11         77.375120
12         48.842999
13        -19.299708
14        -25.045527
15         52.740109
16         16.529686
17        -55.987271
18         37.004561
19        108.195016
20        -22.993373
21         23.399027
22         34.538817
23        -66.699911
24         48.495650
25         84.033927
26        -23.096254
27         23.344056
28         -2.047649
29         74.197011
             ...    
119970    -77.218140
119971     41.881246
119972     -5.462008
119973     24.259480
119974     50.444370
119975      4.480041
119976    -46.374122
119977    -22.456322
119978    -36.872980
119979      9.049671
119980     14.112884
119981     -7.958670
119982    111.147704
119983    -83.271039
119984    -92.754801
119985     42.909919
119986      7