In [None]:
from itertools import combinations

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

SEED = 42

# loading data

test  = pd.read_csv("./test.csv")
train = pd.read_csv("./train.csv")

# Every column of the training file except the target is used as a feature.
features = train.columns[(train.columns != 'ACTION')]

# Take independent copies of the feature frames: the feature-engineering step
# below adds columns to X and X_test in place, and writing into a selection of
# `train` / `test` triggers pandas' SettingWithCopyWarning.
X, y, X_test = train[features].copy(), train.ACTION, test[features].copy()

print("Loaded data: ", train.shape)
print("Features: ", features)


# feature adding - combinations

def add_combinations(X, degrees):
    """Append one hashed interaction feature per combination of columns.

    For every combination of `degrees` column positions of `X` (taken over
    the columns present when the function is called), a column named
    ``comb_<i>_<j>_...`` is added. Its value for a row is
    ``hash(tuple(values of the selected columns in that row))``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to extend. It is modified in place.
    degrees : int
        Number of columns merged into each new feature.

    Returns
    -------
    pandas.DataFrame
        The same `X` object, with the new columns appended.

    Notes
    -----
    Python salts ``hash()`` of ``str`` objects per process, so the generated
    values are only reproducible across runs for numeric columns.
    """
    # Freeze the column count before the loop starts appending new columns.
    n_columns = X.columns.size
    for features_i in combinations(range(n_columns), degrees):
        # Positional selection through .iloc; the .ix indexer used previously
        # is deprecated and no longer exists in current pandas releases.
        selected_rows = X.iloc[:, list(features_i)].values
        feature_name = "comb_" + '_'.join(str(position) for position in features_i)
        X[feature_name] = [hash(tuple(row)) for row in selected_rows]
    return X

# Build the degree-2 and degree-3 features from the ORIGINAL feature columns
# only. add_combinations() combines every column of the frame it receives, so
# running the degree-3 pass on a frame that already holds the pair features
# would also build triples out of those derived columns.
# NOTE(review): assumes triples of derived columns were not intended - confirm.
base_columns = list(features)
for frame in (X, X_test):
    for degree in (2, 3):
        expanded = add_combinations(frame[base_columns].copy(), degree)
        # Columns after the base ones are the newly generated combinations.
        for name in expanded.columns[len(base_columns):]:
            frame[name] = expanded[name].values

print("Added combinations: ", X.shape)
print("Features: ", X.columns)


# feature transforming - one-hot encoding

enc = OneHotEncoder()

# Re-code every column as dense 0..k-1 integers before one-hot encoding.
# The combination features are raw hash() values, which may be negative or
# extremely large, while the legacy OneHotEncoder expects small non-negative
# integers. Factorizing train and test together keeps the codes consistent
# between the two sets.
# NOTE(review): assumes there are no missing values (factorize codes NaN as -1).
n_train = X.shape[0]
stacked = pd.concat([X, X_test], ignore_index=True)
codes = np.column_stack([pd.factorize(stacked[name])[0] for name in stacked.columns])

enc.fit(codes)

X = enc.transform(codes[:n_train])
X_test = enc.transform(codes[n_train:])

print("One HotEncoding: ", X.shape)


# model evaluation
    



# feature selection




# hyperparameter optimization

# Candidate regularisation strengths: 15 values spaced evenly on a log2 scale
# between 2**-4 and 2**4.
params = { "C": np.logspace(-4, 4, 15, base=2) }

# The estimator must exist before it can be tuned (it was previously only
# created further down, in the prediction step).
model = LogisticRegression(random_state=SEED)

# GridSearchCV takes `param_grid`; `param_distributions` is the keyword of
# RandomizedSearchCV and is rejected here.
p = GridSearchCV(model, param_grid=params, scoring='roc_auc', n_jobs=-1).fit(X, y)

print("Found best params: ", p.best_params_)

# predicting

# best_params_ is a plain dict, so the tuned value is read by key.
model = LogisticRegression(C=p.best_params_["C"], random_state=SEED)
model = model.fit(X, y)

# Column 1 of predict_proba holds the probability of the positive class.
predictions = model.predict_proba(X_test)[:,1]

# Pin the column order so the file layout does not depend on how the
# installed pandas/Python version orders dict keys.
export = pd.DataFrame({ "id": test.id, "ACTION": predictions }, columns=["id", "ACTION"])
export.to_csv("./predictions.csv", index=False)

print("Done!")

# evaluation

# No `cv` object is defined anywhere in this notebook, so the library's
# default fold strategy is used.
scores = cross_val_score(model, X, y, scoring='roc_auc', n_jobs=-1)

# Report the mean AUC with a two-standard-deviation band.
print("Final score: %f +- %f" % (scores.mean(), scores.std()*2))


In [2]:
import numpy as np

# Show the 15 candidate values: powers of two from 2**-4 up to 2**4.
print(np.logspace(start=-4, stop=4, num=15, base=2))





[  0.0625       0.09287464   0.13801119   0.20508384   0.30475341
   0.45286183   0.6729501    1.           1.48599429   2.20817903
   3.28134142   4.87605462   7.24578931  10.76720154  16.        ]


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print a short summary of the `n_top` highest-ranked search results.

    Entries are ranked in descending order of their element at index 1
    (presumably the mean validation score - verify against the search
    object's score tuples). For each reported entry the rank, the mean
    validation score, the standard deviation of the per-fold scores and the
    parameter setting are printed, followed by an empty line.

    Parameters
    ----------
    grid_scores : iterable
        Score records exposing `mean_validation_score`,
        `cv_validation_scores` and `parameters`, and supporting indexing.
    n_top : int, optional
        How many of the best entries to print (default 3).
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        fold_spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              entry.mean_validation_score,
              fold_spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")

# Base estimator to tune; seeded so repeated runs give the same models.
clf = GradientBoostingClassifier(max_depth=None, max_features=3, min_samples_leaf=7,
                                 min_samples_split=2, random_state=SEED)

# candidate values; every combination of the two lists is evaluated
param_dist = {
    "learning_rate": [.03, .05, .1, .2, 1],
    "n_estimators": [20, 40, 80, 150, 300]
}

# Only meaningful for RandomizedSearchCV; the exhaustive search below ignores it.
n_iter_search = 20

# run the exhaustive grid search
# GridSearchCV takes `param_grid`; `param_distributions` (and `n_iter`) belong
# to RandomizedSearchCV.
random_search = GridSearchCV(
    clf,
    param_grid=param_dist,
    scoring='roc_auc',
    n_jobs=-1
)

start = time()
random_search.fit(train[features], train.ACTION)
elapsed = time() - start

# Report the number of settings that were actually evaluated rather than the
# unused n_iter_search value.
print("GridSearchCV took %.2f seconds for %d candidate"
      " parameter settings." % (elapsed, len(random_search.grid_scores_)))
report(random_search.grid_scores_)
