In [8]:
import csv
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [9]:
def load_data(filename):
    return np.loadtxt(open(filename), delimiter=',', skiprows=1)

In [10]:
# Load training data
data = load_data('train_2008.csv')
X = data[:, 1:-1]
y = data[:, -1]

In [11]:
N = len(y) #64667
D = len(X[0]) #381

# Split to train / test (validation) data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [12]:
X_sample, y_sample = X_train[:1000], y_train[:1000]

In [13]:
# Load real test data
test_data = load_data('test_2008.csv')
X_test_real = test_data[:, 1:]

In [37]:
#Gradient Boosting 
gbr = GradientBoostingRegressor(max_depth = 4, n_estimators = 200)
gbr.fit(X_train, y_train)
y_score_gbr = gbr.predict(X_test)

In [72]:
roc_auc_score(y_test, y_score_gbr)

0.7830223461753073

In [18]:
# Logistic Regression
lgr = LogisticRegression(C = 0.1, penalty = 'l1')
lgr.fit(X_train, y_train)
pred_prob = lgr.predict_proba(X_test)
y_scores_lgr = pred_prob[:,1]



In [19]:
# Random Forest Regression
regr = RandomForestRegressor(n_estimators = 100, max_features = 26)
regr.fit(X_train, y_train)
y_score_regr = regr.predict(X_test)

In [21]:
def scale_scores(y_score):
    max_score, min_score = max(y_score), min(y_score)
    min_capped = max_score - (max_score - y_score) * (max_score) / (max_score - min_score)
    return min_capped / max_score

In [75]:
# average of the three
y_test_scores = 0.70*y_score_gbr + 0.05*y_scores_lgr + 0.25*y_score_regr
y_test_scores = scale_scores(y_test_scores)
roc_auc_score(y_test, y_test_scores)

0.7856800300775839

gbr = GradientBoostingRegressor()
gbr.fit(X, y)

test_data = load_data('test_2008.csv')
X_test_real = test_data[:, 1:]

y_test_scores = gbr.predict(X_test_real)

In [20]:
y_test_scores = y_test_scores.clip(min=0)

In [70]:
with open ('sub2.csv', 'w') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow(['id', 'target'])
    for i in range(0, 16000):
        writer.writerow([i, y_test_scores[i]])

In [10]:
# Gradient Boosting Regressor with Grid Search
gbr = GradientBoostingRegressor()

param_grid={'n_estimators':[100,200,500], 
            'learning_rate': [0.1,0.05,0.02],
            'max_depth':[3,4,5,6], 
            'min_samples_leaf':[1,2,3], 
            'max_features':[None,1.0] } 

gbr = GridSearchCV(estimator=gbr, cv=5, param_grid=param_grid, 
    n_jobs=4)

gbr.fit(X, y)
y_score_gbr = gbr.predict(X_test)

print(roc_auc_score(y_test, y_score_gbr))

print(gbr.best_params_)

0.7396674380269395
{'learning_rate': 0.02, 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 3, 'n_estimators': 200}


In [14]:
gbr = GradientBoostingRegressor(max_depth = 5)
gbr.fit(X_train, y_train)
y_score_gbr = gbr.predict(X_test)
print(roc_auc_score(y_test, y_score_gbr))

0.7685283576024072
