In [None]:
import numpy as np
import util
import pickle
import sklearn.linear_model
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled training data. fetch_data yields four values; presumably
# (column header, row ids, feature matrix, labels) -- confirm against util.
trainfile = 'data/train.csv'
header, ids, X, Y = util.fetch_data(trainfile)

# A single seed drives both NumPy's global RNG and the split below.
seed = 229
np.random.seed(seed)

# Hold out 20% of the rows for evaluation.
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Training

In [None]:
# Build a default-configured logistic regression and fit it in one step
# (fit() returns the estimator itself, so the chained form is equivalent).
# NOTE(review): this fits on the full X, Y rather than the X_train / y_train
# split created earlier -- confirm that is intended for the pickled model.
logreg_model = sklearn.linear_model.LogisticRegression().fit(X, Y)

In [None]:
# Persist the fitted model. The context manager guarantees the file handle is
# closed even if pickle.dump raises part-way through the write.
with open('models/logreg.pickle', 'wb') as logreg_out:
    pickle.dump(logreg_model, logreg_out)

# Evaluation

In [None]:
# Reload the persisted model. The file was written in binary mode ('wb'), so it
# must be read back in binary mode ('rb'); a text-mode handle makes pickle.load
# fail on Python 3. The context manager also closes the handle afterwards.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by
# this notebook, never untrusted pickles.
with open('models/logreg.pickle', 'rb') as logreg_in:
    logreg_model = pickle.load(logreg_in)

In [None]:
# Compute learning-curve data for the model using the Gini scorer from util.
# The three results are reused by the plotting cell below.
trainsizes, traingini, testgini = util.learning_curves(
    logreg_model,
    X_train,
    X_test,
    y_train,
    y_test,
    scorer=util.gini_proba_scorer,
)

# Show the train scores, a blank separator line, then the test scores.
for report_line in (traingini, '', testgini):
    print(report_line)

In [None]:
# Plot train vs. test Gini against training-set size on a fresh figure.
fig, ax = plt.subplots()
ax.plot(trainsizes, traingini, label='train gini')
ax.plot(trainsizes, testgini, label='test gini')
ax.set_xlabel('Training set size')
ax.set_ylabel('Normalized Gini coefficient')
ax.set_title('Logistic Regression')
ax.legend()

# The figure created above is still the current one, so this saves it as EPS.
plt.savefig('figures/learning_curves/logisticregression.eps', format='eps', dpi=1000)

In [None]:
# Render any open matplotlib figures (here, the learning-curve plot built above).
plt.show()

# Output

In [None]:
# Refit on the complete labelled set before producing the final predictions.
logreg_model.fit(X, Y)

# Score on the same data the model was just fit on, so this is an in-sample
# figure rather than a held-out estimate.
full_data_gini = util.gini_proba_scorer(logreg_model, X, Y)
print(full_data_gini)

In [None]:
def _second_class_proba(features):
    """Return column 1 of the model's predict_proba output for `features`."""
    return logreg_model.predict_proba(features)[:, 1]


# Generate predictions for the test CSV and write them to the predictions file,
# using the probability column selected above as the predicted value.
util.make_prediction(
    logreg_model,
    'data/test.csv',
    'predictions/logreg.csv',
    predict_method=_second_class_proba,
)