In [1]:
import csv
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
from scipy import linalg
import statsmodels.regression.linear_model as lm
from sklearn import datasets
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC, RandomizedLasso, lasso_stability_path, ElasticNet, Lasso, Ridge
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc, precision_recall_curve, mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.utils.extmath import pinvh
from sklearn.exceptions import ConvergenceWarning
import warnings
from zipfile import ZipFile
from datetime import datetime
import time

# Notebook-wide state shared by the helper functions below.
# gen_grid() rebinds supports / results / alphas / best_thresholds on each call.
plt.interactive(True)

clf_names = ['OLS','LASSO', 'Ridge','ElasticNet']
# Stability-score cut-offs tried for feature selection: 1e-3, 1e-2, 1e-1.
thresholds = np.logspace(-3, -1, num=3)

# One row per threshold, one column per model.
results = pd.DataFrame(columns=clf_names, index=thresholds)
# Same layout for the selected alpha of every model except the first (OLS).
alphas = pd.DataFrame(columns=clf_names[1:], index=thresholds)

supports = {}
best_thresholds = {}


In [2]:
def filter_data(background, train_path="train.csv", n_train=2121):
	"""Split the background table into training features and load the labels.

	Parameters
	----------
	background : pandas.DataFrame
		Background table. Its 'challengeID' and 'idnum' columns are dropped
		before the features are returned.
	train_path : str, optional
		CSV file holding the training outcomes (default "train.csv").
	n_train : int, optional
		Number of leading rows of ``background`` kept as training rows
		(default 2121).

	Returns
	-------
	(X_train, Y_train) : tuple of pandas.DataFrame
		The first ``n_train`` feature rows and the frame read from
		``train_path``.
	"""
	X_all = background.drop(['challengeID', 'idnum'], axis=1)
	# Positional slice: relies on the caller having ordered the rows.
	X_train = X_all.iloc[:n_train,:]

	Y_train = pd.read_csv(train_path, low_memory=False)
	return X_train, Y_train

def get_data_for_characteristic(X_train, Y_train, characteristic):
	"""Keep only the training rows whose `characteristic` outcome is observed.

	Rows are paired by position, not by index label: the same row positions
	are taken from both X_train and the outcome column.

	Returns the filtered feature rows and the matching outcome Series.
	"""
	outcome = Y_train[characteristic]  # a Series

	# Boolean mask, True where the outcome is not NA.
	observed = outcome.notnull().values

	observed_outcome = outcome.iloc[observed]
	observed_features = X_train.iloc[observed]
	return observed_features, observed_outcome

def feature_selection(X, y):
	"""Fit a RandomizedLasso on the training data and return it.

	Parameters
	----------
	X : pandas.DataFrame
		Feature rows.
	y : pandas.Series
		Outcome values, one per row of X.

	Returns
	-------
	The fitted RandomizedLasso estimator (callers read its ``scores_``).
	"""
	print("Feature Selection")
	# .values yields the same NumPy array as the deprecated .as_matrix().
	X = X.values
	y = y.values
	with warnings.catch_warnings():
		# Silence the warnings raised during the resampled fits.
		warnings.simplefilter('ignore', UserWarning)
		warnings.simplefilter('ignore', ConvergenceWarning)
		lasso = RandomizedLasso(alpha='aic', random_state=39, n_resampling=500)
		lasso.fit(X,y)

	return lasso

def plot_stability_path(X=None, y=None, relevant=None):
	"""Plot the Lasso stability path of every feature.

	The original body read `X`, `y` and `coef` without defining them, so it
	could not run; they are now explicit arguments.

	Parameters
	----------
	X, y : array-like
		Training features and outcome, passed to lasso_stability_path.
	relevant : array-like of bool, optional
		One flag per feature. Flagged features are drawn in red, the others
		in black. When omitted every path is drawn in black and no legend is
		added.

	Raises
	------
	ValueError
		If X or y is not supplied.
	"""
	if X is None or y is None:
		raise ValueError("plot_stability_path requires the training data X and y")

	plt.figure()
	print("\tLasso Stability Path")
	alpha_grid, scores_path = lasso_stability_path(X, y, random_state=43, eps=0.05)

	# We plot the path as a function of alpha/alpha_max to the power 1/3: the
	# power 1/3 scales the path less brutally than the log, and enables to
	# see the progression along the path
	x_axis = alpha_grid[1:] ** .333

	if relevant is None:
		plt.plot(x_axis, scores_path.T[1:], 'k')
	else:
		mask = np.asarray(relevant, dtype=bool)
		hg = plt.plot(x_axis, scores_path[mask].T[1:], 'r')
		hb = plt.plot(x_axis, scores_path[~mask].T[1:], 'k')
		# Either group may be empty, in which case plot() returns no handle.
		legend_entries = [(lines[0], label)
		                  for lines, label in ((hg, 'relevant features'), (hb, 'irrelevant features'))
		                  if lines]
		if legend_entries:
			handles, labels = zip(*legend_entries)
			plt.legend(handles, labels, loc='best')

	plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
	plt.ylabel('Stability score: proportion of times selected')
	plt.title('Stability Scores Path')
	plt.axis('tight')
	plt.show()

def gen_grid(X,y,background):
	"""Score every model at every stability-score threshold.

	Runs feature_selection(X, y), then for each value in the module-level
	`thresholds` keeps the features whose stability score exceeds it and
	records:
	  * OLS: the fitted model's ``mse_resid``;
	  * LASSO / Ridge / ElasticNet: the best grid-search score over alpha
	    (scoring='neg_mean_squared_error') and the alpha that achieved it.

	Side effects: rebinds the module-level `supports`, `results`, `alphas`
	and `best_thresholds`, which generate_all_predictions() reads afterwards.

	Parameters
	----------
	X : pandas.DataFrame
		Training features.
	y : pandas.Series
		Training outcome.
	background : unused
		Kept so existing callers keep working.

	Returns
	-------
	(results, alphas) : tuple of pandas.DataFrame
		Absolute scores and chosen alphas, indexed by threshold.
	"""
	global supports, results, alphas, best_thresholds
	supports = {}
	results = pd.DataFrame(index=thresholds, columns=clf_names)
	alphas = pd.DataFrame(index=thresholds, columns=clf_names[1:])
	best_thresholds = {}

	randomized_lasso = feature_selection(X,y)
	stability_scores = randomized_lasso.scores_

	# Identical for every threshold, so built once.
	param_grid = dict(alpha=np.logspace(-6,0,7))
	# NOTE(review): StratifiedKFold targets class labels; for a continuous
	# outcome KFold (already imported) may be the intended splitter. Confirm
	# before changing, since it alters every cross-validated score.
	cv = StratifiedKFold(n_splits=5, random_state=42)
	penalized_models = (('LASSO', Lasso), ('Ridge', Ridge), ('ElasticNet', ElasticNet))

	for threshold in thresholds:
		support = np.where(stability_scores > threshold)[0]
		print("%s \t%s" % (threshold, support))
		supports[threshold] = support
		if len(support) == 0:
			# Nothing to fit on; this threshold's row stays NaN.
			print("\tno features above threshold, skipping")
			continue
		Xf = X.iloc[:,support]

		with warnings.catch_warnings():
			warnings.simplefilter('ignore', UserWarning)
			warnings.simplefilter('ignore', ConvergenceWarning)

			# NOTE(review): the OLS score is the fit's own residual MSE while
			# the other models are cross-validated, so columns of `results`
			# are presumably not directly comparable. Confirm.
			print("\tOLS")
			ols_fit = lm.OLS(y, Xf).fit()
			results.loc[threshold,'OLS'] = ols_fit.mse_resid

			for model_name, model_cls in penalized_models:
				print("\t" + model_name)
				grid = GridSearchCV(model_cls(), param_grid=param_grid, cv=cv, scoring='neg_mean_squared_error')
				grid.fit(Xf,y)
				results.loc[threshold,model_name] = grid.best_score_
				alphas.loc[threshold,model_name] = grid.best_params_['alpha']
	# Grid-search scores are negated MSEs; store magnitudes so smaller is better.
	results = results.abs()

	for name in clf_names:
		best_thresholds[name] = results[name].idxmin()
	return results, alphas

def make_threshold_plot():
	"""Plot each model's error against the stability-score threshold.

	Reads the module-level `thresholds` and `results`; draws one line per
	model on a log-scaled x axis and shows the figure.
	"""
	plt.figure()

	plt.xscale('log')

	# (column of `results`, colour code). The Ridge curve used fmt 'o', which
	# is a marker code, so it was drawn as unconnected dots instead of a line.
	curves = (('OLS', 'r'), ('LASSO', 'b'), ('Ridge', 'm'), ('ElasticNet', 'g'))
	handles = [plt.plot(thresholds, results[label], colour)[0] for label, colour in curves]

	plt.xlabel('Stability score: proportion of times selected')
	plt.ylabel('Mean squared error on training data')
	plt.title('Mean squared error as a function of RandomizedLasso threshold')
	plt.axis('tight')
	plt.legend(handles, [label for label, _colour in curves], loc='best')
	plt.show()

def generate_all_predictions(X,y,background,characteristic): #Generates predictions from the 4 classifiers for a characteristic
	"""Refit each model at its best threshold and predict for all rows.

	Relies on the module-level `supports`, `alphas` and `best_thresholds`
	left behind by the preceding gen_grid() call.

	Parameters
	----------
	X : pandas.DataFrame
		Training features.
	y : pandas.Series
		Training outcome.
	background : pandas.DataFrame
		Table to predict for; 'challengeID' and 'idnum' are dropped first.
	characteristic : unused
		Kept so existing callers keep working.

	Returns
	-------
	dict
		Maps each name in `clf_names` to that model's predictions.
	"""
	def selected(frame, name):
		# Columns that passed the model's best stability threshold.
		return frame.iloc[:,supports[best_thresholds[name]]]

	def best_alpha(name):
		return alphas.loc[best_thresholds[name],name]

	# Order must match clf_names, which the zip below pairs these with.
	clfs = [
		lm.OLS(y, selected(X,'OLS')).fit(),
		Lasso(alpha=best_alpha('LASSO')).fit(selected(X,'LASSO'),y),
		Ridge(alpha=best_alpha('Ridge')).fit(selected(X,'Ridge'),y),
		ElasticNet(alpha=best_alpha('ElasticNet')).fit(selected(X,'ElasticNet'),y)
	]

	# Drop the id columns once instead of once per model.
	all_features = background.drop(['challengeID','idnum'],axis=1)

	predictions = {}
	for (clf,name) in zip(clfs,clf_names):
		predictions[name] = clf.predict(selected(all_features,name))
	return predictions


def gen_submission(pred, name=""):
	"""Write `pred` to CSV and zip it with the narrative and the code.

	Parameters
	----------
	pred : pandas.DataFrame
		Predictions; written without the index to "prediction<name>.csv".
	name : str, optional
		Suffix added to the CSV file name and to the archive name.

	The archive is named "Submission<name><timestamp>.zip" and also includes
	'narrative.txt' and 'ffc.py' from the working directory.
	"""
	pred_filename = "prediction" + name + ".csv"
	pred.to_csv(pred_filename, index=False)
	with ZipFile( str('Submission' + name + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '.zip'), 'w') as myzip:
		# Archive the file just written (previously the literal
		# 'prediction.csv' was zipped, which is a different file whenever
		# `name` is non-empty), stored under the fixed name 'prediction.csv'.
		myzip.write(pred_filename, arcname='prediction.csv')
		myzip.write('narrative.txt')
		myzip.write('ffc.py')

In [3]:
# Background features ordered by challengeID, with index label = challengeID - 1.
background = pd.read_csv("output.csv", low_memory=False).sort_values(by='challengeID')
background.index = background['challengeID'] - 1

# Previously generated predictions, used as the template for each model's output.
prediction = pd.read_csv("prediction_old.csv", low_memory=False)
X_all,y_all = filter_data(background)

# One independent deep copy of the template per model.
ols_prediction, lasso_prediction, ridge_prediction, elastic_prediction = (
	prediction.copy(deep=True) for _unused in range(4)
)


In [4]:
# Pairs each model name with the frame that collects its predictions.
model_frames = (
	('OLS', ols_prediction),
	('LASSO', lasso_prediction),
	('Ridge', ridge_prediction),
	('ElasticNet', elastic_prediction),
)

for characteristic in ['grit', 'gpa', 'materialHardship']:
	print(characteristic)
	X,y = get_data_for_characteristic(X_all, y_all, characteristic)

	# Grid search over thresholds; scores and alphas are also saved to disk.
	results,alphas = gen_grid(X,y,background)
	results.to_csv("scores_" + characteristic + ".csv", index=False)
	alphas.to_csv("alphas_" + characteristic + ".csv", index=False)

	print("Generating for " + characteristic)
	predictions = generate_all_predictions(X,y,background,characteristic)

	# Overwrite this characteristic's column in every model's frame.
	for model_name, frame in model_frames:
		frame[characteristic] = predictions[model_name]

grit
Feature Selection
0.001 	[  635  1356  2002  2846  3023  3057  3771  4052  4055  4080  4125  4128
  4804  4809  4864  5872  6050  6084  6087  6090  6091  6092  6096  6352
  6356  6365  6368  6371  6374  7210  9829  9921 10578 10593 10600 11014
 11596]
	OLS
	LASSO




	Ridge
	ElasticNet
0.01 	[  635  1356  3023  3771  4809  6090  6091  6092  6365  6368  6374  9829
 10578 10593 11014]
	OLS
	LASSO
	Ridge
	ElasticNet
0.1 	[3771 6090 6091 6365 9829]
	OLS
	LASSO
	Ridge
	ElasticNet
Generating for grit
gpa
Feature Selection
0.001 	[  272   423   427   431   616   678   973  1009  1040  1517  1753  1965
  2002  2005  2245  2280  2423  3108  3341  3878  3879  3884  4025  4210
  4383  5388  5582  5826  6088  6143  6237  6520  6635  6737  6838  6906
  7033  7231  7242  7247  7248  7259  7295  7297  7298  7299  7300  7301
  7306  7315  7320  7322  7707  7750  7757  7890  7901  8022  8090  8350
  8351  8813  8861  9013  9143  9169  9527  9955  9978 10020 10025 10055
 10065 10572 10594 10602 10822 10860 10995 11453 11474 11626 11731 11909
 11926]
	OLS
	LASSO
	Ridge
	ElasticNet
0.01 	[ 3108  3884  4210  6088  7033  7295  7297  7298  7299  7300  7306  7757
  7890  8351  9013  9978 10025 10594 10995 11453 11474 11626]
	OLS
	LASSO
	Ridge
	ElasticNet
0.1 	[ 6088  7033



	Ridge
	ElasticNet
0.01 	[ 3783  4244  4452  4453  4456  4467  5720  6026  7655  8082  9189 11087
 11347 11353 11946 11953]
	OLS
	LASSO
	Ridge
	ElasticNet
0.1 	[ 6026 11353 11946]
	OLS
	LASSO
	Ridge
	ElasticNet
Generating for materialHardship


In [5]:
# Display the OLS predictions as this cell's output.
ols_prediction

Unnamed: 0,challengeID,gpa,grit,materialHardship,eviction,layoff,jobTraining
0,1,2.905458,3.337634,0.082938,0.05963,0.209084,0.234771
1,2,2.837803,3.395756,0.098797,0.05963,0.209084,0.234771
2,3,3.088654,3.478301,0.092394,0.05963,0.209084,0.234771
3,4,3.181070,3.425541,0.085012,0.05963,0.209084,0.234771
4,5,3.080537,3.621434,0.089748,0.05963,0.209084,0.234771
5,6,2.567606,3.404209,0.095668,0.05963,0.209084,0.234771
6,7,3.176115,2.920297,0.030595,0.05963,0.209084,0.234771
7,8,2.845187,3.340898,0.082938,0.05963,0.209084,0.234771
8,9,3.367925,3.262693,0.129072,0.05963,0.209084,0.234771
9,10,2.897550,3.454615,0.119151,0.05963,0.209084,0.234771


In [6]:
# Display the LASSO predictions as this cell's output.
lasso_prediction

Unnamed: 0,challengeID,gpa,grit,materialHardship,eviction,layoff,jobTraining
0,1,2.931330,3.437998,0.078189,0.05963,0.209084,0.234771
1,2,2.788270,3.454303,0.090237,0.05963,0.209084,0.234771
2,3,3.063551,3.502448,0.087671,0.05963,0.209084,0.234771
3,4,3.207244,3.454396,0.080511,0.05963,0.209084,0.234771
4,5,3.124901,3.629637,0.085115,0.05963,0.209084,0.234771
5,6,2.589925,3.438365,0.090098,0.05963,0.209084,0.234771
6,7,3.251962,3.187742,0.053992,0.05963,0.209084,0.234771
7,8,2.849554,3.332593,0.078189,0.05963,0.209084,0.234771
8,9,3.360929,3.301985,0.126917,0.05963,0.209084,0.234771
9,10,2.886445,3.371426,0.114442,0.05963,0.209084,0.234771


In [7]:
# Display the Ridge predictions as this cell's output.
ridge_prediction

Unnamed: 0,challengeID,gpa,grit,materialHardship,eviction,layoff,jobTraining
0,1,2.937922,3.437674,0.085765,0.05963,0.209084,0.234771
1,2,2.758680,3.453040,0.087221,0.05963,0.209084,0.234771
2,3,3.087416,3.501520,0.091056,0.05963,0.209084,0.234771
3,4,3.213407,3.452337,0.083854,0.05963,0.209084,0.234771
4,5,3.072431,3.628651,0.089124,0.05963,0.209084,0.234771
5,6,2.569210,3.438415,0.099012,0.05963,0.209084,0.234771
6,7,3.261375,3.193448,0.061397,0.05963,0.209084,0.234771
7,8,2.863874,3.336842,0.085765,0.05963,0.209084,0.234771
8,9,3.378164,3.299039,0.088420,0.05963,0.209084,0.234771
9,10,2.919115,3.372859,0.083808,0.05963,0.209084,0.234771


In [8]:
# Display the ElasticNet predictions as this cell's output.
elastic_prediction

Unnamed: 0,challengeID,gpa,grit,materialHardship,eviction,layoff,jobTraining
0,1,2.933247,3.438139,0.077919,0.05963,0.209084,0.234771
1,2,2.770433,3.454408,0.090067,0.05963,0.209084,0.234771
2,3,3.074555,3.501966,0.087401,0.05963,0.209084,0.234771
3,4,3.213679,3.453842,0.080239,0.05963,0.209084,0.234771
4,5,3.106506,3.629453,0.084848,0.05963,0.209084,0.234771
5,6,2.582054,3.438219,0.089811,0.05963,0.209084,0.234771
6,7,3.258685,3.193884,0.053691,0.05963,0.209084,0.234771
7,8,2.852581,3.334199,0.077919,0.05963,0.209084,0.234771
8,9,3.372936,3.301325,0.126675,0.05963,0.209084,0.234771
9,10,2.900510,3.372363,0.114170,0.05963,0.209084,0.234771


In [19]:
def gen_submission(pred, name=""):
	"""Save `pred` as CSV and bundle it into a timestamped submission zip.

	The CSV is written (without the index) to "prediction<name>.csv" and
	stored in the archive as 'prediction.csv', followed by 'narrative.txt'
	and 'ffc.py' from the working directory. The archive is named
	"Submission<name><YYYY-mm-dd HH:MM:SS>.zip".
	"""
	csv_path = "".join(["prediction", name, ".csv"])
	pred.to_csv(csv_path, index=False)

	stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
	archive_path = str('Submission' + name + stamp + '.zip')
	with ZipFile(archive_path, 'w') as archive:
		archive.write(csv_path, arcname='prediction.csv')
		for companion in ('narrative.txt', 'ffc.py'):
			archive.write(companion)

In [20]:
# Package the LASSO predictions as a submission archive.
gen_submission(lasso_prediction, name="_lasso")

In [21]:
# Package the OLS predictions as a submission archive.
gen_submission(ols_prediction, name="_ols")