In [None]:
import pandas as pd
import numpy as np
import pickle 
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV, BayesianRidge, HuberRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, LinearSVR
from sklearn.preprocessing import RobustScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from datetime import datetime
import os

In [None]:
# Load the data sets: the target column comes from cleanedData.csv and the
# feature matrix from xgboost-x.csv.
df = pd.read_csv("./cleanedData.csv")
X = pd.read_csv("./xgboost-x.csv")

y = df["precio"]

In [None]:
# Attach the target to the features, discard rows with any missing value and
# keep a fixed-seed random sample of 500 rows.
aux = (
    X.join(y)
    .dropna()
    .sample(n=500, random_state=42)
)

In [None]:
# Separate the sampled frame back into target and features.
y_aux = aux['precio']
X_aux = aux.drop(columns=['precio'])

In [None]:
# Wrap the sampled target in a single-column DataFrame.
# NOTE(review): `y_test` is rebound by the train/test split in the next cell,
# so this value is not used afterwards in the cells shown.
y_test = pd.DataFrame(data=y_aux)

In [None]:
# Hold out `test_size` of the sampled rows; the fixed seed keeps the split
# identical between runs.
seed, test_size = 7, 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X_aux,
    y_aux,
    test_size=test_size,
    random_state=seed,
)
# fit model on training data

In [None]:
def add_more_features(self, df):
    """Add derived columns to ``df`` in place and return the same frame.

    New columns:
      * ``patio``     -- ``metrostotales`` minus ``metroscubiertos``.
      * ``ambientes`` -- ``habitaciones`` + ``banos`` + ``garages``.

    ``self`` is not used.
    """
    total_area = df['metrostotales']
    covered_area = df['metroscubiertos']
    df['patio'] = total_area - covered_area

    room_columns = df['habitaciones'] + df['banos'] + df['garages']
    df['ambientes'] = room_columns

    # Further candidate features, left disabled:
    #   df['prom_amb'] = df.metroscubiertos / df.ambientes
    #   df['construccion_density'] = df.metroscubiertos / df.metrostotales
    return df

In [None]:
def prepare_data(self, test_size=0.15, random_state=None):
    """Build the train/validation split and store the test feature matrix.

    Reads ``self.df_train`` and ``self.df_test``. Every column of
    ``self.df_train`` except ``'precio'`` and ``'id'`` is used as a feature;
    the same columns are selected from ``self.df_test`` and stored on
    ``self.x_test``. The target is ``log1p(precio)``.

    Parameters
    ----------
    test_size : float, default 0.15
        Fraction of the training rows held out for validation.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. The default keeps the
        previous unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    tuple
        ``((x_tr, y_tr), (x_val, y_val))``.
    """
    x_cols = [col for col in self.df_train.columns if col not in ('precio', 'id')]
    y_train = np.log1p(self.df_train['precio'])

    x_train = self.df_train.loc[:, x_cols]
    self.x_test = self.df_test.loc[:, x_cols]

    # Optional scaling step, left disabled:
    #   scaler = RobustScaler()
    #   x_train = scaler.fit_transform(x_train)
    #   self.x_test = scaler.transform(self.x_test)

    x_tr, x_val, y_tr, y_val = train_test_split(
        x_train,
        y_train,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )

    return (x_tr, y_tr), (x_val, y_val)

In [None]:
def save_prediction(self, y_test, model):
    """Write test-set predictions to ``predictions/submit-<model>.csv``.

    Parameters
    ----------
    y_test : array-like
        Predictions on the log1p scale; ``np.expm1`` is applied before
        writing.
    model : str
        Label used in the output file name.

    The CSV has two columns: ``id`` (taken from ``self.df_test['id']``) and
    ``target``.

    Raises
    ------
    OSError
        If the ``predictions`` directory cannot be created, for example
        because of permissions or because a regular file has that name.
    """
    final_pred = np.expm1(y_test)
    ids = self.df_test['id'].values

    # exist_ok tolerates a directory that is already there but still surfaces
    # real failures, which the former bare `except: pass` silently hid.
    os.makedirs('predictions', exist_ok=True)

    submit = pd.DataFrame({'id': ids, 'target': final_pred})
    submit.to_csv('predictions/submit-' + model + '.csv', index=False)

In [None]:
def timer(self, start_time=None):
    """Start a stopwatch or report the time elapsed since ``start_time``.

    Called without ``start_time`` (or with a falsy one) it returns the
    current ``datetime``. Called with a start time it prints the elapsed
    hours, minutes and seconds and returns ``None``. ``self`` is not used.
    """
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, remainder = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(remainder, 60)
    print('Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

# Lasso

In [None]:
def train_LassoCV(self, data):
    """Fit a LassoCV model, report its validation MAE, pickle it and predict the test set.

    Parameters
    ----------
    data : tuple
        ``((x_tr, y_tr), (x_val, y_val))``. The validation target and the
        validation predictions are passed through ``np.expm1`` before the
        MAE is computed, so the targets are expected on the log1p scale.

    Returns
    -------
    Predictions of the fitted model for ``self.x_test``, on the same scale
    the model was trained on (no ``expm1`` applied).

    Side effects: prints progress and metrics, and writes the fitted model to
    ``pickles/LassoCV.pkl``.
    """
    (x_tr, y_tr), (x_val, y_val) = data

    print('Start training LassoCV...')
    start_time = self.timer()

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; confirm the pinned version before upgrading.
    lasso_model = LassoCV(
        n_alphas=1000,
        cv=10,
        normalize=True,
    )
    lasso_model.fit(x_tr, y_tr)
    # This score is computed on the training split itself.
    print("The R2 is: {}".format(lasso_model.score(x_tr, y_tr)))
    print("The alpha choose by CV is:{}".format(lasso_model.alpha_))
    self.timer(start_time)

    print("Making prediction on validation data")
    # Undo the log1p transform so the error is reported in the original units.
    y_val = np.expm1(y_val)
    y_val_pred = np.expm1(lasso_model.predict(x_val))
    mae = mean_absolute_error(y_val, y_val_pred)
    print("El mean absolute error de es {}".format(mae))

    print('Saving model into a pickle')
    # exist_ok tolerates an existing directory but still surfaces real
    # failures, which the former bare `except: pass` silently hid.
    os.makedirs('pickles', exist_ok=True)

    with open('pickles/LassoCV.pkl', 'wb') as f:
        pickle.dump(lasso_model, f)

    print('Making prediction and saving into a csv')
    y_test = lasso_model.predict(self.x_test)

    return y_test

In [None]:
# NOTE(review): `self` and `data` are not defined in any cell shown here;
# they must be bound before this cell can run.
y_test = self.train_LassoCV(data)
self.save_prediction(y_test,'Lasso')

# Ridge

In [None]:
def train_rigdeCV(self, data):
    """Fit a RidgeCV model, report its validation MAE, pickle it and predict the test set.

    Parameters
    ----------
    data : tuple
        ``((x_tr, y_tr), (x_val, y_val))``. The validation target and the
        validation predictions are passed through ``np.expm1`` before the
        MAE is computed, so the targets are expected on the log1p scale.

    Returns
    -------
    Predictions of the fitted model for ``self.x_test``, on the same scale
    the model was trained on (no ``expm1`` applied).

    Side effects: prints progress and metrics, and writes the fitted model to
    ``pickles/RidgeCV.pkl``.
    """
    (x_tr, y_tr), (x_val, y_val) = data

    print('Start training Ridge...')
    start_time = self.timer()

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; confirm the pinned version before upgrading.
    # Only one candidate alpha is supplied, so the cross-validation has
    # nothing to choose between.
    ridge = RidgeCV(
        normalize=True,
        alphas=[0.0000999],
        cv=10,
    )
    ridge.fit(x_tr, y_tr)
    # This score is computed on the training split itself.
    print("The R2 is: {}".format(ridge.score(x_tr, y_tr)))
    print("The alpha choose by CV is:{}".format(ridge.alpha_))
    self.timer(start_time)

    print("Making prediction on validation data")
    # Undo the log1p transform so the error is reported in the original units.
    y_val = np.expm1(y_val)
    y_val_pred = np.expm1(ridge.predict(x_val))
    mae = mean_absolute_error(y_val, y_val_pred)
    print("El mean absolute error de es {}".format(mae))

    print('Saving model into a pickle')
    # exist_ok tolerates an existing directory but still surfaces real
    # failures, which the former bare `except: pass` silently hid.
    os.makedirs('pickles', exist_ok=True)

    with open('pickles/RidgeCV.pkl', 'wb') as f:
        pickle.dump(ridge, f)

    print('Making prediction and saving into a csv')
    y_test = ridge.predict(self.x_test)

    return y_test


In [None]:
# NOTE(review): `self` and `data` are not defined in any cell shown here;
# they must be bound before this cell can run.
y_test = self.train_rigdeCV(data)
self.save_prediction(y_test,'Ridge')

# Elastic Net

In [None]:
	def train_elasticNetCV(self, data):
		"""Fit an ElasticNetCV model, report its validation MAE, pickle it and predict the test set.

		Parameters
		----------
		data : tuple
			``((x_tr, y_tr), (x_val, y_val))``. The validation target and the
			validation predictions are passed through ``np.expm1`` before the
			MAE is computed, so the targets are expected on the log1p scale.

		Returns
		-------
		Predictions of the fitted model for ``self.x_test``, on the same scale
		the model was trained on (no ``expm1`` applied).

		Side effects: prints progress and metrics, and writes the fitted model
		to ``pickles/enetCV.pkl``.
		"""
		(x_tr, y_tr), (x_val, y_val) = data

		print('Start training ElasticNetCV...')
		start_time = self.timer()

		# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
		# removed in 1.2; confirm the pinned version before upgrading.
		enet = ElasticNetCV(
			normalize=True,
			n_alphas=2000,
			max_iter=2000,
			cv=10,
		)
		enet.fit(x_tr, y_tr)
		# This score is computed on the training split itself.
		print("The R2 is: {}".format(enet.score(x_tr, y_tr)))
		print("The alpha choose by CV is:{}".format(enet.alpha_))
		self.timer(start_time)

		print("Making prediction on validation data")
		# Undo the log1p transform so the error is reported in the original units.
		y_val = np.expm1(y_val)
		y_val_pred = np.expm1(enet.predict(x_val))
		mae = mean_absolute_error(y_val, y_val_pred)
		print("El mean absolute error de es {}".format(mae))

		print('Saving model into a pickle')
		# exist_ok tolerates an existing directory but still surfaces real
		# failures, which the former bare `except: pass` silently hid.
		os.makedirs('pickles', exist_ok=True)

		with open('pickles/enetCV.pkl', 'wb') as f:
			pickle.dump(enet, f)

		print('Making prediction and saving into a csv')
		y_test = enet.predict(self.x_test)

		return y_test

In [None]:
# NOTE(review): `self` and `data` are not defined in any cell shown here;
# they must be bound before this cell can run.
y_test = self.train_elasticNetCV(data)
self.save_prediction(y_test,'elastic')