In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [2]:
# Training split: read the CSV with NA detection switched off and use the
# unnamed first column written by the exporter as the row index.
path = 'continuous_train.csv'
data = pd.read_csv(path, sep=',', na_filter=False).set_index('Unnamed: 0')

# Target 'y' is kept as a one-column DataFrame; everything else is a predictor.
y_train = data.loc[:, ['y']].copy()
X_train = data.drop(columns=['y']).copy()

In [3]:
# Hold-out split: loaded exactly like the training file (NA detection off,
# 'Unnamed: 0' column promoted to the row index).
path = 'continuous_test.csv'
test = pd.read_csv(path, sep=',', na_filter=False).set_index('Unnamed: 0')

# Target 'y' is kept as a one-column DataFrame; everything else is a predictor.
y_test = test.loc[:, ['y']].copy()
X_test = test.drop(columns=['y']).copy()

In [4]:
def calculated_metrics(variablesSelected,X_trained_scaled,y_train,X_test_scaled,y_test,threshold=-1):
    """Fit a linear regression on a subset of columns and score it on the test set.

    Parameters
    ----------
    variablesSelected : list of str
        Column names selected from both feature frames.
    X_trained_scaled : pandas.DataFrame
        Training features. No scaling is done here; the caller passes the
        frame it wants the model fitted on.
    y_train : pandas.DataFrame
        Training target; the values are read from its ``'y'`` column.
    X_test_scaled : pandas.DataFrame
        Test features, with the same columns as ``X_trained_scaled``.
    y_test : pandas.DataFrame
        Test target; the values are read from its ``'y'`` column.
    threshold : float, default -1
        Cut-off used for the classification-style accuracy: a prediction is a
        hit when the true and predicted values fall on the same side of it
        (both ``< threshold`` or both ``>= threshold``).

    Returns
    -------
    tuple
        ``(mse, accuracy, r2)`` computed on the test set.
    """
    X_fit = X_trained_scaled[variablesSelected]
    X_eval = X_test_scaled[variablesSelected]

    # Regression restricted to the selected variables.
    regr = LinearRegression()
    regr.fit(X_fit, y_train['y'].values)

    # Compare truth and prediction positionally as plain arrays. The previous
    # implementation joined two frames on the index, which multiplies rows
    # (and skews the accuracy) whenever the test index has repeated labels.
    y_true = y_test['y'].values
    y_pred = regr.predict(X_eval)

    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    same_side = (y_true < threshold) == (y_pred < threshold)
    accuray = same_side.mean()
    return mse,accuray,r2

In [5]:
# Candidate feature subsets to compare, in evaluation order.
# Defined as one literal so the cell is idempotent: the previous chain of
# one-line `append` cells added a duplicate every time a cell was re-run.
modelToProof = [
    ['x12', 'x23', 'x25', 'x32'],
    ['x6', 'x7', 'x12', 'x23', 'x24', 'x25', 'x32'],
    ['x6', 'x13', 'x20', 'x23', 'x25', 'x27', 'x31'],
    ['x6', 'x13', 'x21', 'x23', 'x25', 'x31'],
    ['x1', 'x23', 'x25', 'x32'],
    ['x1', 'x23', 'x25', 'x31'],
    ['x12', 'x23', 'x24', 'x25', 'x32'],
    # NOTE(review): identical to the second entry; kept so the results table
    # is unchanged, but it is probably an accidental repeat - confirm and drop.
    ['x6', 'x7', 'x12', 'x23', 'x24', 'x25', 'x32'],
    ['x13', 'x23', 'x25', 'x31'],
    ['x13', 'x23', 'x25', 'x31', 'x32'],
    ['x23', 'x25', 'x31'],
    ['x13', 'x23', 'x25'],
    ['x23', 'x25'],
]

In [18]:
# Standardise the predictors: the scaler is fitted on the training frame and
# the same fitted scaler is then applied to the test frame.
standarscaler = StandardScaler()

# The scaler returns bare arrays, so each result is wrapped back into a
# DataFrame carrying the original row index and column labels.
X_trained_scaled = pd.DataFrame(
    standarscaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns.values,
)

X_test_scaled = pd.DataFrame(
    standarscaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns.values,
)

In [19]:
# Score every candidate feature subset and collect one row per candidate.
# Rows are accumulated in a plain list and the DataFrame is built once:
# concatenating inside the loop is quadratic and left every row with index 0.
# The metrics returned by calculated_metrics are scalars, so they are stored
# directly (taking the median of a single value was a no-op); the column
# labels are kept as they were because later cells sort on them.
resultados_columns = ['Modelo', 'Median: Mean squared error', 'Median: R2', 'Median: Accuray']
resultados_rows = []
for variablesSelected in modelToProof:
    mse_aux, accuray_aux, r2_aux = calculated_metrics(variablesSelected, X_trained_scaled, y_train, X_test_scaled, y_test)
    resultados_rows.append({
        'Modelo': variablesSelected,
        'Median: Mean squared error': mse_aux,
        'Median: R2': r2_aux,
        'Median: Accuray': accuray_aux,
    })

# Passing the columns explicitly keeps the expected layout even when the
# candidate list is empty, so the sort in the next cell still works.
resultados = pd.DataFrame(resultados_rows, columns=resultados_columns)

In [21]:
# Show the candidates ranked by test-set MSE, lowest (best) first.
resultados.sort_values(by='Median: Mean squared error', ascending=True)

Unnamed: 0,Modelo,Median: Mean squared error,Median: R2,Median: Accuray
0,"[x6, x13, x20, x23, x25, x27, x31]",3.605172,0.288628,0.7
0,"[x1, x23, x25, x32]",3.882074,0.23399,0.8
0,"[x1, x23, x25, x31]",3.942122,0.222142,0.8
0,"[x23, x25, x31]",4.121718,0.186704,0.8
0,"[x23, x25]",4.206218,0.17003,0.833333
0,"[x13, x23, x25]",4.331523,0.145305,0.833333
0,"[x13, x23, x25, x31, x32]",4.484211,0.115177,0.766667
0,"[x6, x13, x21, x23, x25, x31]",4.519678,0.108178,0.7
0,"[x13, x23, x25, x31]",4.520651,0.107986,0.8
0,"[x12, x23, x24, x25, x32]",4.732792,0.066127,0.733333


In [34]:
def training_model(variablesSelected,X_trained,y_train,X_test,y_test,threshold=-1):
    """Train a scaling + linear-regression pipeline on a feature subset and score it.

    Parameters
    ----------
    variablesSelected : list of str
        Column names selected from both feature frames.
    X_trained : pandas.DataFrame
        Unscaled training features; scaling happens inside the pipeline.
    y_train : pandas.DataFrame
        Training target; the values are read from its ``'y'`` column.
    X_test : pandas.DataFrame
        Unscaled test features.
    y_test : pandas.DataFrame
        Test target; the values are read from its ``'y'`` column.
    threshold : float, default -1
        Cut-off used for the classification-style accuracy: a prediction is a
        hit when the true and predicted values fall on the same side of it
        (both ``< threshold`` or both ``>= threshold``).

    Returns
    -------
    dict
        Keys: ``'variableSelected'`` (list of the columns used), ``'model'``
        (the fitted pipeline), ``'mse_test'``, ``'accuray_test'`` and
        ``'r2_test'`` (metrics on the test set).
    """
    X_fit = X_trained[variablesSelected]
    X_eval = X_test[variablesSelected]

    pipeline = Pipeline(steps = [('standarscaled',StandardScaler()),
                                 ('linearModel',LinearRegression())])

    # Train the model using the training set.
    pipeline.fit(X_fit, y_train['y'].values)

    # Compare truth and prediction positionally as plain arrays. The previous
    # implementation joined two frames on the index, which multiplies rows
    # (and skews the accuracy) whenever the test index has repeated labels.
    y_true = y_test['y'].values
    y_pred = pipeline.predict(X_eval)

    same_side = (y_true < threshold) == (y_pred < threshold)

    model_to_return = {
        # Stored as a copy so that later edits to the caller's list cannot
        # silently change the metadata saved next to the model.
        'variableSelected': list(variablesSelected),
        'model': pipeline,
        'mse_test': mean_squared_error(y_true, y_pred),
        'accuray_test': same_side.mean(),
        'r2_test': r2_score(y_true, y_pred),
    }
    return model_to_return

## Modelo Final

In [29]:
# Feature subset used for the final model. In the comparison table shown
# earlier, this two-variable subset shares the highest accuracy (0.833333)
# with ['x13', 'x23', 'x25'] while using fewer variables.
variablesSelected = ['x23', 'x25']

In [35]:
# Fit the final pipeline on the unscaled frames (the pipeline scales
# internally) and keep the fitted model together with its test metrics.
model_to_return = training_model(
    variablesSelected=variablesSelected,
    X_trained=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [36]:
# Display the returned dict: selected variables, fitted pipeline and test metrics.
model_to_return

{'variableSelected': ['x23', 'x25'], 'model': Pipeline(memory=None,
          steps=[('standarscaled',
                  StandardScaler(copy=True, with_mean=True, with_std=True)),
                 ('linearModel',
                  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                   normalize=False))],
          verbose=False), 'mse_test': 4.206217969668216, 'accuray_test': 0.8333333333333334, 'r2_test': 0.1700302684229148}

In [43]:
# Persist the final model dict (fitted pipeline, selected variables, test metrics).
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically; the previous inline `open(...)` was never closed.
with open('modelContinuos.sav', 'wb') as model_file:
    pickle.dump(model_to_return, model_file)