In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, accuracy_score
import xgboost as xgb
import pickle

In [74]:
# Training split: the target column `yC` is separated from the remaining
# feature columns. `na_filter=False` switches off pandas' missing-value
# detection while parsing the CSV.
path = 'count_train.csv'
data = pd.read_csv(path, sep=',', na_filter=False)

y_train = data.loc[:, ['yC']].copy()
X_train = data.drop(columns='yC').copy()

In [75]:
# Test split: same layout as the training file, with `yC` as the target.
# Note that `path` is rebound here to point at the test CSV.
path = 'count_test.csv'
test = pd.read_csv(path, sep=',', na_filter=False)

y_test = test.loc[:, ['yC']].copy()
X_test = test.drop(columns='yC').copy()

In [96]:
def calculated_metrics(variablesSelected,X_trained_scaled,y_train,X_test_scaled,y_test):
    """Fit an XGBoost regressor on a feature subset and score it on the test set.

    Parameters
    ----------
    variablesSelected : list
        Column labels used to select the features from both feature frames.
    X_trained_scaled, X_test_scaled : pandas.DataFrame
        Train / test feature frames; both are indexed with ``variablesSelected``.
    y_train, y_test : pandas.DataFrame
        Target frames; the ``yC`` column is read from each.

    Returns
    -------
    tuple
        ``(mse, accuracy)`` where ``mse`` is the mean squared error between
        ``y_test`` and the rounded predictions, and ``accuracy`` is the
        accuracy of the "is zero" indicator of the rounded predictions
        against the "is zero" indicator of ``yC``.
    """
    train_features = X_trained_scaled[variablesSelected].copy()
    test_features = X_test_scaled[variablesSelected].copy()

    booster = xgb.XGBRegressor(objective ='reg:squarederror')
    booster.fit(train_features, y_train['yC'].values)

    # Predictions are rounded to whole numbers before any metric is computed.
    rounded_pred = np.round(
        pd.DataFrame(booster.predict(test_features), index=y_test.index, columns=["y_pred"])
    )

    # Side-by-side frame of predictions and targets with zero / non-zero flags.
    scored = rounded_pred.join(y_test)
    scored = scored.assign(
        y_igual_0=(scored['yC'] == 0).astype(int),
        y_pred_igual_0=(scored['y_pred'] == 0).astype(int),
    )
    # Per-row agreement flag; it is not used by the returned metrics.
    scored = scored.assign(
        good_class=(scored['y_igual_0'] == scored['y_pred_igual_0']).astype(int)
    )

    mse = mean_squared_error(y_test, rounded_pred)
    accuray = accuracy_score(scored['y_igual_0'], scored['y_pred_igual_0'])
    return mse, accuray

In [77]:
# Candidate feature subsets to evaluate; each inner list is one model.
modelToProof = [
    [
        'x4', 'x13', 'x25', 'x24', 'x27', 'x7',
        'x14', 'x3', 'x2', 'x8', 'x22',
    ],
]

In [78]:
# The scaler is fitted on the training features only and then applied to the
# test features. Both results are wrapped back into DataFrames so the original
# index and column labels are preserved.
standarscaler = StandardScaler()
X_trained_scaled = pd.DataFrame(
    standarscaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns.values,
)
X_test_scaled = pd.DataFrame(
    standarscaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns.values,
)

In [95]:
# Score every candidate feature subset and collect one record per model.
# The records are gathered in a list and converted to a DataFrame in a single
# step: re-concatenating a growing frame on each iteration copies all previous
# rows every time, and concatenating onto an empty DataFrame raises a
# FutureWarning in recent pandas releases. As a side benefit each row gets a
# unique index label (0..n-1) instead of every row being labelled 0.
RESULT_COLUMNS = ['Modelo', 'Median: Mean squared error', 'Median: Accuray']
registros = []
for variablesSelected in modelToProof:
    mse_aux,accuray_aux= calculated_metrics(variablesSelected,X_trained_scaled,y_train,X_test_scaled,y_test)
    registros.append({'Modelo': variablesSelected, 'Median: Mean squared error': mse_aux, 'Median: Accuray': accuray_aux})

# Passing the column list keeps the expected columns even when no model was
# evaluated, so later cells that sort by a metric column still work.
resultados = pd.DataFrame(registros, columns=RESULT_COLUMNS)

0.6
0.6


In [80]:
# Show the evaluated feature subsets ordered from lowest to highest test MSE.
resultados.sort_values(by='Median: Mean squared error', ascending=True)

Unnamed: 0,Modelo,Median: Mean squared error,Median: Accuray
0,"[x4, x13, x25, x24, x27, x7, x14, x3, x2, x8, ...",1.066667,0.3


In [81]:
def training_model(variablesSelected,X_trained,y_train,X_test,y_test):
    """Fit the scaler + XGBoost pipeline on a feature subset and report test metrics.

    Parameters
    ----------
    variablesSelected : list
        Column labels used to select the features from ``X_trained`` and ``X_test``.
    X_trained, X_test : pandas.DataFrame
        Unscaled train / test feature frames; scaling happens inside the pipeline.
    y_train, y_test : pandas.DataFrame
        Target frames; the ``yC`` column is read from each.

    Returns
    -------
    dict
        ``'variableSelected'`` (the list passed in), ``'model'`` (the fitted
        ``Pipeline``), ``'mse_test'`` (MSE between ``y_test`` and the rounded
        predictions) and ``'accuray_test'`` (accuracy of the zero / non-zero
        indicator of the rounded predictions against that of ``yC``).

    Notes
    -----
    ``accuray_test`` is computed on the zero / non-zero indicator columns, the
    same definition ``calculated_metrics`` uses. Previously the indicators
    were built but ignored, and an exact-match accuracy on the rounded counts
    was reported instead, so the two functions' "accuracy" values were not
    comparable.
    """
    X_trained_aux = X_trained[variablesSelected].copy()
    X_test_aux = X_test[variablesSelected].copy()

    pipeline = Pipeline(steps = [('standarscaled',StandardScaler()),
                                 ('xgboost',xgb.XGBRegressor(objective ='reg:squarederror'))])

    # Train the model using the training sets
    pipeline.fit(X_trained_aux,  y_train['yC'].values)

    # Make predictions using the testing set; they are rounded to whole
    # numbers before any metric is computed.
    y_pred = pd.DataFrame(pipeline.predict(X_test_aux), index=y_test.index, columns=["y_pred"])
    y_pred = np.round(y_pred)

    # Zero / non-zero flags for the actual and predicted values.
    total = y_pred.join(y_test)
    total['y_igual_0'] = (total['yC'] == 0).astype(int)
    total['y_pred_igual_0'] = (total['y_pred'] == 0).astype(int)

    mse = mean_squared_error(y_test, y_pred)
    accuray = accuracy_score(total['y_igual_0'], total['y_pred_igual_0'])

    return {
        'variableSelected': variablesSelected,
        'model': pipeline,
        'mse_test': mse,
        'accuray_test': accuray,
    }

## Modelo Final

In [82]:
# Feature subset used to train the final model.
variablesSelected = [
    'x4', 'x13', 'x25', 'x24', 'x27', 'x7',
    'x14', 'x3', 'x2', 'x8', 'x22',
]

In [83]:
# Train the final pipeline on the unscaled frames; scaling is done inside the
# pipeline built by `training_model`.
model_to_return = training_model(
    variablesSelected=variablesSelected,
    X_trained=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [84]:
# Display the bundle returned by `training_model`: the selected variables,
# the fitted pipeline and its test metrics.
model_to_return

{'variableSelected': ['x4',
  'x13',
  'x25',
  'x24',
  'x27',
  'x7',
  'x14',
  'x3',
  'x2',
  'x8',
  'x22'],
 'model': Pipeline(memory=None,
          steps=[('standarscaled',
                  StandardScaler(copy=True, with_mean=True, with_std=True)),
                 ('xgboost',
                  XGBRegressor(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0,
                               importance_type='gain', learning_rate=0.1,
                               max_delta_step=0, max_depth=3, min_child_weight=1,
                               missing=None, n_estimators=100, n_jobs=1,
                               nthread=None, objective='reg:squarederror',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, seed=None, silent=None,
                               subsample=1, verbosity=1))],
  

In [85]:
# Persist the model bundle to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises; the bare `open(...)`
# used before left the handle open until garbage collection.
with open('modelCount.sav', 'wb') as model_file:
    pickle.dump(model_to_return, model_file)