In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import warnings
import time

import Featurizer as ft

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Draw matplotlib figures directly in the cell output.
%matplotlib inline
# Seaborn look: white background, notebook-sized elements, 'deep' colour palette.
sns.set(style='white', context='notebook', palette='deep')
%config InlineBackend.figure_format = 'png' #set 'png' here when working on notebook
# Let pandas display up to 50 columns of a DataFrame.
pd.set_option('display.max_columns', 50)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Start from matplotlib's stock style, then set the default canvas size
# used by every figure created afterwards.
plt.style.use('default')
plt.rcParams.update({'figure.figsize': [12.0, 8.0]})

In [None]:
# Load the training table from the project's data directory.
df_train = pd.read_csv(filepath_or_buffer="../data/palabras.csv")

## Pre-processing

In [None]:
# Separate the regression target ("precio") from the predictors, then hold
# out 25% of the rows for evaluation; the fixed seed keeps the split stable.
y = df_train['precio']
X = df_train.drop(columns="precio")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Keep an independent copy of the training predictors as they are at this point.
X_train_submit = X_train.copy(deep=True)

In [None]:
import category_encoders as ce

# Category encoders shared by every preprocessing call below:
# a one-hot encoder (output columns named after the category values)
# and a binary encoder.
OHE = ce.OneHotEncoder(use_cat_names=True, handle_unknown='ignore')
BE = ce.BinaryEncoder(handle_unknown='ignore')

In [None]:
# Run both splits through the project's featurizer, sharing the same encoder
# objects. The last argument tags which split is being processed.
# NOTE(review): presumably 'train' fits OHE/BE and 'test' only applies them —
# confirm in Featurizer.preprocess.
# NOTE(review): X_train / X_test are overwritten, so re-running this cell feeds
# already processed frames back into preprocess.
X_train = ft.preprocess(X_train, OHE, BE, 'train')
X_test = ft.preprocess(X_test, OHE, BE, 'test')

In [None]:
# Sanity check: report the dimensions of the full data and of each split,
# one line per data set.
print(
    f"Original shapes: X={X.shape} y={y.shape}",
    f"Train shapes: X={X_train.shape} y={y_train.shape}",
    f"Test  shapes: X={X_test.shape}  y={y_test.shape}",
    sep="\n",
)

## Feature Selection

In [None]:
#from sklearn.feature_selection import SelectKBest

In [None]:
#selection = SelectKBest(k=40)
#x_features = selection.fit_transform(X_train, y_train)
#columns = np.asarray(X_train.columns.values)
#support = np.asarray(selection.get_support())
#columns_with_support = columns[support]

In [None]:
#columns_with_support

## Training

In [None]:
from xgboost import XGBRegressor

# Baseline gradient-boosted regressor: squared-error objective evaluated
# with RMSE, 100 trees of depth up to 9, using every available core.
model = XGBRegressor(
    objective="reg:squarederror",
    eval_metric="rmse",
    n_estimators=100,
    max_depth=9,
    min_child_weight=5,
    n_jobs=-1,
)


In [None]:
# Fit the baseline model on the training split and report how long it took.
start_time = time.time()
model.fit(X_train, y_train)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

## Predicciones

In [None]:
# Hand the fitted baseline model and both splits to the project's predecir helper.
# NOTE(review): what predecir computes and displays is defined in Featurizer — confirm there.
ft.predecir(model, X_train, y_train, X_test, y_test)

In [None]:
# Larger, square canvas for the figures created from here on.
plt.rcParams.update({"figure.figsize": [15, 15]})

# Pair every training column with the importance the fitted model assigned to it.
importancia = model.feature_importances_
d = {
    'x': X_train.columns,
    'y': importancia,
}
df_importancia = pd.DataFrame(data=d)

In [None]:
# Plot the 40 MOST important features as horizontal bars.
# The frame is sorted ascending so that barh, which draws the first row at the
# bottom, places the strongest feature at the top. With that ordering the
# selection has to come from the tail of the sorted frame: taking head(40)
# would plot the 40 least important columns instead.
importancia = df_importancia.sort_values(by="y", ascending=True).tail(40)
ax = plt.barh(importancia.x, importancia.y)

## Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over the number of boosting rounds (100, 150, ..., 1000),
# scored with 3-fold cross-validation and run as 3 parallel jobs.
param_test = {'n_estimators': range(100, 1001, 50)}
search = GridSearchCV(
    estimator=XGBRegressor(
        objective="reg:squarederror",
        max_depth=14,
        min_child_weight=10,
    ),
    param_grid=param_test,
    cv=3,
    n_jobs=3,
    verbose=10,
)


In [None]:
# Run the grid search on the training split, then display the winning
# parameter set together with its mean cross-validated score.
# Both values are read from `search`, the GridSearchCV object built above;
# the previous `gsearch1` name is not defined anywhere in this notebook.
search.fit(X_train, y_train)

search.best_params_, search.best_score_

In [None]:
# Show the per-candidate cross-validation results as a table.
pd.DataFrame(data=search.cv_results_)

In [None]:
# Hand the fitted grid search and both splits to the project's predecir helper.
# NOTE(review): what predecir computes and displays is defined in Featurizer — confirm there.
ft.predecir(search,X_train,y_train,X_test,y_test)

## Submission

In [None]:
# Load the rows that have to be predicted for the submission.
df_test = pd.read_csv(filepath_or_buffer='../data/test.csv')

In [None]:
# Initialise the submission features from a copy, leaving df_test itself untouched.
df = ft.init_test(df_test.copy())

In [None]:
# Apply the featurizer to the submission rows in 'test' mode, passing the same
# encoder objects that were used for the training and validation splits.
# NOTE(review): df is overwritten with its own processed version, so running
# this cell twice preprocesses already processed data — re-run the cell that
# creates df first.
df = ft.preprocess(df, OHE, BE, 'test')

In [None]:
# Predict the submission rows with the tuned model. `search` is the fitted
# GridSearchCV object from the tuning section; with its default refit=True,
# predict() delegates to the best estimator retrained on the full training
# split. The previous `gsearch1` name is not defined in this notebook.
submit_prediction = search.predict(df)

In [None]:
#cols_diff = []
#ft = X_train.columns.values
#fs = df.columns.values
#for feat in fs:
#    if feat not in ft:
#        cols_diff.append(feat)
        
#display(cols_diff)        

In [None]:
#pd.concat([df_test[['id']], pd.Series(submit_prediction)], axis=1).rename(columns={0:"target"}).to_csv("../data/submission13.csv", index=False)

In [None]:
from submission_helper import submission_output

# Build the submission table from the raw test rows and their predictions,
# then write it to the data directory without the index column.
nombre_submit = 'submission15.csv'
output = submission_output(df_test, submit_prediction)
output.to_csv('../data/' + nombre_submit, index=False)