In [1]:
# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# We do our own split so the test set keeps its labels (Y): this lets us score
# the model on held-out data and, for example, check whether it is overfitting.
df = pd.read_csv("../data/data.csv")
titanic_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=50,
    stratify=df["Survived"],  # keep the class balance equal in both partitions
)

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# first we remove every column we do not want
class eliminaColumnas(BaseEstimator, TransformerMixin):
    """Transformer that drops a fixed set of columns from a DataFrame.

    Parameters
    ----------
    cols_to_remove : label or list of labels, optional
        Columns removed by ``transform``. ``None`` (the default) means that
        nothing is removed.
    """

    def __init__(self, cols_to_remove=None):
        self.cols_to_remove = cols_to_remove

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without the configured columns.

        The input frame is never modified. A ``KeyError`` is raised by pandas
        if one of the configured columns is not present in ``X``.
        """
        # With the default (None) there is nothing to drop; passing None to
        # DataFrame.drop would raise, so hand back an untouched copy instead.
        if self.cols_to_remove is None:
            return X.copy()

        # drop() already returns a new frame, so no explicit copy/inplace is needed.
        return X.drop(columns=self.cols_to_remove)

    
class customPipeline(BaseEstimator, TransformerMixin):
    """One-hot encode a DataFrame and add a binary ``Child`` column.

    ``fit`` records the columns produced by ``pd.get_dummies`` on the training
    data; ``transform`` encodes its input, aligns it to those recorded columns
    and then adds ``Child`` (1 where ``Age < 16``, otherwise 0).

    Attributes
    ----------
    train_cols_dummies : pandas.Index or None
        Encoded column labels seen during ``fit``; ``None`` until fitted.
    target_col : str
        Name of the target column (not used by the methods of this class).
    """

    def __init__(self):
        # Was misspelled ``__init`` and therefore never executed, leaving both
        # attributes undefined until ``fit`` happened to create the first one.
        self.train_cols_dummies = None
        self.target_col = "Survived"

    def fit(self, X, y=None):
        """Remember the one-hot encoded column layout of the training data."""
        # only called for the training data,
        # so the dummies computed here are the training ones
        print('<fit>------> dummies')
        self.train_cols_dummies = pd.get_dummies(X).columns

        # placeholder for persisting the encoded columns to cloud storage
        print('<fit>------> Saving encoded columns')
        #cos.save_object_in_cos(self.train_cols_dummies, 'encoded_columns', timestamp)

        return self

    def transform(self, X, y=None):
        """Encode ``X``, align it to the fitted columns and add ``Child``.

        Columns that were not seen during ``fit`` are discarded, and fitted
        columns missing from ``X`` are created and filled with 0. If the
        transformer has not been fitted, only the ``Child`` column is added.
        The caller's frame is never modified.
        """
        if self.train_cols_dummies is not None:
            # encode and then compare the columns against the training ones
            print('<transform>------> test dummies')
            X = pd.get_dummies(X)

            # same columns as in training
            print('<transform>------> test igualamos columnas')
            X = X.reindex(labels=self.train_cols_dummies, axis=1, fill_value=0)
        else:
            # Not fitted: work on a copy so adding 'Child' below does not
            # write into the DataFrame owned by the caller.
            X = X.copy()

        # creation of the binary Child variable (missing ages compare False -> 0)
        print('------> Creating child')
        X['Child'] = 0
        X.loc[X['Age'] < 16, 'Child'] = 1

        return X
    
        
        

In [4]:
# Separate the features from the target for both partitions

# training partition
Y_train = titanic_df["Survived"]
X_train = titanic_df.drop(columns="Survived")

# held-out partition (labels are kept so the test score can be computed)
Y_test = test_df["Survived"]
X_test = test_df.drop(columns=["Survived"]).copy()

In [12]:
# Random Forests: preprocessing and model chained into a single pipeline
rfPipeline = Pipeline(
    steps=[
        ("eliminaColumnas", eliminaColumnas(['PassengerId', 'Name', 'Ticket'])),
        ("customPipeline", customPipeline()),
        # fill_value is only consulted when strategy='constant', so it is not
        # passed together with the median strategy.
        ("simpleimputer", SimpleImputer(strategy='median')),
        # Fixed seed (same value as the train/test split) so the reported
        # score is repeatable from one run to the next.
        ("model", RandomForestClassifier(random_state=50)),
    ]
)


rfPipeline.fit(X_train, Y_train)

random_forest_score = rfPipeline.score(X_test, Y_test)
print('Test set score: ' + str(random_forest_score))


# NOTE(review): absolute, machine-specific path; the notebook only runs end to
# end where this file exists. Consider moving it next to ../data/data.csv.
nuevo_df = pd.read_csv("/home/natxo-casa/Desktop/teno/prueba.csv")
print(rfPipeline.predict(nuevo_df))


# named_steps allows looking a step up by its name; Pipeline.steps itself is a
# list of (name, estimator) tuples and cannot be indexed with a string.
print(rfPipeline.named_steps["model"].feature_importances_)

# Disabled hyper-parameter search: the code below sits inside a bare string
# literal, so none of it is executed when the cell runs.
# NOTE(review): 'auto' for max_features may not be accepted by recent
# scikit-learn releases; confirm before re-enabling this search.
'''
#cv params
parameters = { 
    'model__n_estimators': [200, 500],
    'model__max_features': ['auto', 'sqrt', 'log2'],
    'model__max_depth' : [4,5,6,7,8],
    'model__criterion' :['gini', 'entropy']
}

rf_version = GridSearchCV(rfPipeline, parameters, n_jobs=-1)
rf_version.fit(X_train, Y_train)
random_forest_score = rf_version.score(X_test,Y_test)

print('Training set score: ' + str(rf_version.score(X_train,Y_train)))
print('Test set score: ' + str(random_forest_score))
print('GridSearchCV params: ', rf_version.best_params_)
'''

<fit>------> dummies
<fit>------> Saving encoded columns
<transform>------> test dummies
<transform>------> test igualamos columnas
------> Creating child
<transform>------> test dummies
<transform>------> test igualamos columnas
------> Creating child
Test set score: 0.8659217877094972
<transform>------> test dummies
<transform>------> test igualamos columnas
------> Creating child
[0]


TypeError: list indices must be integers or slices, not str