In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
import math as mt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
import joblib

## Cargue de datos 

In [51]:
# Load the train and test splits as DataFrames straight from their S3 URIs.
# NOTE(review): reading s3:// paths with pandas presumably relies on s3fs and
# valid AWS credentials being available in the kernel environment — confirm.
train, test = (
    pd.read_csv(uri)
    for uri in (
        "s3://claseelectiva2mao/datos/train",
        "s3://claseelectiva2mao/datos/test",
    )
)

## Creación del pipeline

In [52]:
# Preprocessing for numeric columns: fill missing values with the column mean,
# then rescale each column with MinMaxScaler (default settings).
numeric_pipeline = Pipeline(
    steps=[
        ('imputacion con la media', SimpleImputer(strategy='mean')),
        ('Escalado minmax', MinMaxScaler()),
    ],
)

In [53]:
# Preprocessing for categorical columns: fill missing values with the most
# frequent category, then one-hot encode into a dense array.
category_pipeline = Pipeline(
    [
        ('imputacion con la moda', SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore' encodes a category that was not seen during
        # fit as an all-zero row instead of raising at transform time (e.g.
        # when the test split contains a value absent from the training data).
        # NOTE(review): `sparse` was renamed to `sparse_output` in
        # scikit-learn 1.2 and removed in 1.4 — switch the keyword on upgrade.
        ('Codificar', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    ]
)

In [54]:
# Route input columns to the two preprocessing pipelines by position:
# columns 0-2 go through the numeric pipeline, columns 3-5 through the
# categorical one.
# NOTE(review): these positions appear to assume the column order of the
# `features` list (Age, Credit amount, Duration, Sex, Purpose, Housing), not
# the raw frame's order — confirm the data passed to fit/transform is sliced
# in that order.
column_transformer = ColumnTransformer(
    transformers=[
        ('numeric_pipeline', numeric_pipeline, [0, 1, 2]),
        ('category_pipeline', category_pipeline, [3, 4, 5]),
    ],
)

In [55]:
# Columns selected as model inputs: three numeric ones first, then three
# categorical ones.
features = [
    "Age",
    "Credit amount",
    "Duration",
    "Sex",
    "Purpose",
    "Housing",
]

## Selección de características

In [56]:
# Candidate feature matrix: every column of `train` except the target ("Risk")
# and the "Unnamed: 0" column.
X_train = train.drop(columns=["Risk", "Unnamed: 0"])

In [57]:
# Target vector: the "Risk" column for every row of `train`.
y_train = train.loc[:, 'Risk']

In [58]:
# Keep only fully observed rows: any row with at least one missing value is
# removed from the feature matrix.
# NOTE(review): y_train was taken from the full `train` frame and is not
# filtered here, so after this step it no longer lines up row-for-row with
# X_train. Consider `y_train.loc[X_train.index]` before fitting — confirm
# against the downstream cells.
X_train = X_train.dropna(axis=0, how='any')

In [59]:
# Print the column dtypes and non-null counts of the filtered feature matrix.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 361 entries, 0 to 699
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               361 non-null    int64 
 1   Sex               361 non-null    object
 2   Job               361 non-null    int64 
 3   Housing           361 non-null    object
 4   Saving accounts   361 non-null    object
 5   Checking account  361 non-null    object
 6   Credit amount     361 non-null    int64 
 7   Duration          361 non-null    int64 
 8   Purpose           361 non-null    object
dtypes: int64(4), object(5)
memory usage: 28.2+ KB


In [60]:
# One-hot encode the feature matrix with pandas' default settings; the result
# replaces X_train.
X_train = pd.get_dummies(data=X_train)

In [61]:
# Display the one-hot encoded feature matrix as the cell output.
X_train

Unnamed: 0,Age,Job,Credit amount,Duration,Sex_female,Sex_male,Housing_free,Housing_own,Housing_rent,Saving accounts_little,...,Checking account_moderate,Checking account_rich,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,31,2,4473,36,0,1,0,1,0,1,...,0,1,0,0,0,0,0,1,0,0
5,36,1,4241,24,0,1,0,1,0,1,...,1,0,1,0,0,0,0,0,0,0
6,35,3,6948,36,0,1,0,0,1,1,...,1,0,0,1,0,0,0,0,0,0
7,24,2,2145,36,0,1,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
10,30,2,639,12,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692,26,2,4788,48,0,1,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
693,36,2,1275,24,0,1,0,1,0,0,...,0,1,1,0,0,0,0,0,0,0
694,37,1,1274,12,1,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
695,41,1,5954,42,1,0,0,1,0,1,...,1,0,1,0,0,0,0,0,0,0
