# MODELO DE COMPRA

## 1. Carga de datos

In [538]:
import numpy as np

In [539]:
import pandas as pd

In [540]:
# Load the purchase dataset. A raw string keeps the backslashes of the Windows path
# literal instead of relying on `\d` / `\D` being unrecognised escape sequences.
# NOTE(review): absolute local path; consider a configurable data directory.
data = pd.read_csv(r'C:\datos\DS_Compra.csv')

In [541]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(400, 6)

In [542]:
# Inspect column dtypes and non-null counts
data.info()
# Alternative: the dtypes as a one-column DataFrame
pd.DataFrame(data.dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 6 columns):
ID             400 non-null int64
SEXO           400 non-null object
EDAD           387 non-null float64
INGRESOS       400 non-null int64
ESTADOCIVIL    392 non-null object
FLAG_COMPRA    400 non-null int64
dtypes: float64(1), int64(3), object(2)
memory usage: 18.8+ KB


Unnamed: 0,0
ID,int64
SEXO,object
EDAD,float64
INGRESOS,int64
ESTADOCIVIL,object
FLAG_COMPRA,int64


## 2. Análisis descriptivo de los datos

In [543]:
# Summary statistics for every column, numeric and categorical alike.
data.describe(include ='all')

Unnamed: 0,ID,SEXO,EDAD,INGRESOS,ESTADOCIVIL,FLAG_COMPRA
count,400.0,400,387.0,400.0,392,400.0
unique,,2,,,3,
top,,F,,,S,
freq,,204,,,214,
mean,10200.5,,37.589147,6476.5,,0.3575
std,115.614301,,10.523312,7705.031897,,0.479864
min,10001.0,,18.0,1200.0,,0.0
25%,10100.75,,29.0,3500.0,,0.0
50%,10200.5,,37.0,5800.0,,0.0
75%,10300.25,,46.0,7300.0,,1.0


In [544]:
# Absolute frequencies of ESTADOCIVIL (nulls are excluded by default)
data["ESTADOCIVIL"].value_counts()

S    214
C    125
D     53
Name: ESTADOCIVIL, dtype: int64

In [545]:
# Relative frequencies of ESTADOCIVIL (proportions over the non-null rows)
data["ESTADOCIVIL"].value_counts(normalize = True)

S    0.545918
C    0.318878
D    0.135204
Name: ESTADOCIVIL, dtype: float64

In [546]:
# EDAD: row counts per bin after splitting the value range into 5 equal-width bins
data["EDAD"].value_counts(bins = 5)

(34.8, 43.2]                  138
(26.4, 34.8]                   76
(17.956999999999997, 26.4]     65
(43.2, 51.6]                   64
(51.6, 60.0]                   44
Name: EDAD, dtype: int64

In [547]:
# Relative frequencies (normalize=True) with nulls counted as their own category (dropna=False)
data["ESTADOCIVIL"].value_counts(normalize = True, dropna = False)

S      0.5350
C      0.3125
D      0.1325
NaN    0.0200
Name: ESTADOCIVIL, dtype: float64

In [548]:
# EDAD: share of rows per bin, using 5 equal-width bins.
data["EDAD"].value_counts(bins = 5, normalize = True)

(34.8, 43.2]                  0.3450
(26.4, 34.8]                  0.1900
(17.956999999999997, 26.4]    0.1625
(43.2, 51.6]                  0.1600
(51.6, 60.0]                  0.1100
Name: EDAD, dtype: float64

In [549]:
def genera_rangos(valor):
    """Return the age-range label for a single age value.

    Parameters
    ----------
    valor : scalar
        Age to classify; may be null (None / NaN).

    Returns
    -------
    str
        'Nulo' for a null input, "[18:30]" for values up to 30,
        "[30-50]" for values up to 50, "[50-60]" for values up to 60,
        and "Nan" for anything above 60.
    """
    if pd.isnull(valor):
        return 'Nulo'

    # Upper bounds are inclusive and checked in ascending order.
    limites = (
        (30, "[18:30]"),
        (50, "[30-50]"),
        (60, "[50-60]"),
    )
    for tope, etiqueta in limites:
        if valor <= tope:
            return etiqueta
    return "Nan"
        

In [550]:
# Label every EDAD value with its age range; the function is passed directly to apply.
data["EDAD_RANGO"] = data["EDAD"].apply(genera_rangos)

In [551]:
# Last rows of the frame, to check the new EDAD_RANGO column.
data.tail()

Unnamed: 0,ID,SEXO,EDAD,INGRESOS,ESTADOCIVIL,FLAG_COMPRA,EDAD_RANGO
395,10396,F,46.0,3400,C,1,[30-50]
396,10397,M,51.0,1900,S,1,[50-60]
397,10398,F,50.0,1600,S,1,[30-50]
398,10399,M,36.0,2700,C,0,[30-50]
399,10400,F,49.0,3000,C,1,[30-50]


In [552]:
# Relative frequencies of each age-range label
data["EDAD_RANGO"].value_counts(normalize = True)

[30-50]    0.5775
[18:30]    0.2725
[50-60]    0.1175
Nulo       0.0325
Name: EDAD_RANGO, dtype: float64

In [553]:
# Column dtypes as a one-column DataFrame (now includes EDAD_RANGO).
pd.DataFrame(data.dtypes)

Unnamed: 0,0
ID,int64
SEXO,object
EDAD,float64
INGRESOS,int64
ESTADOCIVIL,object
FLAG_COMPRA,int64
EDAD_RANGO,object


In [554]:
# Column names as a NumPy array.
data.columns.values

array(['ID', 'SEXO', 'EDAD', 'INGRESOS', 'ESTADOCIVIL', 'FLAG_COMPRA',
       'EDAD_RANGO'], dtype=object)

In [555]:
# Row index of the frame.
data.index

RangeIndex(start=0, stop=400, step=1)

In [556]:
# Names of the categorical columns, i.e. those stored with dtype object
var_str = data.select_dtypes(include = ['object']).columns.values

In [557]:
# Print the name of each categorical column, one per line.
for feature in var_str:
    print(feature)

SEXO
ESTADOCIVIL
EDAD_RANGO


In [558]:
# For each categorical column, print its name followed by its relative
# frequencies, counting nulls as their own category (dropna=False).
for feature in var_str:
    print(feature)
    print(data[feature].value_counts(normalize = True, dropna = False))

SEXO
F    0.51
M    0.49
Name: SEXO, dtype: float64
ESTADOCIVIL
S      0.5350
C      0.3125
D      0.1325
NaN    0.0200
Name: ESTADOCIVIL, dtype: float64
EDAD_RANGO
[30-50]    0.5775
[18:30]    0.2725
[50-60]    0.1175
Nulo       0.0325
Name: EDAD_RANGO, dtype: float64


In [516]:
# Concatenation demo: the frame is placed side by side with itself
pd.concat([data,data], axis =1) # axis = 0 stacks rows / axis = 1 stacks columns

Unnamed: 0,ID,SEXO,EDAD,INGRESOS,ESTADOCIVIL,FLAG_COMPRA,EDAD_RANGO,ID.1,SEXO.1,EDAD.1,INGRESOS.1,ESTADOCIVIL.1,FLAG_COMPRA.1,EDAD_RANGO.1
0,10001,M,19.0,1500,S,0,[18:30],10001,M,19.0,1500,S,0,[18:30]
1,10002,M,35.0,1600,C,0,[30-50],10002,M,35.0,1600,C,0,[30-50]
2,10003,F,26.0,3500,C,0,[18:30],10003,F,26.0,3500,C,0,[18:30]
3,10004,F,27.0,4700,C,0,[18:30],10004,F,27.0,4700,C,0,[18:30]
4,10005,M,19.0,6300,S,0,[18:30],10005,M,19.0,6300,S,0,[18:30]
5,10006,M,27.0,4800,C,0,[18:30],10006,M,27.0,4800,C,0,[18:30]
6,10007,F,27.0,7000,C,0,[18:30],10007,F,27.0,7000,C,0,[18:30]
7,10008,F,32.0,80000,C,1,[30-50],10008,F,32.0,80000,C,1,[30-50]
8,10009,M,25.0,2700,C,0,[18:30],10009,M,25.0,2700,C,0,[18:30]
9,10010,F,,5400,C,0,Nulo,10010,F,,5400,C,0,Nulo


In [559]:
# Absolute frequency of each ESTADOCIVIL value as a one-column frame named FREC_ABS.
# The Series is named before to_frame() rather than renaming a column afterwards:
# the auto-generated column name differs between pandas versions ('ESTADOCIVIL'
# before 2.0, 'count' from 2.0 on), so a rename keyed on 'ESTADOCIVIL' can silently
# do nothing.
df1 = data["ESTADOCIVIL"].value_counts().rename("FREC_ABS").to_frame()
df1

Unnamed: 0,FREC_ABS
S,214
C,125
D,53


In [560]:

# Relative frequency of each ESTADOCIVIL value as a one-column frame named FREC_REL.
# The Series is named before to_frame() rather than renaming a column afterwards:
# the auto-generated column name differs between pandas versions ('ESTADOCIVIL'
# before 2.0, 'proportion' from 2.0 on), so a rename keyed on 'ESTADOCIVIL' can
# silently do nothing.
df2 = data["ESTADOCIVIL"].value_counts(normalize=True).rename("FREC_REL").to_frame()
df2

Unnamed: 0,FREC_REL
S,0.545918
C,0.318878
D,0.135204


In [561]:
# Join absolute and relative frequencies side by side, aligned on the shared index.
pd.concat([df1,df2], axis =1) # axis = 0 stacks rows / axis = 1 stacks columns

Unnamed: 0,FREC_ABS,FREC_REL
S,214,0.545918
C,125,0.318878
D,53,0.135204


In [562]:
# Mean of FLAG_COMPRA (purchase rate) for each ESTADOCIVIL value; as_index=False
# keeps ESTADOCIVIL as a regular column of the result.
df3 = pd.DataFrame(
    data
    .groupby(by=['ESTADOCIVIL'], as_index=False)['FLAG_COMPRA']
    .mean()
)
df3.head()

Unnamed: 0,ESTADOCIVIL,FLAG_COMPRA
0,C,0.248
1,D,0.245283
2,S,0.448598


In [563]:
# Number of rows (count of ID) for each ESTADOCIVIL x FLAG_COMPRA combination.
pd.pivot_table(
    data,
    values=['ID'],
    index=['ESTADOCIVIL'],
    columns=['FLAG_COMPRA'],
    aggfunc='count',
)

Unnamed: 0_level_0,ID,ID
FLAG_COMPRA,0,1
ESTADOCIVIL,Unnamed: 1_level_2,Unnamed: 2_level_2
C,94,31
D,40,13
S,118,96


In [564]:
# Count of ID per ESTADOCIVIL x FLAG_COMPRA combination.
# NOTE(review): identical to the pivot table in the preceding cell; possibly a leftover.
pd.pivot_table(
    data,
    values=['ID'],
    index=['ESTADOCIVIL'],
    columns=['FLAG_COMPRA'],
    aggfunc='count',
)

Unnamed: 0_level_0,ID,ID
FLAG_COMPRA,0,1
ESTADOCIVIL,Unnamed: 1_level_2,Unnamed: 2_level_2
C,94,31
D,40,13
S,118,96


In [565]:
# Mean of FLAG_COMPRA (purchase rate) for each ESTADOCIVIL x SEXO combination.
pd.pivot_table(
    data,
    values=['FLAG_COMPRA'],
    index=['ESTADOCIVIL'],
    columns=['SEXO'],
    aggfunc='mean',
)

Unnamed: 0_level_0,FLAG_COMPRA,FLAG_COMPRA
SEXO,F,M
ESTADOCIVIL,Unnamed: 1_level_2,Unnamed: 2_level_2
C,0.258065,0.238095
D,0.275862,0.208333
S,0.481481,0.415094


## 3. PREPROCESAMIENTO DE DATOS

### 3.1 Tratamiento de valores nulos

In [566]:
# Number of null values in each column
data.isnull().sum()

ID              0
SEXO            0
EDAD           13
INGRESOS        0
ESTADOCIVIL     8
FLAG_COMPRA     0
EDAD_RANGO      0
dtype: int64

### Imputar valores missing

In [567]:
# Impute missing values with fixed constants: 37 for EDAD and 'S' for ESTADOCIVIL.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on a column selection: the in-place form operates on an intermediate object, which
# pandas flags as chained assignment and which does not update `data` under
# Copy-on-Write.
data["EDAD"] = data["EDAD"].fillna(37)

data["ESTADOCIVIL"] = data["ESTADOCIVIL"].fillna('S')


In [568]:
# Re-count nulls per column to confirm the imputation.
data.isnull().sum()

ID             0
SEXO           0
EDAD           0
INGRESOS       0
ESTADOCIVIL    0
FLAG_COMPRA    0
EDAD_RANGO     0
dtype: int64

In [569]:
# Alternative imputation using values computed from the data: the median for EDAD and
# the most frequent value (first mode) for ESTADOCIVIL.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which is chained assignment and does not update `data` under Copy-on-Write.
data["EDAD"] = data["EDAD"].fillna(data["EDAD"].median())
data["ESTADOCIVIL"] = data["ESTADOCIVIL"].fillna(data["ESTADOCIVIL"].mode()[0])

### 3.2 Conversión de variables

In [570]:
    # Work on a copy of `data`; the cells that follow modify `df`.
    # NOTE(review): this line carries stray leading indentation; it would be an
    # IndentationError if the cells were exported to a plain .py script.
    df = data.copy()

In [571]:
# First 10 rows of the working copy.
df.head(10)

Unnamed: 0,ID,SEXO,EDAD,INGRESOS,ESTADOCIVIL,FLAG_COMPRA,EDAD_RANGO
0,10001,M,19.0,1500,S,0,[18:30]
1,10002,M,35.0,1600,C,0,[30-50]
2,10003,F,26.0,3500,C,0,[18:30]
3,10004,F,27.0,4700,C,0,[18:30]
4,10005,M,19.0,6300,S,0,[18:30]
5,10006,M,27.0,4800,C,0,[18:30]
6,10007,F,27.0,7000,C,0,[18:30]
7,10008,F,32.0,80000,C,1,[30-50]
8,10009,M,25.0,2700,C,0,[18:30]
9,10010,F,37.0,5400,C,0,Nulo


In [572]:
# Selected percentiles of EDAD (NaNs ignored), from the minimum (0) to the maximum (100)
np.nanpercentile(df["EDAD"],[0,1,5,10,50,75,90,99,100])


array([18., 18., 21., 24., 37., 45., 52., 60., 60.])

In [573]:
# 90th percentile of EDAD as a scalar (first element of the one-item result).
np.nanpercentile(df['EDAD'],[90])[0]

52.0

In [574]:
# Boolean mask over df: True where EDAD is strictly above the 90th percentile of
# data['EDAD'] (NaNs ignored when computing the percentile).
indices_percmayor90 = df['EDAD'].gt(np.nanpercentile(data['EDAD'], 90))

In [575]:
# Distinct values present in the mask (checks that some rows are flagged).
set(indices_percmayor90)

{False, True}

In [576]:
# Cap EDAD at the 90th percentile for the rows flagged by the mask. The cap is computed
# with the same expression that defines the mask threshold instead of a hard-coded 52,
# so it stays correct if the input data changes.
df.loc[indices_percmayor90, "EDAD"] = np.nanpercentile(data['EDAD'], [90])[0]

In [577]:
# Display the working frame after capping EDAD.
df

Unnamed: 0,ID,SEXO,EDAD,INGRESOS,ESTADOCIVIL,FLAG_COMPRA,EDAD_RANGO
0,10001,M,19.0,1500,S,0,[18:30]
1,10002,M,35.0,1600,C,0,[30-50]
2,10003,F,26.0,3500,C,0,[18:30]
3,10004,F,27.0,4700,C,0,[18:30]
4,10005,M,19.0,6300,S,0,[18:30]
5,10006,M,27.0,4800,C,0,[18:30]
6,10007,F,27.0,7000,C,0,[18:30]
7,10008,F,32.0,80000,C,1,[30-50]
8,10009,M,25.0,2700,C,0,[18:30]
9,10010,F,37.0,5400,C,0,Nulo


In [578]:
# SEXO: encode as a binary integer (M -> 1, F -> 0).
# NOTE(review): values that are not keys of the dict map to NaN, so running this
# cell a second time on the already-encoded column would blank it out.
dicc_sexo = {'M':1,
            'F':0}
df['SEXO'] = df['SEXO'].map(dicc_sexo)

In [581]:
# ESTADOCIVIL: one indicator column per category, named ESTADOCIVIL_<value>
df_ec_dummies=pd.get_dummies(df["ESTADOCIVIL"], prefix = 'ESTADOCIVIL')


In [456]:
# Display the working frame.
df

Unnamed: 0,ID,SEXO,EDAD,INGRESOS,ESTADOCIVIL,FLAG_COMPRA,EDAD_RANGO
0,10001,1,19.0,1500,S,0,[18:30]
1,10002,1,35.0,1600,C,0,[30-50]
2,10003,0,26.0,3500,C,0,[18:30]
3,10004,0,27.0,4700,C,0,[18:30]
4,10005,1,19.0,6300,S,0,[18:30]
5,10006,1,27.0,4800,C,0,[18:30]
6,10007,0,27.0,7000,C,0,[18:30]
7,10008,0,32.0,80000,C,1,[30-50]
8,10009,1,25.0,2700,C,0,[18:30]
9,10010,0,37.0,5400,C,0,Nulo


In [582]:
# Append the ESTADOCIVIL indicator columns to the working frame.
# NOTE(review): df is rebound to the concatenation of itself, so re-running this
# cell appends the indicator columns again.
df=pd.concat([df,df_ec_dummies],axis = 1)

In [583]:
# Remove the original ESTADOCIVIL column from the working frame.
# drop(..., errors="ignore") keeps the cell re-runnable; `del` raises KeyError when the
# column is already gone.
df = df.drop(columns=["ESTADOCIVIL"], errors="ignore")

In [584]:
# Display the working frame after replacing ESTADOCIVIL with its indicator columns.
df

Unnamed: 0,ID,SEXO,EDAD,INGRESOS,FLAG_COMPRA,EDAD_RANGO,ESTADOCIVIL_C,ESTADOCIVIL_D,ESTADOCIVIL_S
0,10001,1,19.0,1500,0,[18:30],0,0,1
1,10002,1,35.0,1600,0,[30-50],1,0,0
2,10003,0,26.0,3500,0,[18:30],1,0,0
3,10004,0,27.0,4700,0,[18:30],1,0,0
4,10005,1,19.0,6300,0,[18:30],0,0,1
5,10006,1,27.0,4800,0,[18:30],1,0,0
6,10007,0,27.0,7000,0,[18:30],1,0,0
7,10008,0,32.0,80000,1,[30-50],1,0,0
8,10009,1,25.0,2700,0,[18:30],1,0,0
9,10010,0,37.0,5400,0,Nulo,1,0,0


In [585]:
# Distinct labels present in the age-range column
df.EDAD_RANGO.unique()

array(['[18:30]', '[30-50]', 'Nulo', '[50-60]'], dtype=object)

In [586]:
# Encode the age-range labels as integer codes; the 'Nulo' label gets the code 9.
# NOTE(review): labels missing from the dict (e.g. the "Nan" fallback that
# genera_rangos can return) would map to NaN.
dicc_rangoedad = {'[18:30]':0,
             '[30-50]':1,
             '[50-60]':2,
            'Nulo':9}
df['EDAD_RANGO'] = df['EDAD_RANGO'].map(dicc_rangoedad)

In [466]:
# Last rows of the working frame.
df.tail()

Unnamed: 0,ID,SEXO,EDAD,INGRESOS,FLAG_COMPRA,EDAD_RANGO,ESTADOCIVIL_C,ESTADOCIVIL_D,ESTADOCIVIL_S
395,10396,0,46.0,3400,1,1,1,0,0
396,10397,1,51.0,1900,1,2,0,0,1
397,10398,0,50.0,1600,1,1,0,0,1
398,10399,1,36.0,2700,0,1,1,0,0
399,10400,0,49.0,3000,1,1,1,0,0


In [587]:
# Remove the ESTADOCIVIL_C indicator, leaving ESTADOCIVIL_D and ESTADOCIVIL_S
# (presumably so that C acts as the baseline category; confirm intent).
# drop(..., errors="ignore") keeps the cell re-runnable; `del` raises KeyError when the
# column is already gone.
df = df.drop(columns=["ESTADOCIVIL_C"], errors="ignore")

In [472]:
# Last rows of the working frame.
df.tail()

Unnamed: 0,ID,SEXO,EDAD,INGRESOS,FLAG_COMPRA,EDAD_RANGO,ESTADOCIVIL_C,ESTADOCIVIL_S
395,10396,0,46.0,3400,1,1,1,0
396,10397,1,51.0,1900,1,2,0,1
397,10398,0,50.0,1600,1,1,0,1
398,10399,1,36.0,2700,0,1,1,0
399,10400,0,49.0,3000,1,1,1,0


### 3.3 Preselección de variables

In [588]:
# Column names currently in the working frame.
list(df.columns)

['ID',
 'SEXO',
 'EDAD',
 'INGRESOS',
 'FLAG_COMPRA',
 'EDAD_RANGO',
 'ESTADOCIVIL_D',
 'ESTADOCIVIL_S']

In [650]:
# Columns used as model inputs.
features_to_model = [
    'SEXO',
    'EDAD',
    'INGRESOS',
    'EDAD_RANGO',
    'ESTADOCIVIL_D',
    'ESTADOCIVIL_S',
]

In [651]:
# Feature matrix: only the selected predictor columns.
X_data = df[features_to_model]

In [652]:
# First rows of the feature matrix.
X_data.head()

Unnamed: 0,SEXO,EDAD,INGRESOS,EDAD_RANGO,ESTADOCIVIL_D,ESTADOCIVIL_S
0,1,19.0,1500,0,0,1
1,1,35.0,1600,1,0,0
2,0,26.0,3500,0,0,0
3,0,27.0,4700,0,0,0
4,1,19.0,6300,0,0,1


In [653]:
# Target vector: the FLAG_COMPRA column.
y_data = df['FLAG_COMPRA']

In [654]:
# Split the data into training and test sets.
# train_test_split lives in sklearn.model_selection; the old sklearn.cross_validation
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

In [655]:
# Hold out part of the data for testing: 60% of the rows go to training, and
# random_state fixes the shuffle so the split is reproducible.
# The imported function is called directly: the `cross_validation` module name is never
# imported in this notebook, so the qualified call fails on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    train_size=0.60,
    random_state=20,
)

In [656]:
# Display the test-set target values (indexed by original row label).
y_test

10     0
382    1
374    0
196    0
47     0
237    0
175    0
347    1
115    0
308    1
235    1
13     0
211    1
205    0
102    0
192    0
34     0
17     1
56     0
247    1
9      0
81     0
166    0
92     0
359    0
7      1
338    0
98     0
383    1
352    1
      ..
204    1
122    0
344    1
396    1
216    0
323    1
337    0
200    0
77     0
300    1
93     0
119    0
132    0
103    1
362    1
325    0
23     1
112    0
318    1
38     0
120    0
85     1
395    1
145    0
348    0
183    0
15     0
209    0
5      0
124    0
Name: FLAG_COMPRA, Length: 160, dtype: int64

# 4. Modelamiento

## 4.1 Árbol de decisión

In [657]:
from sklearn.tree import DecisionTreeClassifier

In [658]:
# Declare the model: a decision tree limited to depth 5.
# random_state is fixed so that refitting yields the same tree; 20 matches the seed
# used for the train/test split.
model = DecisionTreeClassifier(max_depth=5, random_state=20)

In [659]:
# Train (fit) the model on the training split
model = model.fit(X_train , y_train)

In [660]:
# First rows of the training features.
X_train.head()

Unnamed: 0,SEXO,EDAD,INGRESOS,EDAD_RANGO,ESTADOCIVIL_D,ESTADOCIVIL_S
220,0,41.0,6600,1,1,0
39,0,27.0,2500,0,0,0
58,1,22.0,1500,0,0,1
48,1,30.0,11200,0,0,0
301,1,48.0,6100,1,0,1


In [661]:
# Predicted classes for the training rows and for the held-out test rows.
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

### 4.1.1 Evaluación del modelo

In [662]:
from sklearn.metrics  import confusion_matrix
from sklearn.metrics  import accuracy_score

In [663]:
# Confusion matrix on the training split (rows: actual class, columns: predicted class).
confusion_matrix(y_train,pred_train)

array([[157,   3],
       [  6,  74]], dtype=int64)

In [664]:
# Share of training rows whose predicted class matches the actual class.
accuracy_train = accuracy_score(y_train,pred_train)
print("el accuracy del modelo en data de training es: " , accuracy_train)

el accuracy del modelo en data de training es:  0.9625


In [665]:
# Confusion matrix on the test split (rows: actual class, columns: predicted class).
confusion_matrix(y_test,pred_test)

array([[95,  2],
       [13, 50]], dtype=int64)

In [666]:
# Share of test rows whose predicted class matches the actual class. Stored under its
# own name so the training accuracy held in `accuracy_train` is not overwritten.
accuracy_test = accuracy_score(y_test,pred_test)
print("el accuracy del modelo en data de testing es: " , accuracy_test)

el accuracy del modelo en data de testing es:  0.90625


## 4.2 Random Forest

In [669]:
from sklearn.ensemble import RandomForestClassifier

In [712]:
# Declare the model: a random forest of 100 trees of depth at most 3, considering 4
# features per split, with a fixed seed and all available cores (n_jobs=-1).
# NOTE(review): this rebinds `model`, replacing the previously fitted classifier.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    max_features=4,
    random_state=20,
    n_jobs=-1,
)

In [713]:
# Train (fit) the model on the training split
model = model.fit(X_train , y_train)

In [714]:
# Predicted classes for the training rows and for the held-out test rows
# (these overwrite the predictions stored earlier under the same names).
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [715]:
from sklearn.metrics  import confusion_matrix
from sklearn.metrics  import accuracy_score

In [716]:
# Confusion matrix on the training split (rows: actual class, columns: predicted class).
confusion_matrix(y_train,pred_train)

array([[149,  11],
       [  6,  74]], dtype=int64)

In [717]:
# Share of training rows whose predicted class matches the actual class.
accuracy_train = accuracy_score(y_train,pred_train)
print("el accuracy del modelo en data de training es: " , accuracy_train)

el accuracy del modelo en data de training es:  0.9291666666666667


In [718]:
# Confusion matrix on the test split (rows: actual class, columns: predicted class).
confusion_matrix(y_test,pred_test)

array([[94,  3],
       [14, 49]], dtype=int64)

In [719]:
# Share of test rows whose predicted class matches the actual class. Stored under its
# own name so the training accuracy held in `accuracy_train` is not overwritten.
accuracy_test = accuracy_score(y_test,pred_test)
print("el accuracy del modelo en data de testing es: " , accuracy_test)

el accuracy del modelo en data de testing es:  0.89375


In [723]:
# Wrap the test-set predictions in a DataFrame.
# NOTE(review): no index is passed, so the frame gets a fresh 0..n-1 index rather than
# the row labels of X_test / y_test; align explicitly before comparing with y_test.
df_e=pd.DataFrame(pred_test)

In [724]:
# Index of the predictions frame.
df_e.index

RangeIndex(start=0, stop=160, step=1)

In [725]:
# Display the predictions frame.
df_e

Unnamed: 0,0
0,0
1,1
2,0
3,0
4,0
5,0
6,0
7,1
8,0
9,1
