# Práctica 3 - Sexta prueba

#### Pilar Navarro Ramírez

In [1]:
import pandas as pd

## Preprocesamiento

In [2]:
df_train=pd.read_csv("data/train.csv",na_values=[""])
df_test=pd.read_csv("data/test.csv",na_values=[""])

In [3]:
df_train.columns = [col.lower() for col in df_train]
df_test.columns = [col.lower() for col in df_test]

df_train_orig = df_train.copy()
df_test_orig = df_test.copy()

del df_train["id"]
del df_test["id"]

### Tratamos los datos perdidos

In [4]:
from sklearn import impute

In [5]:
del df_train['descuento']
del df_test['descuento']

In [6]:
df_train_replaced=df_train.copy()
imputer=impute.SimpleImputer(strategy="most_frequent")
values = imputer.fit_transform([df_train_replaced.asientos.values])
df_train_replaced.asientos.update(pd.Series(values[0]))

In [7]:
df_train_replaced=df_train_replaced.dropna()
df_train=df_train.dropna()

Separamos el atributo a predecir del resto de atributos en el conjunto de entrenamiento.

In [8]:
cols = [col for col in df_train.columns if col not in ['precio_cat']]    
df_train, df_train_obj= df_train[cols], df_train['precio_cat']
df_train_replaced, df_train_obj_replaced= df_train_replaced[cols], df_train_replaced['precio_cat']

### 'Transformación' de variables categóricas a numéricas

In [9]:
from sklearn.preprocessing import LabelEncoder

categorical=["nombre","ciudad","combustible","tipo_marchas","mano","consumo","motor_cc","potencia"]

df_train_num=df_train.copy()
df_train_num_rpl=df_train_replaced.copy()
df_test_num=df_test.copy()

for atributo in categorical:
    data=pd.read_csv("data/"+atributo+".csv")
    data.columns = [col.lower() for col in data]
    label = LabelEncoder().fit(data[atributo])
    df_train_num[atributo]=label.transform(df_train[atributo])
    df_train_num_rpl[atributo]=label.transform(df_train_replaced[atributo])
    df_test_num[atributo]=label.transform(df_test[atributo])

### Normalización de los datos

In [10]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

cols = [col for col in df_train_orig.columns if col not in ['precio_cat','id','descuento']]  
categorical=["nombre","ciudad","combustible","tipo_marchas","mano","consumo","motor_cc","potencia"]

df_train_norm=df_train_num.copy()
df_train_norm_rpl=df_train_num_rpl.copy()
df_test_norm=df_test_num.copy()


for atributo in cols:
    data=pd.read_csv("data/"+atributo+".csv")
    data.columns = [col.lower() for col in data]
    if atributo in categorical:
        label = LabelEncoder().fit(data[atributo])
        data[atributo]=label.transform(data[atributo])
    scaler = MinMaxScaler().fit(data.values)
    train_values=df_train_num[atributo].values.reshape(-1,1)
    df_train_norm[atributo]=scaler.transform(train_values)
    train_values_rpl=df_train_num_rpl[atributo].values.reshape(-1,1)
    df_train_norm_rpl[atributo]=scaler.transform(train_values_rpl)
    test_values=df_test_num[atributo].values.reshape(-1,1)
    df_test_norm[atributo]=scaler.transform(test_values)
    

## Aplicación de los algoritmos

In [11]:
import numpy as np

from sklearn.model_selection import cross_val_score

def cross_validation(clf,x,y,mostrar=False):
    scores=cross_val_score(clf,x,y,scoring='accuracy',cv=5)
    accuracy=np.mean(scores)  
    if mostrar:
        print("Accuracy: ", accuracy)
    return accuracy

In [29]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,BaggingClassifier,GradientBoostingClassifier

knn=KNeighborsClassifier()
bagging_knn=BaggingClassifier(knn,n_estimators=22, max_samples=1.0, max_features=0.6, random_state=10)
forest=RandomForestClassifier(random_state=10)
extra=ExtraTreesClassifier(n_estimators=275,random_state=10,bootstrap=True, max_samples=0.9)
gradient=GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=10,max_features='auto',subsample=0.8,max_depth=5)

clfs=[gradient,bagging_knn,forest,extra]

for clf in clfs:
    print("Datos normalizados con todos nulos eliminados: ")
    print(clf)
    cross_validation(clf, df_train_norm, df_train_obj,True)
    print("Datos normalizados con nulos en asientos reemplazados: ")
    print(clf)
    cross_validation(clf,df_train_norm_rpl,df_train_obj_replaced,True)

Datos normalizados con todos nulos eliminados: 
GradientBoostingClassifier(max_depth=5, max_features='auto', random_state=10,
                           subsample=0.8)
Accuracy:  0.8360374531835205
Datos normalizados con nulos en asientos reemplazados: 
GradientBoostingClassifier(max_depth=5, max_features='auto', random_state=10,
                           subsample=0.8)
Accuracy:  0.8328797993111952
Datos normalizados con todos nulos eliminados: 
BaggingClassifier(base_estimator=KNeighborsClassifier(), max_features=0.6,
                  n_estimators=22, random_state=10)
Accuracy:  0.8102943196004995
Datos normalizados con nulos en asientos reemplazados: 
BaggingClassifier(base_estimator=KNeighborsClassifier(), max_features=0.6,
                  n_estimators=22, random_state=10)
Accuracy:  0.8035494098995949
Datos normalizados con todos nulos eliminados: 
RandomForestClassifier(random_state=10)
Accuracy:  0.822791822721598
Datos normalizados con nulos en asientos reemplazados: 
Rando

In [13]:
from sklearn.ensemble import StackingClassifier

knn=KNeighborsClassifier()
bagging_knn=BaggingClassifier(knn,n_estimators=22, max_samples=1.0, max_features=0.6, random_state=10)
forest=RandomForestClassifier(random_state=10)
extra=ExtraTreesClassifier(n_estimators=275,random_state=10,bootstrap=True, max_samples=0.9)
gradient=GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=10,max_features='auto',subsample=0.8,max_depth=5)

estimators = [('bagging_knn', bagging_knn),
     ('forest', forest),('extra_trees', extra)]

clf = StackingClassifier(estimators=estimators, final_estimator=gradient,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8055483770287142


0.8055483770287142

In [14]:
estimators = [('forest', forest),('extra_trees', extra)]

clf = StackingClassifier(estimators=estimators, final_estimator=gradient,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8005493133583019


0.8005493133583019

In [15]:
estimators = [('forest', forest),('extra_trees', extra),('gradient',gradient)]

clf = StackingClassifier(estimators=estimators, final_estimator=gradient,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8227911985018727


0.8227911985018727

In [16]:
estimators = [('forest', forest),('extra_trees', extra),('gradient',gradient)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8317887016229714


0.8317887016229714

In [18]:
capa1_estimadores=[('forest', forest),('gradient',gradient)]
capa2_estimadores=[('extra_trees', extra),('gradient',gradient)]
capa2=StackingClassifier(estimators=capa2_estimadores, final_estimator=forest,cv=5)
clf = StackingClassifier(estimators=capa1_estimadores, final_estimator=capa2,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.819291822721598


0.819291822721598

In [19]:
estimators = [('bagging_knn', bagging_knn),('forest', forest),('extra_trees', extra),('gradient',gradient)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8322921348314607


0.8322921348314607

In [20]:
capa1_estimadores=[('forest', forest),('gradient',gradient)]
capa2_estimadores=[('extra_trees', extra),('gradient',gradient)]
capa2=StackingClassifier(estimators=capa2_estimadores, final_estimator=gradient,cv=5)
clf = StackingClassifier(estimators=capa1_estimadores, final_estimator=capa2,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8100415106117353


0.8100415106117353

In [21]:
capa1_estimadores=[('forest', forest),('extra_trees', extra),('gradient',gradient),('bagging_knn', bagging_knn)]
capa2_estimadores=[('forest', forest),('gradient',gradient)]
capa2=StackingClassifier(estimators=capa2_estimadores, final_estimator=forest,cv=5)
clf = StackingClassifier(estimators=capa1_estimadores, final_estimator=capa2,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8197946317103622


0.8197946317103622

In [23]:
capa1_estimadores=[('forest', forest),('extra_trees', extra),('gradient',gradient),('bagging_knn', bagging_knn)]
capa2_estimadores=[('forest', forest),('extra_trees', extra),('gradient',gradient),('bagging_knn', bagging_knn)]
capa2=StackingClassifier(estimators=capa2_estimadores, final_estimator=forest,cv=5)
clf  = StackingClassifier(estimators=capa1_estimadores, final_estimator=capa2,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8237924469413234


0.8237924469413234

In [24]:
from sklearn.neural_network import MLPClassifier
NN=MLPClassifier(hidden_layer_sizes=(300,200,100),random_state=10,max_iter=1000,alpha=0.0015)

estimators = [('red neuronal',NN),('bagging_knn', bagging_knn),('forest', forest),('extra_trees', extra),('gradient',gradient)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8350374531835205


0.8350374531835205

In [25]:
estimators = [('red neuronal',NN),('bagging_knn', bagging_knn),('forest', forest),('extra_trees', extra),('gradient',gradient)]

clf = StackingClassifier(estimators=estimators, final_estimator=NN,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8295455680399499


0.8295455680399499

In [26]:
estimators = [('red neuronal',NN),('forest', forest),('extra_trees', extra),('gradient',gradient)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8327877652933833


0.8327877652933833

In [30]:
capa1_estimadores=[('forest', forest),('extra_trees', extra),('gradient',gradient),('bagging_knn', bagging_knn)]
capa2_estimadores=[('forest', forest),('gradient',gradient),('red neuronal',NN)]
capa2=StackingClassifier(estimators=capa2_estimadores, final_estimator=forest,cv=5)
clf  = StackingClassifier(estimators=capa1_estimadores, final_estimator=capa2,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8230411985018726


0.8230411985018726

In [31]:
estimators = [('red neuronal',NN),('forest', forest),('gradient',gradient)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5)
cross_validation(clf, df_train_norm, df_train_obj,True)

Accuracy:  0.8277887016229712


0.8277887016229712

### Aplicación al conjunto de test

In [33]:
knn=KNeighborsClassifier()
bagging_knn=BaggingClassifier(knn,n_estimators=22, max_samples=1.0, max_features=0.6, random_state=10)
forest=RandomForestClassifier(random_state=10)
extra=ExtraTreesClassifier(n_estimators=275,random_state=10,bootstrap=True, max_samples=0.9)
gradient=GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=10,max_features='auto',subsample=0.8,max_depth=5)
NN=MLPClassifier(hidden_layer_sizes=(300,200,100),random_state=10,max_iter=1000,alpha=0.0015)

estimators = [('red neuronal',NN),('bagging_knn', bagging_knn),('forest', forest),('extra_trees', extra),('gradient',gradient)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5)

clf.fit(df_train_norm,df_train_obj)
pred=clf.predict(df_test_norm)
ids=df_test_orig["id"]

df_result = pd.DataFrame({'id': ids, 'Precio_cat': pred})
df_result.to_csv("resultados_7.csv", index=False)

In [37]:
df_result.head()

Unnamed: 0,id,Precio_cat
0,4820,5
1,4821,2
2,4822,2
3,4823,3
4,4824,2
