# Práctica 3 - Novena prueba

#### Pilar Navarro Ramírez

In [1]:
import pandas as pd

## Preprocesamiento

In [2]:
df_train=pd.read_csv("data/train.csv",na_values=[""])
df_test=pd.read_csv("data/test.csv",na_values=[""])

In [3]:
df_train.columns = [col.lower() for col in df_train]
df_test.columns = [col.lower() for col in df_test]

df_train_orig = df_train.copy()
df_test_orig = df_test.copy()

del df_train["id"]
del df_test["id"]

### Tratamos los datos perdidos

In [4]:
from sklearn import impute

In [5]:
del df_train['descuento']
del df_test['descuento']

In [6]:
df_train_replaced=df_train.copy()
imputer=impute.SimpleImputer(strategy="most_frequent")
values = imputer.fit_transform([df_train_replaced.asientos.values])
df_train_replaced.asientos.update(pd.Series(values[0]))

In [7]:
df_train_replaced=df_train_replaced.dropna()
df_train=df_train.dropna()

Separamos el atributo a predecir del resto de atributos en el conjunto de entrenamiento.

In [8]:
cols = [col for col in df_train.columns if col not in ['precio_cat']]    
df_train, df_train_obj= df_train[cols], df_train['precio_cat']
df_train_replaced, df_train_obj_replaced= df_train_replaced[cols], df_train_replaced['precio_cat']

### Normalización de los datos numéricos

In [9]:
from sklearn.preprocessing import MinMaxScaler

exc=["nombre","ciudad","combustible","tipo_marchas","mano","consumo","motor_cc","potencia",'precio_cat','id','descuento']
cols = [col for col in df_train_orig.columns if col not in exc]  


df_train_norm=df_train.copy()
df_train_norm_rpl=df_train_replaced.copy()
df_test_norm=df_test.copy()


for atributo in cols:
    data=pd.read_csv("data/"+atributo+".csv")
    data.columns = [col.lower() for col in data]
    scaler = MinMaxScaler().fit(data.values)
    train_values=df_train[atributo].values.reshape(-1,1)
    df_train_norm[atributo]=scaler.transform(train_values)
    train_values_rpl=df_train_replaced[atributo].values.reshape(-1,1)
    df_train_norm_rpl[atributo]=scaler.transform(train_values_rpl)
    test_values=df_test[atributo].values.reshape(-1,1)
    df_test_norm[atributo]=scaler.transform(test_values)
    

### 'Transformación' de variables categóricas a binarias

In [10]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

categorical=["nombre","ciudad","combustible","tipo_marchas","mano","consumo","motor_cc","potencia"]
cols = [col for col in df_train.columns if col not in categorical]    

df_train_num=df_train_norm.copy()
df_train_num_rpl=df_train_norm_rpl.copy()
df_test_num=df_test_norm.copy()

df_train_num=np.array(df_train_num[cols])
df_train_num_rpl=np.array(df_train_num_rpl[cols])
df_test_num=np.array(df_test_num[cols])

for atributo in categorical:
    data=pd.read_csv("data/"+atributo+".csv")
    data.columns = [col.lower() for col in data]
    enc = OneHotEncoder().fit(data[atributo].values.reshape(-1,1))

    #Conjunto de entrenamiento con ambos preprocesamientos
    enc_train=enc.transform(df_train[atributo].values.reshape(-1,1)).toarray()        
    df_train_num=np.hstack((df_train_num,enc_train))
    
    enc_train_rpl=enc.transform(df_train_replaced[atributo].values.reshape(-1,1)).toarray()
    df_train_num_rpl=np.hstack((df_train_num_rpl,enc_train_rpl))

    
    #Conjunto de test
    enc_test=enc.transform(df_test[atributo].values.reshape(-1,1)).toarray()
    df_test_num=np.hstack((df_test_num,enc_test))
                          
df_train_num=pd.DataFrame(df_train_num)
df_train_num_rpl=pd.DataFrame(df_train_num_rpl)
df_test_num=pd.DataFrame(df_test_num)

Eliminamos las columnas que son todo nula.

In [11]:
cols = [col for col in df_train_num.columns if df_train_num[col].max()!=0.0]
df_train_num=df_train_num[cols]
df_test_num=df_test_num[cols]
df_train_num_rpl=df_train_num_rpl[cols]

In [12]:
df_train_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4001 entries, 0 to 4000
Columns: 2387 entries, 0 to 2847
dtypes: float64(2387)
memory usage: 72.9 MB


### Oversampling

In [13]:
from imblearn.over_sampling import SMOTE
from collections import Counter
smote=SMOTE(random_state=10,sampling_strategy='minority')
df_train_over, df_train_obj_over = smote.fit_resample(df_train_num, df_train_obj)
df_train_over_rpl, df_train_obj_over_rpl = smote.fit_resample(df_train_num_rpl, df_train_obj_replaced)
Counter(df_train_obj_over)

Counter({3: 1825, 2: 502, 4: 834, 5: 637, 1: 1825})

## Aplicación de los algoritmos

In [14]:
import numpy as np

from sklearn.model_selection import cross_val_score

def cross_validation(clf,x,y,mostrar=False):
    scores=cross_val_score(clf,x,y,scoring='accuracy',cv=5)
    accuracy=np.mean(scores)  
    if mostrar:
        print("Accuracy: ", accuracy)
    return accuracy

In [15]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier


extra=ExtraTreesClassifier(random_state=10)
gradient=GradientBoostingClassifier(random_state=10,max_features='auto')
svc=LinearSVC(random_state=10)
knn=KNeighborsClassifier()
tree=DecisionTreeClassifier(random_state=10)
forest=RandomForestClassifier(random_state=10)
NN=MLPClassifier(random_state=10,max_iter=10000)

clfs=[svc,knn,tree,forest,NN,extra,gradient]
for clf in clfs:
    print(clf)
    cross_validation(clf, df_train_over, df_train_obj_over,True)

LinearSVC(random_state=10)
Accuracy:  0.8253633847370502
KNeighborsClassifier()
Accuracy:  0.7805488335310399
DecisionTreeClassifier(random_state=10)
Accuracy:  0.8258990905496244
RandomForestClassifier(random_state=10)
Accuracy:  0.8557738236457098
MLPClassifier(max_iter=10000, random_state=10)
Accuracy:  0.8445684460260973
ExtraTreesClassifier(random_state=10)
Accuracy:  0.8483058916567814
GradientBoostingClassifier(max_features='auto', random_state=10)
Accuracy:  0.8216291024120205


### Stacking

In [16]:
from sklearn.ensemble import StackingClassifier

extra=ExtraTreesClassifier(random_state=10)
gradient=GradientBoostingClassifier(random_state=10,max_features='auto')
svc=LinearSVC(random_state=10)
knn=KNeighborsClassifier()
tree=DecisionTreeClassifier(random_state=10)
forest=RandomForestClassifier(random_state=10)
NN=MLPClassifier(random_state=10,max_iter=10000)


estimators = [('Linear SVC', svc),
     ('forest', forest),('extra_trees', extra),('Red Neuronal',NN),('decision tree',tree)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5,n_jobs=6)
cross_validation(clf, df_train_over, df_train_obj_over,True)

Accuracy:  0.8753361803084223


0.8753361803084223

In [17]:
estimators = [('forest', forest),('extra_trees', extra),('Red Neuronal',NN),('decision tree',tree)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5,n_jobs=6)
cross_validation(clf, df_train_over, df_train_obj_over,True)

Accuracy:  0.8703582443653618


0.8703582443653618

In [18]:
vestimators = [ ('forest', forest),('Red Neuronal',NN),('decision tree',tree)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5,n_jobs=6)
cross_validation(clf, df_train_over, df_train_obj_over,True)

Accuracy:  0.8703582443653618


0.8703582443653618

In [19]:
estimators = [('Linear SVC', svc),('gradient',gradient),
     ('forest', forest),('extra_trees', extra),('Red Neuronal',NN),('decision tree',tree)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5,n_jobs=6)
cross_validation(clf, df_train_over, df_train_obj_over,True) 

Accuracy:  0.8787131672597864


0.8787131672597864

### Aplicación al conjunto de test

#### Entrega 13

In [20]:
extra=ExtraTreesClassifier(random_state=10)
gradient=GradientBoostingClassifier(random_state=10,max_features='auto')
svc=LinearSVC(random_state=10)
tree=DecisionTreeClassifier(random_state=10)
forest=RandomForestClassifier(random_state=10)
NN=MLPClassifier(random_state=10,max_iter=10000)


estimators = [('Linear SVC', svc),('gradient',gradient),
     ('forest', forest),('extra_trees', extra),('Red Neuronal',NN),('decision tree',tree)]

clf = StackingClassifier(estimators=estimators, final_estimator=forest,cv=5,n_jobs=6)
cross_validation(clf, df_train_over, df_train_obj_over,True) 
clf.fit(df_train_over,df_train_obj_over)
pred=clf.predict(df_test_num)
ids=df_test_orig["id"]

df_result = pd.DataFrame({'id': ids, 'Precio_cat': pred})
df_result.to_csv("resultados_13.csv", index=False)

Accuracy:  0.8788909450375643
