In [21]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import plot_partial_dependence, permutation_importance
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [22]:
df = pd.read_csv('survey_data_clean.csv', index_col=0)

In [23]:
df

Unnamed: 0,age,gender,time,type_route,cost,difficulty,company,transport,routes,route_id
0,18-35,Hombre,3 - 4 horas,Literaria,1-50 Euros,Alta,Solo,Pie,La barraca,11
1,35-45,Hombre,1 - 2 horas,Historica,Gratis,Baja,Pareja,Pie,Ruta de la Seda,13
2,18-35,Hombre,3 - 4 horas,Historica,Gratis,Baja,Solo,Bicicleta,Arbres monumentals i singulars,0
3,18-35,Hombre,3 - 4 horas,Patrimonio,Gratis,Baja,Pareja,Pie,Sant Agustí Pont de Fusta,18
4,18-35,Mujer,3 - 4 horas,Turistica,Gratis,Baja,Pareja,Bicicleta,Ruta Cultural Anell Ciclista,12
...,...,...,...,...,...,...,...,...,...,...
66,18-35,Mujer,1 - 2 horas,Turistica,Gratis,Baja,Pareja,Pie,Ruta Valencia en la Memòria,23
67,18-35,Mujer,1 - 2 horas,Turistica,Gratis,Baja,Pareja,Pie,Arbres: Un passeig per l’Albereda,3
68,35-45,Hombre,1 - 2 horas,Turistica,Gratis,Alta,Amigos,Bicicleta,Ruta València en bicicleta,17
69,35-45,Mujer,3 - 4 horas,Historica,1-50 Euros,Baja,Familia,Bicicleta,Arbres monumentals i singulars,0


In [24]:
df['age'] = SimpleImputer(strategy='mode')

In [25]:
df = pd.get_dummies(df, prefix=['age', 'gender', 'type', 'diff', 'comp', 'trans'], columns=['age', 'gender', 'type_route', 'difficulty','company', 'transport'])

In [26]:
def mapping(x):
    if x == 'Gratis':
        return 1
    elif x == '1-50 Euros':
        return 2
    elif x == '+50 Euros':
        return 3
    else:
        return 9999

df['cost'] = df['cost'].apply(mapping)

In [30]:
df['time'] =  df.time.str.replace(' ','').str.replace('horas','').str.split('-')

In [32]:
df['time'] = df.time.map(lambda x: ( ( int(x[0]) + int(x[1]) )/ 2))


In [33]:
df

Unnamed: 0,time,cost,routes,route_id,age_SimpleImputer(strategy='mode'),gender_Hombre,gender_Mujer,gender_Prefiero no decirlo,type_Historica,type_Literaria,...,type_Turistica,diff_Alta,diff_Baja,comp_Amigos,comp_Familia,comp_Pareja,comp_Solo,trans_ Pie,trans_Bicicleta,trans_Pie
0,3.5,2,La barraca,11,1,1,0,0,0,1,...,0,1,0,0,0,0,1,0,0,1
1,1.5,1,Ruta de la Seda,13,1,1,0,0,1,0,...,0,0,1,0,0,1,0,0,0,1
2,3.5,1,Arbres monumentals i singulars,0,1,1,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0
3,3.5,1,Sant Agustí Pont de Fusta,18,1,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1
4,3.5,1,Ruta Cultural Anell Ciclista,12,1,0,1,0,0,0,...,1,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,1.5,1,Ruta Valencia en la Memòria,23,1,0,1,0,0,0,...,1,0,1,0,0,1,0,1,0,0
67,1.5,1,Arbres: Un passeig per l’Albereda,3,1,0,1,0,0,0,...,1,0,1,0,0,1,0,1,0,0
68,1.5,1,Ruta València en bicicleta,17,1,1,0,0,0,0,...,1,1,0,1,0,0,0,0,1,0
69,3.5,2,Arbres monumentals i singulars,0,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,1,0


In [34]:
df.to_csv('isma_synthetic_data.csv')

In [43]:
#df = df.drop('routes', axis=1)

# DATA PREPARATION

In [44]:
X = df.drop(columns=['route_id'], axis=1)
y = df[['route_id']]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
rnd_clf = RandomForestClassifier(n_estimators=500,
                                 max_leaf_nodes=16,
                                 random_state=42)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

  rnd_clf.fit(X_train, y_train)


In [47]:
accuracy_score(y_test, y_pred_rf)

0.0

In [48]:
confusion_matrix(y_test, y_pred_rf)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
       [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)