In [22]:
import pandas as pd
import numpy as np
import seaborn as sns

In [23]:
# Load the synthetic dataset, using the first CSV column as the row index.
df = pd.read_csv(
    filepath_or_buffer='../data/synthetic_data.csv',
    index_col=0,
)

In [24]:
# Display the freshly loaded frame as the cell's output.
df

Unnamed: 0,age,gender,time_available,route_type,price,difficulty,accompaniment,transport,selection_map
0,18-35,Mujer,3 - 4,Patrimonio,Gratis,Alta,Solo,A Pie,12
1,0-18,Mujer,1 - 2,Historica,+50 Euros,Baja,Familia,Bicicleta,9
2,18-35,Mujer,1 - 2,Patrimonio,Gratis,Baja,Pareja,Bicicleta,10
3,18-35,Hombre,1 - 2,Turistica,Gratis,Alta,Pareja,Bicicleta,20
4,18-35,Hombre,1 - 2,Turistica,1-50 Euros,Baja,Solo,Pie,9
...,...,...,...,...,...,...,...,...,...
29995,+45,Mujer,3 - 4,Turistica,1-50 Euros,Baja,Solo,Bicicleta,7
29996,18-35,Hombre,3 - 4,Turistica,1-50 Euros,Baja,Pareja,Pie,16
29997,35-45,Mujer,1 - 2,Literaria,Gratis,Baja,Solo,Bicicleta,9
29998,18-35,Mujer,3 - 4,Historica,1-50 Euros,Baja,Pareja,A Pie,11


In [25]:
# Normalise the transport labels: the data contains both 'A Pie' and 'Pie' for
# the same category, so drop the leading article. The pattern is anchored to
# the start of the value; a bare replace('A', '') would delete every capital
# 'A' anywhere in any label and only works by accident on the current values.
df['transport'] = (
    df['transport']
    .str.strip()
    .str.replace(r'^A\s+', '', regex=True)
)

In [26]:
# Print dtypes and non-null counts for every column.
df.info()

# Column labels of the frame; displayed as the cell's output value.
df.columns

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 0 to 29999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30000 non-null  object
 1   gender          30000 non-null  object
 2   time_available  30000 non-null  object
 3   route_type      30000 non-null  object
 4   price           30000 non-null  object
 5   difficulty      30000 non-null  object
 6   accompaniment   30000 non-null  object
 7   transport       30000 non-null  object
 8   selection_map   30000 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 2.3+ MB


Index(['age', 'gender', 'time_available', 'route_type', 'price', 'difficulty',
       'accompaniment', 'transport', 'selection_map'],
      dtype='object')

In [27]:
# One-hot encode the nominal columns. Each key is a source column and each
# value is the prefix given to the indicator columns generated from it.
dummy_prefixes = {
    'age': 'age',
    'gender': 'gender',
    'route_type': 'type',
    'difficulty': 'diff',
    'accompaniment': 'comp',
    'transport': 'trans',
}
df = pd.get_dummies(df, columns=list(dummy_prefixes), prefix=dummy_prefixes)

In [28]:
# List the distinct price labels present in the data.
df.price.unique()

array(['Gratis', '+50 Euros', '1-50 Euros'], dtype=object)

In [29]:
def mapping(x):
    """Encode a price label as an ordinal integer.

    'Gratis' -> 1, '1-50 Euros' -> 2, '+50 Euros' -> 3.
    Any other value yields the sentinel 9999.
    """
    ordinal_codes = (
        ('Gratis', 1),
        ('1-50 Euros', 2),
        ('+50 Euros', 3),
    )
    for label, code in ordinal_codes:
        if x == label:
            return code
    # Unrecognised label: fall back to the sentinel value.
    return 9999

# Replace each price label with its ordinal code, element by element.
df['price'] = df['price'].map(mapping)

In [30]:
# Remove the spaces from each range label and split it on '-', so that a
# value such as '3 - 4' becomes the list ['3', '4'].
df['time_available'] = (
    df['time_available']
    .str.replace(' ', '')
    .str.split('-')
)

In [31]:
def _range_midpoint(bounds):
    """Return the mean of the first two entries of ``bounds`` parsed as ints."""
    low, high = int(bounds[0]), int(bounds[1])
    return (low + high) / 2


# Collapse each [low, high] pair into a single numeric midpoint.
df['time_available'] = df['time_available'].map(_range_midpoint)


In [32]:
# Display the frame after the encoding steps above.
df

Unnamed: 0,time_available,price,selection_map,age_+45,age_0-18,age_18-35,age_35-45,gender_Hombre,gender_Mujer,gender_Prefiero no decirlo,...,type_Patrimonio,type_Turistica,diff_Alta,diff_Baja,comp_Amigos,comp_Familia,comp_Pareja,comp_Solo,trans_Bicicleta,trans_Pie
0,3.5,1,12,0,0,1,0,0,1,0,...,1,0,1,0,0,0,0,1,0,1
1,1.5,3,9,0,1,0,0,0,1,0,...,0,0,0,1,0,1,0,0,1,0
2,1.5,1,10,0,0,1,0,0,1,0,...,1,0,0,1,0,0,1,0,1,0
3,1.5,1,20,0,0,1,0,1,0,0,...,0,1,1,0,0,0,1,0,1,0
4,1.5,2,9,0,0,1,0,1,0,0,...,0,1,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,3.5,2,7,1,0,0,0,0,1,0,...,0,1,0,1,0,0,0,1,1,0
29996,3.5,2,16,0,0,1,0,1,0,0,...,0,1,0,1,0,0,1,0,0,1
29997,1.5,1,9,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,1,1,0
29998,3.5,2,11,0,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1


In [33]:
# Persist the encoded frame as CSV; the relative path resolves against the
# notebook's working directory.
df.to_csv(path_or_buf='isma_synthetic_data.csv')

In [34]:
# Column labels of the encoded frame; displayed as the cell's output value.
df.columns

Index(['time_available', 'price', 'selection_map', 'age_+45', 'age_0-18',
       'age_18-35', 'age_35-45', 'gender_Hombre', 'gender_Mujer',
       'gender_Prefiero no decirlo', 'type_Historica', 'type_Literaria',
       'type_Patrimonio', 'type_Turistica', 'diff_Alta', 'diff_Baja',
       'comp_Amigos', 'comp_Familia', 'comp_Pareja', 'comp_Solo',
       'trans_Bicicleta', 'trans_Pie'],
      dtype='object')

# DATA PREPARATION

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import plot_partial_dependence, permutation_importance
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split


In [36]:
# Feature matrix: every column except the target. `columns=` already names
# the axis, so the redundant `axis=1` is dropped.
X = df.drop(columns=['selection_map'])

# Target as a 1-D Series. Double-bracket selection (df[['selection_map']])
# yields a single-column DataFrame, i.e. a column vector, whereas
# scikit-learn estimators expect y with shape (n_samples,).
y = df['selection_map']

In [37]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Forest configuration: 500 trees, each limited to 16 leaf nodes, with a
# fixed seed so repeated runs build the same ensemble.
forest_params = dict(n_estimators=500, max_leaf_nodes=16, random_state=42)

# fit() returns the estimator itself, so construction and training chain.
rnd_clf = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Predicted selection_map class for every held-out row.
y_pred_rf = rnd_clf.predict(X_test)

  rnd_clf.fit(X_train, y_train)


In [40]:
# Share of held-out rows whose predicted label matches the true label.
# NOTE(review): the recorded result for this cell is 0.051 — TODO check
# whether treating selection_map as a classification target with this
# forest configuration is appropriate.
accuracy_score(y_test, y_pred_rf)

0.051