In [1]:
# %% load packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# %% load dataset and print 1st five rows
# The CSV uses ';' as its field separator, so it must be passed explicitly.
dataset = pd.read_csv(
    filepath_or_buffer='takingawalk_dataset.csv',
    sep=';',
)
# Plain-text preview of the first five rows.
print(dataset.head(n=5))

   Week Outlook Humidity Wind  Label
0     1   Rainy     High  Yes      0
1     2   Sunny   Normal   No      1
2     3   Sunny   Normal  Yes      1
3     4   Sunny     High  Yes      0
4     5   Rainy   Normal  Yes      0


In [3]:
# %% prepare the data
# Target vector: the 'Label' column.
y = dataset['Label']
# Feature matrix: every column except the target and 'Week', with the
# remaining categorical columns expanded into one indicator column per
# category value.
X = pd.get_dummies(dataset.drop(columns=['Label', 'Week']))
# Show the generated indicator column names.
print(X.columns)

Index(['Outlook_Rainy', 'Outlook_Sunny', 'Humidity_High', 'Humidity_Normal',
       'Wind_No', 'Wind_Yes'],
      dtype='object')


In [4]:
# %% split the data into training and testing data
# Hold out 30% of the rows for testing; the rows are shuffled first and the
# fixed seed keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [5]:
# %% specify and train the model
# Random forest of 100 trees, each limited to depth 3; the seed fixes the
# randomness used while building the trees.
clf = RandomForestClassifier(
    random_state=42,
    max_depth=3,
    n_estimators=100,
)
# fit() returns the estimator, so keeping it as the last expression makes the
# notebook display the fitted model's repr.
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=3, random_state=42)

In [6]:
# %% use the model to predict values
# Predicted class labels for the held-out test rows.
y_pred = clf.predict(X=X_test)

In [7]:
# %% print the confusion matrix
# Heading and matrix are emitted on separate lines by a single print call.
# In scikit-learn's layout, rows are the true labels and columns are the
# predicted labels.
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

Confusion Matrix:
[[10  1]
 [ 0  5]]


In [8]:
# %% print accuracy
# Share of test rows whose predicted label matches the true label.
# Kept as the cell's final expression so the notebook displays the value
# (console output: 0.9375).
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9375

In [9]:
# %% extract feature importances
# Pair each importance value from the fitted forest with its training column
# name, then order from most to least important.
# NOTE(review): each original feature appears here as several indicator
# columns, so its importance is presumably spread across those rows - confirm
# before comparing features.
feature_scores = (
    pd.Series(data=clf.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)
print('Feature Scores:', feature_scores, sep='\n')

Feature Scores:
Humidity_Normal    0.193512
Outlook_Sunny      0.192283
Humidity_High      0.168927
Wind_No            0.151238
Outlook_Rainy      0.147944
Wind_Yes           0.146097
dtype: float64
