In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw student survey data; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='student.csv')

In [2]:
# Recode the two-level text columns as 0/1 using one lookup table per column.
binary_codes = {
    'sex': {'F': 0, 'M': 1},
    'Pstatus': {'A': 0, 'T': 1},
}
for column, codes in binary_codes.items():
    df[column] = df[column].replace(codes)

In [3]:
# Columns kept for the 'Dalc' model: the predictors first, the target last.
selected_columns_D = [
    'sex', 'age', 'Pstatus', 'famrel',
    'Medu', 'Mjob', 'Fedu', 'Fjob',
    'goout', 'health', 'Dalc',
]
# Alternative column set targeting 'Walc' (currently disabled):
# selected_columns_W = ['sex', 'age', 'Pstatus', 'famrel', 'Medu', 'Mjob', 'Fedu', 'Fjob', 'Walc']

# Work on an independent copy so later edits never write back into `df`.
df_D = df.loc[:, selected_columns_D].copy()
# df_W = df[selected_columns_W].copy()


In [4]:
# Cast the target to strings (class labels rather than numbers), show the
# resulting column dtypes, and save a snapshot of the frame to disk.
df_D = df_D.assign(Dalc=df_D['Dalc'].astype(str))
print(df_D.dtypes)
df_D.to_csv(path_or_buf='df_D.csv')

sex         int64
age         int64
Pstatus     int64
famrel      int64
Medu        int64
Mjob       object
Fedu        int64
Fjob       object
goout       int64
health      int64
Dalc       object
dtype: object


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

# Predictors: every selected column except the 'Dalc' target.
predictor_columns = ['sex', 'age', 'Pstatus', 'famrel', 'Medu', 'Mjob', 'Fedu', 'Fjob', 'goout', 'health']
# One-hot encode the two job columns; the remaining predictors are kept as they are.
X = pd.get_dummies(df_D[predictor_columns], columns=['Mjob', 'Fjob'])

# Target: the 'Dalc' labels.
y = df_D['Dalc']


In [None]:
# X

In [7]:
from sklearn.utils import Bunch

# Bundle the model inputs and their display metadata in one container.
#
# The feature names are read from the encoded frame instead of being typed by
# hand, so the labels used by the tree plot and the importance chart always
# line up with the columns the model is actually trained on, even if the
# one-hot encoding yields a different set or order of job categories.
dataset = {
    'data': X,
    'target': y,
    'DESCR': 'Student alcohol consumption during weekday',
    'feature_names': list(X.columns),
    # Listed in ascending order of the 'Dalc' labels ('1' .. '5'), which is the
    # sorted class order scikit-learn reports.
    'target_names': ['very low', 'low', 'moderate', 'high', 'very high'],
}

bunch = Bunch(**dataset)

# Re-expose the pieces under the short names used by the following cells.
X = bunch.data
y = bunch.target

feature_names = bunch.feature_names
class_names = bunch.target_names

In [None]:
# from sklearn.model_selection import GridSearchCV
# param_grid = { 
#     'n_estimators': [5, 10, 15, 20, 25, 50, 100, 150], 
#     'max_features': ['sqrt', 'log2', None], 
#     'max_depth': [2,3,4,5,6,7,8,9], 
# #     'max_leaf_nodes': [3, 6, 9], 
# } 
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# grid_search = GridSearchCV(RandomForestClassifier(), 
#                            param_grid=param_grid) 
# grid_search.fit(X_train, y_train) 
# print(grid_search.best_estimator_) 

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

# random_search = RandomizedSearchCV(RandomForestClassifier(), 
#                                    param_grid) 
# random_search.fit(X_train, y_train) 
# print(random_search.best_estimator_) 

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest settings: 50 trees of depth <= 9, balanced class weights, fixed seed.
forest_params = {
    'n_estimators': 50,
    'max_depth': 9,
    'class_weight': 'balanced',
    'random_state': 30,
}
clf = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("accuracy:", acc)


accuracy: 0.8325358851674641


In [19]:
# Macro-averaged precision, recall and F1 on the held-out rows.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='macro'
)

for label, score in (("Precision:", precision), ("Recall:", recall), ("F1 Score:", f1)):
    print(label, score)

Precision: 0.8086455660108355
Recall: 0.6758933119860273
F1 Score: 0.7275856219252446


In [11]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[144   5   1   0   1]
 [ 20  18   1   0   0]
 [  2   0   5   0   0]
 [  1   0   0   3   0]
 [  0   4   0   0   4]]


In [17]:
from sklearn.metrics import classification_report

# Per-class metrics as a dict, labelled with the readable class names.
report = classification_report(
    y_test,
    y_pred,
    target_names=class_names,
    output_dict=True,
)

# Flip the table so each class / summary entry is a row and each metric a column.
report_df = pd.DataFrame(report).T

print("Classification Report:", report_df, sep="\n")


Classification Report:
              precision    recall  f1-score     support
very low       0.862275  0.953642  0.905660  151.000000
low            0.666667  0.461538  0.545455   39.000000
moderate       0.714286  0.714286  0.714286    7.000000
high           1.000000  0.750000  0.857143    4.000000
very high      0.800000  0.500000  0.615385    8.000000
accuracy       0.832536  0.832536  0.832536    0.832536
macro avg      0.808646  0.675893  0.727586  209.000000
weighted avg   0.821070  0.832536  0.819996  209.000000


In [20]:
# np.array(y_test)

array(['5', '2', '1', '1', '1', '2', '1', '1', '1', '1', '1', '1', '1',
       '5', '1', '2', '2', '1', '2', '1', '1', '1', '1', '1', '1', '2',
       '3', '1', '1', '1', '2', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '2',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '2', '1', '2',
       '5', '1', '2', '1', '2', '1', '1', '2', '1', '1', '1', '1', '4',
       '1', '1', '2', '1', '1', '1', '2', '1', '1', '1', '1', '1', '1',
       '2', '1', '1', '2', '1', '1', '3', '1', '3', '1', '2', '4', '2',
       '1', '1', '1', '2', '1', '3', '1', '1', '1', '1', '1', '1', '1',
       '1', '2', '2', '5', '1', '1', '1', '1', '2', '1', '2', '2', '5',
       '1', '1', '1', '5', '2', '1', '1', '1', '1', '5', '1', '1', '3',
       '2', '2', '1', '3', '1', '1', '5', '1', '1', '3', '1', '1', '1',
       '1', '1', '1', '1', '2', '2', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '2', '1', '1', '1', '2', '1

In [22]:
# df = pd.DataFrame({'y_test': np.array(y_test)})

# df.to_csv('y_test.csv', index=False)

In [21]:
# y_pred

array(['5', '2', '1', '1', '1', '1', '1', '5', '1', '1', '1', '1', '1',
       '5', '1', '1', '2', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '3', '1', '1', '1', '2', '2', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '2', '1', '1', '1', '2', '1', '1', '2',
       '1', '1', '1', '1', '2', '1', '1', '1', '1', '1', '1', '1', '2',
       '2', '1', '2', '1', '2', '1', '1', '1', '1', '1', '2', '1', '4',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '3', '1', '1', '1', '1', '1', '3', '1', '1', '1', '2', '4', '2',
       '1', '1', '1', '2', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '2', '2', '2', '1', '1', '1', '1', '1', '1', '1', '1', '2',
       '1', '1', '1', '2', '2', '1', '1', '1', '1', '5', '1', '1', '3',
       '1', '2', '1', '3', '1', '1', '5', '1', '1', '3', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '3', '1', '1', '1', '1', '1', '1', '1', '1', '1', '2', '1

In [23]:
# df = pd.DataFrame({'y_pred': y_pred})

# df.to_csv('y_pred.csv', index=False)

In [None]:
# f1_score?

In [None]:
tree_to_plot = 2  # index into clf.estimators_ of the tree to draw

# Use thin lines on the large canvas; note this updates the global matplotlib setting.
plt.rcParams.update({'lines.linewidth': 0.3})
plt.figure(figsize=(30, 20))

chosen_tree = clf.estimators_[tree_to_plot]
plot_tree(chosen_tree, feature_names=feature_names, class_names=class_names, filled=True)

In [None]:
def plot_feature_importances(model):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : estimator
        Fitted estimator exposing ``feature_importances_``. If it also exposes
        ``feature_names_in_``, those names label the bars; otherwise the
        notebook-level ``bunch.feature_names`` is used.

    The chart is drawn through ``plt`` on the current figure; nothing is
    returned.
    """
    importances = model.feature_importances_
    # Size and label the chart from the model itself so the bars cannot get out
    # of step with a separately maintained global.
    n_features = len(importances)
    names = getattr(model, 'feature_names_in_', None)
    if names is None:
        # Fallback for models that do not record their input feature names.
        names = bunch.feature_names
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

# Chart the feature importances of the fitted random forest `clf`.
plot_feature_importances(clf)


In [None]:
# X_train = X_train.to_numpy()

In [None]:

# import mglearn
# fig, axes = plt.subplots(2, 3, figsize=(20, 10))
# for i, (ax, tree) in enumerate(zip(axes.ravel(), clf.estimators_)):
#     ax.set_title("tree {}".format(i))
#     mglearn.plots.plot_tree_partition(X_train, y_train, tree, ax=ax)

# mglearn.plots.plot_2d_separator(clf, X_train, fill=True, ax=axes[-1, -1], alpha=.4)
# axes[-1, -1].set_title("Random Forest")
# mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)


In [None]:
# print(type(X_train))
# print(X_train.shape)
