Data loading

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
# Raise pandas' printing limits so wide feature frames are not truncated in cell output.
for option_name, option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [45]:
# Training split: the first CSV column becomes the index, 'label' is the target.
X = pd.read_csv('/mnt/c/PROJECTS/PRO/training_set.csv', index_col=0)
# Missing labels are mapped to the literal class name 'None'
# (presumably read_csv parsed the text "None" as NaN -- TODO confirm against the CSV).
y = X.pop('label').fillna('None')
# Every remaining column is cast to float and used as a feature.
X = X.astype(float)

In [46]:
# Validation split, prepared the same way as the training split.
X_val = pd.read_csv('/mnt/c/PROJECTS/PRO/validation_set.csv', index_col=0)
# Missing labels become the literal class name 'None'.
y_val = X_val['label'].fillna('None')
# Drop the target column and cast the remaining columns to float.
X_val = X_val.drop(columns='label').astype(float)

In [47]:
# Hold-out test split, prepared the same way as the training split.
X_test = pd.read_csv('/mnt/c/PROJECTS/PRO/test_set.csv', index_col=0)
# Separate the target; missing labels become the literal class name 'None'.
y_test = X_test.pop('label').fillna('None')
# Remaining columns are the float features.
X_test = X_test.astype(float)

Random Forest without tuning and checking feature importance

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [49]:
# Oversample with SMOTE using only the training split (X, y); the resampled
# X_train / y_train are what the baseline forest below is fitted on.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X, y)

# Per-class sample counts after resampling.
print(y_train.value_counts())


label
topics            14477
None              14477
negative          14477
affirmative       14477
relative          14477
yn_question       14477
conditional       14477
doubt_question    14477
emphasis          14477
wh_question       14477
Name: count, dtype: int64


In [50]:
# Baseline forest with default hyper-parameters, fitted on the SMOTE-resampled training data.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Permutation importance is measured on the test split (10 shuffles per feature).
perm_importance = permutation_importance(
    rf, X_test, y_test, n_repeats=10, random_state=42
)

# One row per original feature, most important first.
importances = pd.DataFrame(
    {'Feature': X.columns, 'Importance': perm_importance.importances_mean}
)
importances = importances.sort_values(by='Importance', ascending=False)

print(importances.head(30))

                          Feature  Importance
239              79z_face_contour    0.002183
91              30y_right_eyebrow    0.001718
14                    4z_left_eye    0.001396
185                     61z_mouth    0.001324
48               16x_left_eyebrow    0.001288
173                     57z_mouth    0.001253
168                     56x_mouth    0.001181
55               18y_left_eyebrow    0.001181
158                     52z_mouth    0.001110
248              82z_face_contour    0.001110
182                     60z_mouth    0.001074
17                    5z_left_eye    0.001038
16                    5y_left_eye    0.001038
89              29z_right_eyebrow    0.000966
18                    6x_left_eye    0.000931
216              72x_face_contour    0.000931
109                      36y_nose    0.000895
166                     55y_mouth    0.000859
285  95x_line_above_right_eyebrow    0.000859
8                     2z_left_eye    0.000823
103             34y_right_eyebrow 

In [51]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Score the baseline forest on the test split.
y_pred = rf.predict(X_test)

report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Per-class metrics first, then the overall accuracy.
print(report)
print(f"Accuracy: {accuracy:.2f}")


                precision    recall  f1-score   support

          None       0.97      0.93      0.95      1788
   affirmative       0.78      0.92      0.84        83
   conditional       0.82      0.88      0.85        95
doubt_question       0.94      0.96      0.95       148
      emphasis       0.91      0.93      0.92        87
      negative       0.89      0.94      0.91       125
      relative       0.91      0.92      0.91       147
        topics       0.87      0.89      0.88        90
   wh_question       0.88      0.92      0.90       114
   yn_question       0.82      0.95      0.88       117

      accuracy                           0.93      2794
     macro avg       0.88      0.92      0.90      2794
  weighted avg       0.93      0.93      0.93      2794

Accuracy: 0.93


Adding features

In [29]:
# Work on a copy so the engineered columns are added without touching the raw training frame X.
X_added = X.copy()

In [30]:
def calculate_distances(points):
    """Return the Euclidean distance between each pair of consecutive points.

    Parameters
    ----------
    points : sequence
        Sliceable sequence of points; only the coordinates at positions
        0, 1 and 2 of each point are used.

    Returns
    -------
    list
        ``len(points) - 1`` distances; empty when there are fewer than two points.
    """
    return [
        np.sqrt((nxt[0] - cur[0])**2 +
                (nxt[1] - cur[1])**2 +
                (nxt[2] - cur[2])**2)
        for cur, nxt in zip(points, points[1:])
    ]

In [52]:
import numpy as np
import pandas as pd

def calculate_total_distance(df, lower_bound, upper_bound, suffix, axis_labels=("x", "y", "z")):
    """Return, for every row, the length of the polyline through consecutive landmarks.

    The landmarks are the points numbered ``lower_bound`` .. ``upper_bound``
    (inclusive); the coordinate of point ``i`` on axis ``a`` is read from the
    column named ``f"{i}{a}{suffix}"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one numeric column per landmark coordinate.
    lower_bound, upper_bound : int
        First and last landmark index of the polyline.
    suffix : str
        Text appended after the axis label in the column names.
    axis_labels : iterable of str, optional
        Axis labels used in the column names (default ``x``, ``y``, ``z``).

    Returns
    -------
    list of float
        One total length per row of ``df``, in row order. A polyline with
        fewer than two points has length 0.

    Raises
    ------
    KeyError
        If any expected coordinate column is missing from ``df``.
    """
    axes = list(axis_labels)
    n_points = max(upper_bound - lower_bound + 1, 0)

    # Column order is point-major, axis-minor, so a C-order reshape yields
    # an array indexed as (row, point, axis).
    columns = [
        f"{point}{axis}{suffix}"
        for point in range(lower_bound, upper_bound + 1)
        for axis in axes
    ]
    coords = df[columns].to_numpy(dtype=float).reshape(len(df), n_points, len(axes))

    # Length of each segment between neighbouring points, then the sum per row.
    segment_lengths = np.sqrt((np.diff(coords, axis=1) ** 2).sum(axis=2))
    return segment_lengths.sum(axis=1).tolist()


In [53]:
def calculate_distance_between_two_points(df, point1, suffix1, point2, suffix2, axis_labels=("x", "y", "z")):
    """Return, for every row, the Euclidean distance between two landmarks.

    The coordinate of a landmark on axis ``a`` is read from the column named
    ``f"{point}{a}{suffix}"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one numeric column per landmark coordinate.
    point1, suffix1 : landmark index and column-name suffix of the first point.
    point2, suffix2 : landmark index and column-name suffix of the second point.
    axis_labels : iterable of str, optional
        Axis labels used in the column names (default ``x``, ``y``, ``z``).

    Returns
    -------
    list of float
        One distance per row of ``df``, in row order.

    Raises
    ------
    KeyError
        If any expected coordinate column is missing from ``df``.
    """
    axes = list(axis_labels)
    # Each array has one row per sample and one column per axis.
    coords1 = df[[f"{point1}{axis}{suffix1}" for axis in axes]].to_numpy(dtype=float)
    coords2 = df[[f"{point2}{axis}{suffix2}" for axis in axes]].to_numpy(dtype=float)
    return np.sqrt(((coords2 - coords1) ** 2).sum(axis=1)).tolist()

In [54]:
# Polyline length along each eyebrow: column -> (first landmark, last landmark, suffix).
for column, (first, last, suffix) in {
    "left_eyebrow_length": (16, 20, "_left_eyebrow"),
    "right_eyebrow_length": (26, 30, "_right_eyebrow"),
}.items():
    X_added[column] = calculate_total_distance(X, first, last, suffix)

# Straight-line distance between two landmarks: column -> (point, suffix, point, suffix).
for column, (point_a, suffix_a, point_b, suffix_b) in {
    "mouth_length": (48, "_mouth", 54, "_mouth"),
    "left_eye_width": (4, "_left_eye", 0, "_left_eye"),
    "right_eye_width": (12, "_right_eye", 8, "_right_eye"),
    "nose_to_left_eye": (89, "_nose_tip", 0, "_left_eye"),
    "nose_to_right_eye": (89, "_nose_tip", 8, "_right_eye"),
}.items():
    X_added[column] = calculate_distance_between_two_points(X, point_a, suffix_a, point_b, suffix_b)

# "nose_to_mouth_center" is the mean of the nose-tip distances to mouth landmarks 51 and 57.
nose_to_51 = calculate_distance_between_two_points(X, 89, "_nose_tip", 51, "_mouth")
nose_to_57 = calculate_distance_between_two_points(X, 89, "_nose_tip", 57, "_mouth")
X_added["nose_to_mouth_center"] = [
    (to_51 + to_57) / 2 for to_51, to_57 in zip(nose_to_51, nose_to_57)
]


In [55]:
# Apply the same distance features to the validation and test frames.
X_val_added = X_val.copy()
X_test_added = X_test.copy()

for raw, enriched in ((X_val, X_val_added), (X_test, X_test_added)):
    # Columns are added in the same order as for the training frame.
    enriched["left_eyebrow_length"] = calculate_total_distance(raw, 16, 20, "_left_eyebrow")
    enriched["right_eyebrow_length"] = calculate_total_distance(raw, 26, 30, "_right_eyebrow")
    enriched["mouth_length"] = calculate_distance_between_two_points(raw, 48, "_mouth", 54, "_mouth")
    enriched["left_eye_width"] = calculate_distance_between_two_points(raw, 4, "_left_eye", 0, "_left_eye")
    enriched["right_eye_width"] = calculate_distance_between_two_points(raw, 12, "_right_eye", 8, "_right_eye")
    enriched["nose_to_left_eye"] = calculate_distance_between_two_points(raw, 89, "_nose_tip", 0, "_left_eye")
    enriched["nose_to_right_eye"] = calculate_distance_between_two_points(raw, 89, "_nose_tip", 8, "_right_eye")

# Mean of the nose-tip distances to mouth landmarks 51 and 57 (validation).
nose_to_51_val = calculate_distance_between_two_points(X_val, 89, "_nose_tip", 51, "_mouth")
nose_to_57_val = calculate_distance_between_two_points(X_val, 89, "_nose_tip", 57, "_mouth")
X_val_added["nose_to_mouth_center"] = [
    (to_51 + to_57) / 2 for to_51, to_57 in zip(nose_to_51_val, nose_to_57_val)
]

# Same feature for the test frame.
nose_to_51_test = calculate_distance_between_two_points(X_test, 89, "_nose_tip", 51, "_mouth")
nose_to_57_test = calculate_distance_between_two_points(X_test, 89, "_nose_tip", 57, "_mouth")
X_test_added["nose_to_mouth_center"] = [
    (to_51 + to_57) / 2 for to_51, to_57 in zip(nose_to_51_test, nose_to_57_test)
]

In [35]:
def calculate_angle_between_three_points(df, point1, suffix1, point2, suffix2, point3, suffix3, axis_labels=("x", "y", "z")):
    """Return, for every row, the angle in degrees at ``point2`` between the
    vectors pointing to ``point1`` and to ``point3``.

    The coordinate of a landmark on axis ``a`` is read from the column named
    ``f"{point}{a}{suffix}"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one numeric column per landmark coordinate.
    point1, suffix1 : landmark index and column-name suffix of the first arm end.
    point2, suffix2 : landmark index and column-name suffix of the vertex.
    point3, suffix3 : landmark index and column-name suffix of the second arm end.
    axis_labels : iterable of str, optional
        Axis labels used in the column names (default ``x``, ``y``, ``z``).

    Returns
    -------
    list of float
        One angle in ``[0, 180]`` per row, in row order. The angle is NaN when
        either arm has zero length, because the angle is undefined there.

    Raises
    ------
    KeyError
        If any expected coordinate column is missing from ``df``.
    """
    axes = list(axis_labels)

    def _coords(point, suffix):
        # One row per sample, one column per axis.
        return df[[f"{point}{axis}{suffix}" for axis in axes]].to_numpy(dtype=float)

    vertex = _coords(point2, suffix2)
    vector1 = _coords(point1, suffix1) - vertex
    vector2 = _coords(point3, suffix3) - vertex

    dot_product = (vector1 * vector2).sum(axis=1)
    norm_product = np.linalg.norm(vector1, axis=1) * np.linalg.norm(vector2, axis=1)

    # A zero-length arm gives 0/0 -> NaN; that result is intentional, so the
    # floating-point warning is silenced rather than raised.
    with np.errstate(divide="ignore", invalid="ignore"):
        cosine = dot_product / norm_product
        # Rounding can push the cosine of (near-)collinear points slightly outside
        # [-1, 1], which would make arccos return NaN; clip keeps NaN as NaN.
        angle = np.arccos(np.clip(cosine, -1.0, 1.0))

    return np.degrees(angle).tolist()


In [36]:
# Angle features for the training frame:
# column -> (arm point, suffix, vertex point, suffix, arm point, suffix).
for column, landmarks in {
    "left_eyebrow_angle": (16, "_left_eyebrow", 18, "_left_eyebrow", 20, "_left_eyebrow"),
    "right_eyebrow_angle": (26, "_right_eyebrow", 28, "_right_eyebrow", 30, "_right_eyebrow"),
    "nose_to_eyes_angle": (0, "_left_eye", 89, "_nose_tip", 8, "_right_eye"),
    "mouth_angle": (48, "_mouth", 51, "_mouth", 54, "_mouth"),
}.items():
    X_added[column] = calculate_angle_between_three_points(X, *landmarks)



In [58]:

# Same four angle features for the validation and test frames, added in the same
# column order as for the training frame.
for raw, enriched in ((X_val, X_val_added), (X_test, X_test_added)):
    for column, landmarks in {
        "left_eyebrow_angle": (16, "_left_eyebrow", 18, "_left_eyebrow", 20, "_left_eyebrow"),
        "right_eyebrow_angle": (26, "_right_eyebrow", 28, "_right_eyebrow", 30, "_right_eyebrow"),
        "nose_to_eyes_angle": (0, "_left_eye", 89, "_nose_tip", 8, "_right_eye"),
        "mouth_angle": (48, "_mouth", 51, "_mouth", 54, "_mouth"),
    }.items():
        enriched[column] = calculate_angle_between_three_points(raw, *landmarks)

In [59]:
# Preview the first rows of the test frame after feature engineering.
print(X_test_added.head())

       0x_left_eye  0y_left_eye  0z_left_eye  1x_left_eye  1y_left_eye  1z_left_eye  2x_left_eye  2y_left_eye  2z_left_eye  3x_left_eye  3y_left_eye  3z_left_eye  4x_left_eye  4y_left_eye  4z_left_eye  5x_left_eye  5y_left_eye  5z_left_eye  6x_left_eye  6y_left_eye  6z_left_eye  7x_left_eye  7y_left_eye  7z_left_eye  8x_right_eye  8y_right_eye  8z_right_eye  9x_right_eye  9y_right_eye  9z_right_eye  10x_right_eye  10y_right_eye  10z_right_eye  11x_right_eye  11y_right_eye  11z_right_eye  12x_right_eye  12y_right_eye  12z_right_eye  13x_right_eye  13y_right_eye  13z_right_eye  14x_right_eye  14y_right_eye  14z_right_eye  15x_right_eye  15y_right_eye  15z_right_eye  16x_left_eyebrow  16y_left_eyebrow  16z_left_eyebrow  17x_left_eyebrow  17y_left_eyebrow  17z_left_eyebrow  18x_left_eyebrow  18y_left_eyebrow  18z_left_eyebrow  19x_left_eyebrow  19y_left_eyebrow  19z_left_eyebrow  20x_left_eyebrow  20y_left_eyebrow  20z_left_eyebrow  21x_left_eyebrow  21y_left_eyebrow  21z_left_eyebrow  \
3

Checking the impact of new features

In [37]:
# Names of the engineered features, grouped by kind.
distance_features = [
    "left_eyebrow_length", "right_eyebrow_length",
    "mouth_length",
    "left_eye_width", "right_eye_width",
    "nose_to_left_eye", "nose_to_right_eye",
    "nose_to_mouth_center",
]
angle_features = [
    "left_eyebrow_angle", "right_eyebrow_angle",
    "nose_to_eyes_angle",
    "mouth_angle",
]
new_columns = distance_features + angle_features


# Training frame restricted to the engineered columns only (no raw landmark coordinates).
X_new_features = X_added[new_columns]


In [60]:
# Oversample the training set including the engineered columns.
# NOTE(review): when cells run top to bottom, the following cell rebinds
# X_train / y_train before they are used, so this result is discarded there.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_added, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Experiment on the engineered features alone: hold out 20% of the TRAINING data.
# NOTE(review): this rebinds X_test and y_test, which until now held the data
# loaded from test_set.csv. Later cells that score against y_test only see the
# real hold-out labels if this cell has not been run in the same kernel session.
X_train, X_test, y_train, y_test = train_test_split(X_new_features, y, test_size=0.2, random_state=42)


# Oversample only the training part of the split.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)



# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X_new_features.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importance)


                 Feature  Importance
5       nose_to_left_eye    0.116420
0    left_eyebrow_length    0.104753
6      nose_to_right_eye    0.102294
2           mouth_length    0.090284
3         left_eye_width    0.089530
10    nose_to_eyes_angle    0.086319
7   nose_to_mouth_center    0.081780
1   right_eyebrow_length    0.075902
4        right_eye_width    0.071591
8     left_eyebrow_angle    0.064779
11           mouth_angle    0.058763
9    right_eyebrow_angle    0.057584


In [39]:
from sklearn.metrics import accuracy_score

# Accuracy of the current forest on whatever X_test / y_test are bound to.
# For the engineered-features-only forest these must come from the
# train_test_split cell, not from test_set.csv.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.80


In [61]:
from sklearn.ensemble import RandomForestClassifier


# Rebalance the full feature set (raw landmarks plus engineered columns) before fitting.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_added, y)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Impurity-based importances from the fitted forest, most important first.
feature_importance = pd.DataFrame(
    {'Feature': X_added.columns, 'Importance': rf.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

print(feature_importance.head(30))

                         Feature  Importance
224             74z_face_contour    0.011153
277  92y_line_above_left_eyebrow    0.009545
282  94x_line_above_left_eyebrow    0.008532
227             75z_face_contour    0.008273
274  91y_line_above_left_eyebrow    0.008163
236             78z_face_contour    0.007291
305             nose_to_left_eye    0.006946
191                    63z_mouth    0.006916
154                    51y_mouth    0.006516
239             79z_face_contour    0.006504
197                    65z_mouth    0.006494
69              23x_left_eyebrow    0.006462
230             76z_face_contour    0.006438
271  90y_line_above_left_eyebrow    0.006415
194                    64z_mouth    0.006357
58              19y_left_eyebrow    0.006274
196                    65y_mouth    0.006247
70              23y_left_eyebrow    0.006213
276  92x_line_above_left_eyebrow    0.006170
188                    62z_mouth    0.006136
95             31z_right_eyebrow    0.005823
232       

In [63]:
from sklearn.metrics import accuracy_score

# Score the forest trained on all features against the engineered test frame.
# NOTE(review): y_test must still hold the labels loaded from test_set.csv here;
# the earlier train_test_split cell rebinds y_test if it has been run.
y_pred = rf.predict(X_test_added)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.93


In [64]:
# Per-class precision / recall / F1 for the predictions made in the previous cell.
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

          None       0.97      0.93      0.95      1788
   affirmative       0.80      0.92      0.85        83
   conditional       0.81      0.89      0.85        95
doubt_question       0.94      0.96      0.95       148
      emphasis       0.89      0.94      0.92        87
      negative       0.91      0.96      0.93       125
      relative       0.88      0.93      0.90       147
        topics       0.89      0.88      0.88        90
   wh_question       0.88      0.92      0.90       114
   yn_question       0.84      0.96      0.90       117

      accuracy                           0.93      2794
     macro avg       0.88      0.93      0.90      2794
  weighted avg       0.94      0.93      0.93      2794



In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd

# Standardise with statistics learned from the training frame only.
# X_train / X_test are rebound to the scaled arrays.
scaler = StandardScaler().fit(X_added)
X_train = scaler.transform(X_added)
X_test = scaler.transform(X_test_added)

# Oversample the scaled training data.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Column names come from X_added because the scaled data has lost them.
feature_importance = pd.DataFrame(
    {'Feature': X_added.columns, 'Importance': rf.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

print(feature_importance.head(30))


                          Feature  Importance
224              74z_face_contour    0.009113
227              75z_face_contour    0.008859
280   93y_line_above_left_eyebrow    0.008531
58               19y_left_eyebrow    0.008137
197                     65z_mouth    0.007796
221              73z_face_contour    0.007722
277   92y_line_above_left_eyebrow    0.007672
233              77z_face_contour    0.007456
154                     51y_mouth    0.007440
305              nose_to_left_eye    0.007378
279   93x_line_above_left_eyebrow    0.007310
161                     53z_mouth    0.007126
299  99z_line_above_right_eyebrow    0.006687
301          right_eyebrow_length    0.006626
70               23y_left_eyebrow    0.006568
169                     56y_mouth    0.006322
230              76z_face_contour    0.006248
196                     65y_mouth    0.006193
236              78z_face_contour    0.006161
274   91y_line_above_left_eyebrow    0.006149
292  97y_line_above_right_eyebrow 

In [69]:
from sklearn.metrics import accuracy_score

# Accuracy of the forest trained on scaled features, using the X_test produced by the scaling cell.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.93


In [70]:
# Per-class precision / recall / F1 for the predictions made in the previous cell.
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

          None       0.97      0.94      0.95      1788
   affirmative       0.85      0.89      0.87        83
   conditional       0.83      0.89      0.86        95
doubt_question       0.93      0.95      0.94       148
      emphasis       0.91      0.92      0.91        87
      negative       0.91      0.95      0.93       125
      relative       0.89      0.92      0.91       147
        topics       0.88      0.86      0.87        90
   wh_question       0.87      0.95      0.91       114
   yn_question       0.84      0.95      0.89       117

      accuracy                           0.93      2794
     macro avg       0.89      0.92      0.90      2794
  weighted avg       0.94      0.93      0.94      2794



Random Search

In [76]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np
import random

# Seed both the stdlib and NumPy global generators so the search is repeatable.
seed = 48
random.seed(seed)
np.random.seed(seed)

# Fit the scaler on the training frame only and apply it to all three splits.
# X_train / X_val / X_test are rebound to the scaled arrays.
scaler = StandardScaler().fit(X_added)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_added, X_val_added, X_test_added)
)

# Oversample the scaled training data.
smote = SMOTE(random_state=seed)
X_train, y_train = smote.fit_resample(X_train, y)

# Candidate values per hyper-parameter; key order fixes the order of random draws.
param_distributions = {
    'pca__n_components': [0.8, 0.85, 0.9, 0.95, 0.99],
    'rf__n_estimators': [50, 100, 150, 200],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__criterion': ['gini', 'entropy'],
}

def get_random_params(param_distributions):
    """Draw one candidate value per hyper-parameter.

    ``param_distributions`` maps a parameter name to a sequence of candidate
    values. Returns a dict with the same keys, each mapped to one value picked
    with ``random.choice`` (keys are visited in the mapping's iteration order).
    """
    sampled = {}
    for name, candidates in param_distributions.items():
        sampled[name] = random.choice(candidates)
    return sampled



n_iter = 20
high_accuracy_models = []

# Manual random search: each iteration draws one configuration, fits PCA and a forest
# on the resampled training data, and scores it on the validation split.
for i in range(n_iter):
    params = get_random_params(param_distributions)

    # PCA is fitted on the training matrix; validation rows are only projected.
    pca = PCA(n_components=params['pca__n_components'])
    X_train_pca = pca.fit_transform(X_train)
    X_val_pca = pca.transform(X_val)

    # Every 'rf__*' entry is passed to the forest under its bare keyword name.
    rf_kwargs = {
        name[len('rf__'):]: value
        for name, value in params.items()
        if name.startswith('rf__')
    }
    rf = RandomForestClassifier(random_state=seed, **rf_kwargs)
    rf.fit(X_train_pca, y_train)

    y_val_pred = rf.predict(X_val_pca)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    # Keep the fitted model and its PCA when validation accuracy reaches 90%.
    if val_accuracy >= 0.90:
        high_accuracy_models.append((rf, pca, val_accuracy))

    print(
        f"Iteration {i+1}/{n_iter}:",
        f"Parameters: {params}",
        f"Validation Accuracy: {val_accuracy:.4f}\n",
        sep="\n",
    )

print(f"\nNumber of models with validation accuracy >= 90%: {len(high_accuracy_models)}")

# Every retained model is scored on the test split using its own fitted PCA.
for idx, (model, pca, val_acc) in enumerate(high_accuracy_models, start=1):
    X_test_pca = pca.transform(X_test)
    y_test_pred = model.predict(X_test_pca)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(
        f"\nModel {idx}:",
        f"Validation Accuracy: {val_acc:.4f}",
        f"Test Accuracy: {test_accuracy:.4f}",
        "\nClassification Report on Test Set:",
        classification_report(y_test, y_test_pred),
        sep="\n",
    )


Iteration 1/20:
Parameters: {'pca__n_components': 0.99, 'rf__n_estimators': 150, 'rf__max_depth': 10, 'rf__min_samples_leaf': 4, 'rf__criterion': 'entropy'}
Validation Accuracy: 0.7183

Iteration 2/20:
Parameters: {'pca__n_components': 0.99, 'rf__n_estimators': 100, 'rf__max_depth': 30, 'rf__min_samples_leaf': 1, 'rf__criterion': 'gini'}
Validation Accuracy: 0.8890

Iteration 3/20:
Parameters: {'pca__n_components': 0.8, 'rf__n_estimators': 200, 'rf__max_depth': 10, 'rf__min_samples_leaf': 1, 'rf__criterion': 'gini'}
Validation Accuracy: 0.5601

Iteration 4/20:
Parameters: {'pca__n_components': 0.95, 'rf__n_estimators': 100, 'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__criterion': 'entropy'}
Validation Accuracy: 0.8747

Iteration 5/20:
Parameters: {'pca__n_components': 0.8, 'rf__n_estimators': 100, 'rf__max_depth': 30, 'rf__min_samples_leaf': 4, 'rf__criterion': 'gini'}
Validation Accuracy: 0.6779

Iteration 6/20:
Parameters: {'pca__n_components': 0.85, 'rf__n_estimators': 50,

In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np
import random

# Same seeding as the PCA search so both searches are repeatable.
seed = 48
random.seed(seed)
np.random.seed(seed)

# Scaler statistics come from the training frame; all three splits are rebound
# to their scaled arrays.
scaler = StandardScaler()
scaler.fit(X_added)
X_train = scaler.transform(X_added)
X_val = scaler.transform(X_val_added)
X_test = scaler.transform(X_test_added)

# Oversample the scaled training data.
smote = SMOTE(random_state=seed)
X_train, y_train = smote.fit_resample(X_train, y)

# Forest-only search space (no PCA step); key order fixes the order of random draws.
param_distributions = {
    'rf__n_estimators': [50, 100, 150, 200],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__criterion': ['gini', 'entropy'],
}

def get_random_params(param_distributions):
    """Pick one random candidate for every key of ``param_distributions``.

    Each value of the mapping is a sequence of candidates; the result maps the
    same keys, in the same order, to one ``random.choice`` draw each.
    """
    return dict(
        (name, random.choice(candidates))
        for name, candidates in param_distributions.items()
    )


n_iter = 20
high_accuracy_models = []

# Manual random search over forest hyper-parameters, scored on the validation split.
for i in range(n_iter):
    params = get_random_params(param_distributions)

    # All keys carry the 'rf__' prefix; strip it to get the estimator keyword names.
    rf_kwargs = {name.split('__', 1)[1]: value for name, value in params.items()}
    rf = RandomForestClassifier(random_state=seed, **rf_kwargs)
    rf.fit(X_train, y_train)

    y_val_pred = rf.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    # Keep the fitted model when validation accuracy reaches 90%.
    if val_accuracy >= 0.90:
        high_accuracy_models.append((rf, val_accuracy))

    print(
        f"Iteration {i+1}/{n_iter}:",
        f"Parameters: {params}",
        f"Validation Accuracy: {val_accuracy:.4f}\n",
        sep="\n",
    )

print(f"\nNumber of models with validation accuracy >= 90%: {len(high_accuracy_models)}")

# Every retained model is scored on the test split.
for idx, (model, val_acc) in enumerate(high_accuracy_models, start=1):
    y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(
        f"\nModel {idx}:",
        f"Validation Accuracy: {val_acc:.4f}",
        f"Test Accuracy: {test_accuracy:.4f}",
        "\nClassification Report on Test Set:",
        classification_report(y_test, y_test_pred),
        sep="\n",
    )


Iteration 1/20:
Parameters: {'rf__n_estimators': 150, 'rf__max_depth': 10, 'rf__min_samples_leaf': 4, 'rf__criterion': 'entropy'}
Validation Accuracy: 0.7359

Iteration 2/20:
Parameters: {'rf__n_estimators': 100, 'rf__max_depth': 30, 'rf__min_samples_leaf': 1, 'rf__criterion': 'gini'}
Validation Accuracy: 0.9341

Iteration 3/20:
Parameters: {'rf__n_estimators': 50, 'rf__max_depth': 30, 'rf__min_samples_leaf': 4, 'rf__criterion': 'gini'}
Validation Accuracy: 0.9148

Iteration 4/20:
Parameters: {'rf__n_estimators': 100, 'rf__max_depth': 10, 'rf__min_samples_leaf': 2, 'rf__criterion': 'gini'}
Validation Accuracy: 0.6074

Iteration 5/20:
Parameters: {'rf__n_estimators': 50, 'rf__max_depth': None, 'rf__min_samples_leaf': 2, 'rf__criterion': 'gini'}
Validation Accuracy: 0.9266

Iteration 6/20:
Parameters: {'rf__n_estimators': 100, 'rf__max_depth': 30, 'rf__min_samples_leaf': 4, 'rf__criterion': 'gini'}
Validation Accuracy: 0.9159

Iteration 7/20:
Parameters: {'rf__n_estimators': 100, 'rf__ma

Best model

In [79]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd

# Final model: default-parameter forest on scaled, SMOTE-resampled features.
# Scaler statistics come from the training frame only.
scaler = StandardScaler()
scaler.fit(X_added)
X_train = scaler.transform(X_added)
X_test = scaler.transform(X_test_added)

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)



In [80]:
from sklearn.metrics import accuracy_score

# Test accuracy of the final forest, printed with four decimals.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9349


In [81]:
# Per-class precision / recall / F1 of the final forest on the test split.
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

          None       0.97      0.94      0.95      1788
   affirmative       0.85      0.89      0.87        83
   conditional       0.83      0.89      0.86        95
doubt_question       0.93      0.95      0.94       148
      emphasis       0.91      0.92      0.91        87
      negative       0.91      0.95      0.93       125
      relative       0.89      0.92      0.91       147
        topics       0.88      0.86      0.87        90
   wh_question       0.87      0.95      0.91       114
   yn_question       0.84      0.95      0.89       117

      accuracy                           0.93      2794
     macro avg       0.89      0.92      0.90      2794
  weighted avg       0.94      0.93      0.94      2794



In [82]:
# Show the full hyper-parameter set of the final forest for the record.
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}