In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Data Preprocessing:

Load the dataset.

In [2]:
df = pd.read_csv('gender_classification_v72.csv')

In [3]:
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


In [4]:
df.shape

(5001, 8)

### Convert categorical data ('gender') to numerical values.

In [5]:
df['gender'] = df['gender'].apply(lambda x: 1 if x == 'Male' else 0)

In [6]:
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,1
1,0,14.0,5.4,0,0,1,0,0
2,0,11.8,6.3,1,1,1,1,1
3,0,14.4,6.1,0,1,1,1,1
4,1,13.5,5.9,0,0,0,0,0


### Split the dataset into features (X) and the target variable (y)

In [7]:
y = df['gender']

In [8]:
X = df.drop('gender',axis=1)

In [9]:
y

0       1
1       0
2       1
3       1
4       0
       ..
4996    0
4997    0
4998    0
4999    0
5000    1
Name: gender, Length: 5001, dtype: int64

In [10]:
X

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
0,1,11.8,6.1,1,0,1,1
1,0,14.0,5.4,0,0,1,0
2,0,11.8,6.3,1,1,1,1
3,0,14.4,6.1,0,1,1,1
4,1,13.5,5.9,0,0,0,0
...,...,...,...,...,...,...,...
4996,1,13.6,5.1,0,0,0,0
4997,1,11.9,5.4,0,0,0,0
4998,1,12.9,5.7,0,0,0,0
4999,1,13.2,6.2,0,0,0,0


### Train-Test Split:


Split the data into training and testing sets (e.g., 80% training, 20% testing).
Print the lengths of the training and testing sets.

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=1)

In [13]:
X_train.shape

(4000, 7)

In [14]:
X_test.shape

(1001, 7)

In [15]:
y_train.shape

(4000,)

In [16]:
y_test.shape

(1001,)

### Scaling Data:


Standardize the features using StandardScaler from scikit-learn.
Print the first five rows of the scaled features.

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
scaler = StandardScaler()

In [19]:
X_train = scaler.fit_transform(X_train)

In [20]:
X_test = scaler.transform(X_test)

In [21]:
X_train[0:5]

array([[ 0.38655567, -0.79194941,  1.212875  ,  1.01106117,  0.97676975,
         1.01969389,  1.00050013],
       [-2.5869495 ,  0.19860645, -0.82331486, -0.98905984, -1.02378273,
        -0.98068647, -0.99950012],
       [ 0.38655567, -1.24220208, -0.08288218,  1.01106117,  0.97676975,
         1.01969389,  1.00050013],
       [ 0.38655567,  1.81951605,  1.7681995 ,  1.01106117,  0.97676975,
         1.01969389,  1.00050013],
       [ 0.38655567,  0.82896018, -0.45309852,  1.01106117,  0.97676975,
         1.01969389,  1.00050013]])

In [22]:
X_test[0:5]

array([[ 0.38655567, -0.34169675,  0.65755049, -0.98905984, -1.02378273,
         1.01969389, -0.99950012],
       [ 0.38655567, -0.79194941, -0.26799035,  1.01106117,  0.97676975,
         1.01969389,  1.00050013],
       [ 0.38655567, -1.51235368, -1.00842302, -0.98905984, -1.02378273,
        -0.98068647, -0.99950012],
       [ 0.38655567, -0.79194941,  0.47244232, -0.98905984, -1.02378273,
         1.01969389,  1.00050013],
       [ 0.38655567, -0.88199995, -0.63820669, -0.98905984, -1.02378273,
        -0.98068647, -0.99950012]])

### Decision Tree Model

### Implement a Decision Tree classifier using DecisionTreeClassifier from scikit-learn.

In [23]:
from sklearn.tree import DecisionTreeClassifier

### Perform Hyperparameter Optimization (HPO) using GridSearchCV to find the best hyperparameters.

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
dt_grid = DecisionTreeClassifier()

In [26]:
dt_params = {
            'max_depth' : [None,5,10,15],
             'min_samples_split':[2,5,10],
             'min_samples_leaf':[1,2,4] 
}

dt_grid_model = GridSearchCV(dt_grid,dt_params, cv=5, scoring='accuracy')

### Train the Decision Tree model on the training set.

In [27]:
dt_grid_model.fit(X_train,y_train)

### Print the best parameters and the corresponding cross-validation accuracy.

In [28]:
print("Best Parameters for Decision Tree Classifier using Grid Search :", dt_grid_model.best_params_)
print("Best Score for Decision Tree Classifier using Grid Search :", dt_grid_model.best_score_)

Best Parameters for Decision Tree Classifier using Grid Search : {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best Score for Decision Tree Classifier using Grid Search : 0.9717500000000001


### Make predictions on the testing set.

In [29]:
pred_dt = dt_grid_model.predict(X_test)

In [30]:
pred_dt

array([0, 1, 0, ..., 1, 1, 0], dtype=int64)

### Evaluate the performance using accuracy, precision, recall, and confusion matrix.

In [31]:
from sklearn.metrics import accuracy_score,precision_score,recall_score, classification_report, confusion_matrix

In [32]:
accuracy_score(y_test, pred_dt)

0.965034965034965

In [33]:
precision_score(y_test, pred_dt)

0.9935897435897436

In [34]:
recall_score(y_test, pred_dt)

0.9356136820925554

In [35]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.94      0.99      0.97       504
           1       0.99      0.94      0.96       497

    accuracy                           0.97      1001
   macro avg       0.97      0.96      0.96      1001
weighted avg       0.97      0.97      0.96      1001



In [36]:
confusion_matrix(y_test,pred_dt)

array([[501,   3],
       [ 32, 465]], dtype=int64)

### Random Forest Model:


Implement a Random Forest classifier using RandomForestClassifier from scikit-learn.



In [37]:
from sklearn.ensemble import RandomForestClassifier

### Perform Hyperparameter Optimization (HPO) using GridSearchCV to find the best hyperparameters.
Print the best parameters and the corresponding cross-validation accuracy.

In [38]:
rf_grid = RandomForestClassifier()

In [39]:
rf_params = {'n_estimators': [50, 100, 200],
            'max_depth': [None,5,10,15],
            'min_samples_split':[2,5,10],
            'min_samples_leaf':[1,2,4] 
            
}

rf_grid_model = GridSearchCV(rf_grid,rf_params, cv=5, scoring='accuracy')

### Train the Random Forest model on the training set.


In [40]:
rf_grid_model.fit(X_train,y_train)

In [41]:
print("Best Parameters for Random Forest Classifier using Grid Search :", rf_grid_model.best_params_)
print("Best Score for Random Forest Classifier using Grid Search :", rf_grid_model.best_score_)

Best Parameters for Random Forest Classifier using Grid Search : {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Score for Random Forest Classifier using Grid Search : 0.976


### Make predictions on the testing set.

In [42]:
pred_rf = rf_grid_model.predict(X_test)

In [43]:
pred_rf

array([0, 1, 0, ..., 1, 1, 0], dtype=int64)

### Evaluate the performance using accuracy, precision, recall, and confusion matrix.

In [44]:
accuracy_score(y_test, pred_rf)

0.9760239760239761

In [45]:
precision_score(y_test, pred_rf)

0.9937369519832986

In [46]:
recall_score(y_test, pred_dt)

0.9356136820925554

In [47]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       504
           1       0.99      0.96      0.98       497

    accuracy                           0.98      1001
   macro avg       0.98      0.98      0.98      1001
weighted avg       0.98      0.98      0.98      1001



In [48]:
confusion_matrix(y_test,pred_rf)

array([[501,   3],
       [ 21, 476]], dtype=int64)

### Comparison and Analysis:


Compare the performance of the Decision Tree and Random Forest models.

In [49]:
print("Best Parameters for Decision Tree Classifier using Grid Search :", dt_grid_model.best_params_)
print("Best Score for Decision Tree Classifier using Grid Search :", dt_grid_model.best_score_)
print('\n')
print("Best Parameters for Random Forest Classifier using Grid Search :", rf_grid_model.best_params_)
print("Best Score for Random Forest Classifier using Grid Search :", rf_grid_model.best_score_)


Best Parameters for Decision Tree Classifier using Grid Search : {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best Score for Decision Tree Classifier using Grid Search : 0.9717500000000001


Best Parameters for Random Forest Classifier using Grid Search : {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Score for Random Forest Classifier using Grid Search : 0.976


In [50]:
print('Accuracy for Decision Tree Classifier :',accuracy_score(y_test, pred_dt))
print('Accuracy for Random Forest Classifier :',accuracy_score(y_test, pred_rf))

Accuracy for Decision Tree Classifier : 0.965034965034965
Accuracy for Random Forest Classifier : 0.9760239760239761


In [51]:
print('Precision Score for Decision Tree Classifier :',precision_score(y_test, pred_dt))
print('Precision Score for Random Forest Classifier :',precision_score(y_test, pred_rf))

Precision Score for Decision Tree Classifier : 0.9935897435897436
Precision Score for Random Forest Classifier : 0.9937369519832986


In [52]:
print('Recall Score for Decision Tree Classifier :',recall_score(y_test, pred_dt))
print('Recall Score for Random Forest Classifier :',recall_score(y_test, pred_rf))

Recall Score for Decision Tree Classifier : 0.9356136820925554
Recall Score for Random Forest Classifier : 0.9577464788732394


In [53]:
print('Confusion Matrix of Decision Tree Classifier\n',confusion_matrix(y_test,pred_dt))
print('Confusion Matrix of Random Forest Classifier\n',confusion_matrix(y_test,pred_rf))

Confusion Matrix of Decision Tree Classifier
 [[501   3]
 [ 32 465]]
Confusion Matrix of Random Forest Classifier
 [[501   3]
 [ 21 476]]


### Random Forest Classiier is a better model compare to Decision Tree Classifier

### Provide insights into the advantages and disadvantages of each model.

### Visualize and discuss the decision boundaries of the models if possible.