In [1]:
import pandas as pd

DATA = '/kaggle/input/gender-classification-dataset/Gender_Classification_Data.csv'

df = pd.read_csv(filepath_or_buffer=DATA)
df.head()

Unnamed: 0,gender,height,weight,age
0,female,166.835558,64.238722,38
1,female,166.321934,67.165489,65
2,male,193.424782,89.342636,67
3,female,167.027904,67.637666,55
4,male,191.822253,93.985117,29


In [2]:
from plotly import express
import warnings

warnings.filterwarnings(action='ignore', category=FutureWarning)

express.scatter(data_frame=df, x='height', y='weight', color='gender', size='age', height=800).update_traces(marker={'opacity': 0.25})

In our dataset women are generally smaller than men: they are shorter and weigh less.

We don't have a lot of overlap between the two sets so it should be easy to build a fairly accurate classifier. Let's build a simple logistic regression model and be done.

In [3]:
from plotly import express

express.histogram(data_frame=df, x='gender', color='gender')

Our target class is balanced, which is good news.

In [4]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

COLUMNS = ['age', 'height', 'weight']
TARGET = 'gender'
X_train, X_test, y_train, y_test = train_test_split(df[COLUMNS], df[TARGET], test_size=0.20, random_state=2024, )

time_start = arrow.now()
regression = LogisticRegression(max_iter=100000, tol=1e-8).fit(X=X_train, y=y_train)
print('model fit in {} iterations took {}'.format(regression.n_iter_[0], arrow.now() - time_start))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=regression.predict(X=X_test))))
print('model done in {}'.format(arrow.now() - time_start))

model fit in 45 iterations took 0:00:00.062391
accuracy: 0.9805
model done in 0:00:00.073418


In [5]:
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=regression.predict(X=X_test)))

              precision    recall  f1-score   support

      female       0.98      0.98      0.98      1038
        male       0.98      0.98      0.98       962

    accuracy                           0.98      2000
   macro avg       0.98      0.98      0.98      2000
weighted avg       0.98      0.98      0.98      2000

