## 4. Data improvement - balancing sets

In [1]:
import pandas as pd

data = pd.read_csv('data/StudentPerformanceFactors.csv')

X = data.drop("Gender", axis=1)
Y = data['Gender']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75)

num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

#### Balancing sets
Using ```SMOTE``` for oversampling and ```TomekLinks``` for undersampling.

I chose Support Vector Classification as the model.

In [2]:
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks  
from imblearn.pipeline import Pipeline as ImPipeline

smote = SMOTE(random_state=42)
tl = TomekLinks(sampling_strategy='majority')

svc = SVC()
svc_oversampled = SVC()
svc_undersampled = SVC()

svc_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', svc)
])

svc_pipeline_oversampled = ImPipeline([
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('regressor', svc_oversampled)
])

svc_pipeline_undersampled = ImPipeline([
    ('preprocessor', preprocessor),
    ('tl', tl),
    ('regressor', svc_undersampled)
])

svc_pipeline.fit(X_train, Y_train)
svc_pipeline_oversampled.fit(X_train, Y_train)
svc_pipeline_undersampled.fit(X_train, Y_train)

y_pred = svc_pipeline.predict(X_test)
y_pred_oversampled = svc_pipeline_oversampled.predict(X_test)
y_pred_undersampled = svc_pipeline_undersampled.predict(X_test)

#### Evaluation

In [4]:
from sklearn.metrics import classification_report

print('Original data')
print(classification_report(Y_test, y_pred))
print('Oversampled data')
print(classification_report(Y_test, y_pred_oversampled))
print('Undersampled data')
print(classification_report(Y_test, y_pred_undersampled))

Original data
              precision    recall  f1-score   support

      Female       0.40      0.02      0.04       710
        Male       0.57      0.97      0.72       942

    accuracy                           0.57      1652
   macro avg       0.48      0.50      0.38      1652
weighted avg       0.50      0.57      0.43      1652

Oversampled data
              precision    recall  f1-score   support

      Female       0.44      0.46      0.45       710
        Male       0.58      0.57      0.58       942

    accuracy                           0.52      1652
   macro avg       0.51      0.51      0.51      1652
weighted avg       0.52      0.52      0.52      1652

Undersampled data
              precision    recall  f1-score   support

      Female       0.44      0.30      0.35       710
        Male       0.57      0.71      0.63       942

    accuracy                           0.53      1652
   macro avg       0.50      0.50      0.49      1652
weighted avg       0.51  

#### Same balancing for logistic regression using gradient descent

In [34]:
from src.linear_regression.models import LogisticRegressionGradientDescent

lr = LogisticRegressionGradientDescent()
lr_oversampled = LogisticRegressionGradientDescent()
lr_undersampled = LogisticRegressionGradientDescent()

lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lr)
])

lr_pipeline_oversampled = ImPipeline([
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('regressor', lr_oversampled)
])

lr_pipeline_undersampled = ImPipeline([
    ('preprocessor', preprocessor),
    ('tl', tl),
    ('regressor', lr_undersampled)
])

lr_pipeline.fit(X_train, Y_train)
lr_pipeline_oversampled.fit(X_train, Y_train)
lr_pipeline_undersampled.fit(X_train, Y_train)

y_pred_lr = lr_pipeline.predict(X_test)
y_pred_lr_oversampled = lr_pipeline_oversampled.predict(X_test)
y_pred_lr_undersampled = lr_pipeline_undersampled.predict(X_test)

#### Evaluation

In [35]:
print('Original data')
print(classification_report(Y_test, y_pred_lr))
print('Oversampled data')
print(classification_report(Y_test, y_pred_lr_oversampled))
print('Undersampled data')
print(classification_report(Y_test, y_pred_lr_undersampled))

Original data
              precision    recall  f1-score   support

      Female       0.44      0.01      0.02       710
        Male       0.57      0.99      0.72       942

    accuracy                           0.57      1652
   macro avg       0.50      0.50      0.37      1652
weighted avg       0.51      0.57      0.42      1652

Oversampled data
              precision    recall  f1-score   support

      Female       0.42      0.50      0.46       710
        Male       0.56      0.48      0.51       942

    accuracy                           0.49      1652
   macro avg       0.49      0.49      0.49      1652
weighted avg       0.50      0.49      0.49      1652

Undersampled data
              precision    recall  f1-score   support

      Female       0.42      0.11      0.18       710
        Male       0.57      0.88      0.69       942

    accuracy                           0.55      1652
   macro avg       0.50      0.50      0.44      1652
weighted avg       0.51  