In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
from fairlearn.metrics import MetricFrame
from imblearn.over_sampling import SMOTE
import seaborn as sns


# Load the company-level dataset and key every row by its organisation UUID.
data_df = (
    pd.read_csv('bin_gender_ex_operating_data.csv')
    .set_index('ORG_org_uuid')
)


In [12]:
# Headline counts for the raw dataset.
# mostly_male_founders: 0 = mostly female, 1 = mostly male; status == 0 marks a closed company.
no_tot_companies = data_df.shape[0]
no_female_comp = data_df[data_df['mostly_male_founders'] == 0].shape[0]
no_male_comp = data_df[data_df['mostly_male_founders'] == 1].shape[0]
no_tot_closed = data_df[data_df['status'] == 0].shape[0]

print("Total no of companies: ", no_tot_companies)
print("Total no of closed companies: ", no_tot_closed)
print(
    "Proportion of total closed (label 0): ",
    no_tot_closed / no_tot_companies,
)

Total no of companies:  18493
Total no of closed companies:  4767
Proportion of total closed (label 0):  0.2577732114854269


In [None]:
# NOTE(review): `data_ready` is not created in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel. Presumably it is a cleaned copy
# of `data_df` -- restore the cell that builds it before relying on a full re-run.
# NOTE(review): the oversampling below runs BEFORE the train/test split, so
# synthetic rows can land in the test set; consider resampling the training
# portion only.
# NOTE(review): `status` is still a column of `data_ready` at this point, so SMOTE
# treats it like any other feature -- verify it is still a clean 0/1 label after
# resampling.

# Balance the dataset on founder gender (the resampling target is the
# `mostly_male_founders` column). The fixed seed makes the synthetic rows
# reproducible and matches the seeded SMOTE step used in the modelling pipeline.
oversample = SMOTE(random_state=42)
features_resampled, gender_resampled = oversample.fit_resample(
    data_ready, data_ready['mostly_male_founders']
)

# Keep the original variable name for downstream cells, and store the labels
# returned by the sampler in the gender column via explicit [] assignment.
data_ready = features_resampled
data_ready['mostly_male_founders'] = gender_resampled

# Target variable: company status (label 0 = closed).
y_variable = data_ready['status']

# X still contains `status`; it is dropped from the feature matrices after the
# train/test split.
X = data_ready
y = y_variable

In [None]:
# Closed companies (status == 0) in the raw data, split by founder gender
# (0 = mostly female, 1 = mostly male).
is_closed = data_df['status'] == 0
female_closed = data_df[(data_df['mostly_male_founders'] == 0) & is_closed]
male_closed = data_df[(data_df['mostly_male_founders'] == 1) & is_closed]

for closed_subset in (female_closed, male_closed):
    print(len(closed_subset))

In [None]:

# Hold out 25% of the rows for testing; stratifying on y keeps the label
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)



In [None]:
# Goal: oversample the companies with mostly_male_founders == 0 (mostly female founders).

In [None]:
# Size of each founder-gender group in the training split (0 first, then 1).
train_founder_gender = X_train['mostly_male_founders']
print((train_founder_gender == 0).sum())
print((train_founder_gender == 1).sum())

In [None]:
# Remove the target column from the feature matrices. errors='ignore' keeps this
# cell re-runnable: on a second execution `status` is already gone and a plain
# drop would raise KeyError.
X_train = X_train.drop(columns=['status'], errors='ignore')
X_test = X_test.drop(columns=['status'], errors='ignore')

# Modelling pipeline: oversample on the label -> scale to [0, 1] -> reduce to 10
# components -> logistic regression. The sampler sits inside the imblearn
# pipeline, which applies resampling only while fitting. Every stochastic step
# is seeded (including TruncatedSVD) so repeated runs give the same scores.
pipeline = imbpipeline(steps=[
    ['smote', SMOTE(random_state=42)],
    ['scaler', MinMaxScaler()],
    ['svd', TruncatedSVD(n_components=10, random_state=42)],
    ['classifier', LogisticRegression(random_state=42, max_iter=1000)],
])

# 3-fold stratified CV, shuffled with a fixed seed.
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

# Tune only the regularisation strength C, selecting on ROC AUC.
param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=stratified_kfold,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_
# score() uses the configured scoring metric (roc_auc) on the held-out data.
test_score = grid_search.score(X_test, y_test)

print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

# Predict once with the refitted best pipeline and reuse the result for every
# report below.
y_pred = grid_search.best_estimator_.predict(X_test)

print(classification_report(y_test, y_pred))

# Accuracy overall and per founder-gender group (the group column is taken from
# the test features, so it is also an input to the model).
gm1 = MetricFrame(
    metrics=accuracy_score,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=X_test['mostly_male_founders'],
)
print("Accuracy overall \n", gm1.overall, "\n")
print("Accuracy by group",
      "\n 0 mostly female, 1 mostly male \n",
      "\n", gm1.by_group)




In [None]:
# Confusion matrix on the held-out set: rows are true labels, columns are predictions.
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)