In [30]:
#basics
import pandas as pd
import numpy as np
from numpy import mean
# Graphs libraries
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
plt.style.use('seaborn-white')
import seaborn as sns
from IPython.display import Markdown, display
#import plotly.offline as py
#py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
from plotly import tools

from collections import Counter
from tqdm import tqdm
#Fairlearn and imblearn 
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
#scklearn 
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, recall_score, precision_score
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split as tts
from fairlearn.metrics import MetricFrame

import fairlearn
#AIF360
from aif360.datasets import StandardDataset
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from aif360.sklearn.metrics import equal_opportunity_difference
from aif360.algorithms.preprocessing import Reweighing
from aif360.algorithms.preprocessing.lfr import LFR
from aif360.algorithms.preprocessing import DisparateImpactRemover

In [31]:
def fair_metrics(dataset, y_pred):
    dataset_pred = dataset.copy()
    dataset_pred.labels = y_pred
        
    attr = dataset_pred.protected_attribute_names[0]
    
    idx = dataset_pred.protected_attribute_names.index(attr)
    privileged_groups =  [{attr:dataset_pred.privileged_protected_attributes[idx][0]}] 
    unprivileged_groups = [{attr:dataset_pred.unprivileged_protected_attributes[idx][0]}] 

    classified_metric = ClassificationMetric(dataset, dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    metric_pred = BinaryLabelDatasetMetric(dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    result = {'statistical_parity_difference': metric_pred.statistical_parity_difference(),
             'disparate_impact': metric_pred.disparate_impact(),
             'equal_opportunity_difference': classified_metric.equal_opportunity_difference()}
        
    return result

In [32]:
# define dataset
data = pd.read_csv('dataset_ready2.csv') 
data = data.set_index('ORG_org_uuid')

#More cleaning
data = data.drop(['Unnamed: 0'], axis = 1)
data_orig = data.copy()
#data = data.drop(data[data.unknown_founders != 0].index)
data = data.query('unknown_founders == 0 and female_founders != 0 or male_founders != 0')

data = data.drop(data[data.total_num_founders > 10].index)
data = data.drop(data[data.status == 'operating'].index)
#data.info()

In [33]:
cols_to_drop = ['company_name', 'category_list', 'category_group_list',
               'region', 'city', 'homepage_url', 'founded_on']
gender_cols = ['female_founders','male_founders', 'unknown_founders']
data.drop(columns = cols_to_drop, inplace = True)
#data_bef_scale = data.copy()
## CREATE A BINARY OUTCOME VARIABLE
data.replace('ipo', 1, regex = True, inplace = True)
data.replace('acquired', 1, regex = True, inplace = True)
data.replace('closed', 0, regex = True, inplace = True)


In [34]:
X_train, X_test, y_train, y_test = tts(data, data.status, test_size=0.20,random_state = 42)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
X_test_standard_gen = X_test.copy() #to use for standard dataset where all variables are included, also y 
X_train_standard_gen = X_train.copy() #to use for standard dataset where all variables are included, also y 
#gendered data train and test 
X_train_gen = X_train.drop('status', axis = 1)
X_test_gen = X_test.drop('status', axis = 1)
#no gender data train and test 
X_train_standard = X_train_standard_gen.drop(columns = gender_cols, axis = 1)
X_test_standard = X_test_standard_gen.drop(columns = gender_cols, axis = 1)

X_train = X_train_gen.drop(columns = gender_cols, axis = 1)
X_test = X_test_gen.drop(columns = gender_cols, axis = 1)
#print("Sanity checks", X_train.columns, X_train_gen.columns)
#X_train_standard_gen

In [35]:
#create as standard datasets to use for fairness metrics 
dataset_gen = StandardDataset(X_test_standard_gen, 
                          label_name='status', 
                          favorable_classes=[1], 
                          protected_attribute_names=['mostly_male_founders'], 
                          privileged_classes=[[1]])

dataset_standard = StandardDataset(X_test_standard, 
                          label_name='status', 
                          favorable_classes=[1], 
                          protected_attribute_names=['mostly_male_founders'], 
                          privileged_classes=[[1]])

In [7]:
#Define the scales of the pipeline 
pipeline = imbpipeline(steps = [['scaler', MinMaxScaler()],
                                ['logreg', LogisticRegression(solver = 'liblinear',random_state=42, max_iter=1000)]])

stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=42)

param_grid = {'logreg__C':np.logspace(-4,4,4)}
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)

In [12]:
#Fit and return results 
def train_and_results(X_train_input, y_train_input, X_test_input, y_test_input):
    
    
    fit = grid_search.fit(X_train_input, y_train_input)
    y_pred = grid_search.best_estimator_.predict(X_test_input)
    #cv_score = grid_search.best_estimator_.score()
    f1 = f1_score(y_test_input, y_pred)
    acc = accuracy_score(y_test_input, y_pred)
    recall = recall_score(y_test_input, y_pred)
    precision = precision_score(y_test_input, y_pred)
    score_list = [[f1, acc, recall, precision]]

    
    cols = ['f1 score', 'accuracy', 'recall', 'precision']
    datafr = pd.DataFrame(score_list, columns = cols)
    return datafr

In [18]:
print("With gender variables")
train_and_results(X_train_gen, y_train, X_test_gen, y_test)

With gender variables


Unnamed: 0,f1 score,accuracy,recall,precision
0,0.87117,0.79399,0.95615,0.800062


In [26]:
#Fit and return results 

privileged_groups = [{'mostly_male_founders': 1}]
unprivileged_groups = [{'mostly_male_founders': 0}]

def train_and_results_reweigh(X_train_input, y_train_input, X_test_input, y_test_input, stand):
    RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    
    fit_rw = RW.fit(stand)
    trans_rw = RW.transform(stand)
    
    fit = grid_search.fit(trans_rw, y_train_input)
    y_pred = grid_search.best_estimator_.predict(X_test_input)
    #cv_score = grid_search.best_estimator_.score()
    f1 = f1_score(y_test_input, y_pred)
    acc = accuracy_score(y_test_input, y_pred)
    recall = recall_score(y_test_input, y_pred)
    precision = precision_score(y_test_input, y_pred)
    score_list = [[f1, acc, recall, precision]]

    fair = fair_metrics(dataset_standard, y_pred)

    cols = ['f1 score', 'accuracy', 'recall', 'precision']
    datafr = pd.DataFrame(score_list, columns = cols)
    return fair
    #return datafr

In [36]:

train_and_results_reweigh(X_train_gen, y_train, X_test_gen, y_test, dataset_gen)

TypeError: Singleton array array(                                     instance weights             features  \
                                                                             
                                                      city_success_ranking   
instance names                                                               
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b         0.990611             0.651971   
13f53f5c-fad1-5936-6f5c-68907a011036         0.990611             0.686516   
5814b0d9-8aef-8a3a-6d26-963abd6faabf         1.026091             0.000000   
8dc15452-ce57-21ae-ac8b-6270ff8bb627         0.816736             0.691043   
61ccd47f-a40f-30e8-4cc2-3baf43e98093         0.990611             0.716325   
...                                               ...                  ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20         0.990611             0.733041   
0737fe45-47f5-8001-37f9-9e2a742e7072         0.990611             0.687422   
4f80c2a6-491a-b835-385b-6c3dfc672a70         0.990611             0.716186   
4616010d-edb9-0e16-451b-2139727c86df         0.990611             0.747040   
4cd27cf5-5b0c-a432-9453-b899bde86913         1.026091             0.000000   

                                                                       \
                                                                        
                                     region_rank multiple_degrees_sum   
instance names                                                          
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b    0.491815                  0.0   
13f53f5c-fad1-5936-6f5c-68907a011036    0.625744                  0.0   
5814b0d9-8aef-8a3a-6d26-963abd6faabf    0.000000                  0.0   
8dc15452-ce57-21ae-ac8b-6270ff8bb627    0.000000                  0.0   
61ccd47f-a40f-30e8-4cc2-3baf43e98093    0.655506                  0.0   
...                                          ...                  ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20    0.732143                  0.0   
0737fe45-47f5-8001-37f9-9e2a742e7072    0.646577                  3.0   
4f80c2a6-491a-b835-385b-6c3dfc672a70    0.735863                  0.0   
4616010d-edb9-0e16-451b-2139727c86df    0.643601                  0.0   
4cd27cf5-5b0c-a432-9453-b899bde86913    0.000000                  0.0   

                                                               \
                                                                
                                     multiple_degrees_average   
instance names                                                  
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b                      0.0   
13f53f5c-fad1-5936-6f5c-68907a011036                      0.0   
5814b0d9-8aef-8a3a-6d26-963abd6faabf                      0.0   
8dc15452-ce57-21ae-ac8b-6270ff8bb627                      0.0   
61ccd47f-a40f-30e8-4cc2-3baf43e98093                      0.0   
...                                                       ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20                      0.0   
0737fe45-47f5-8001-37f9-9e2a742e7072                      1.0   
4f80c2a6-491a-b835-385b-6c3dfc672a70                      0.0   
4616010d-edb9-0e16-451b-2139727c86df                      0.0   
4cd27cf5-5b0c-a432-9453-b899bde86913                      0.0   

                                                                        \
                                                                         
                                     is_completed_sum is_completed_avg   
instance names                                                           
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b              0.0              0.0   
13f53f5c-fad1-5936-6f5c-68907a011036              0.0              0.0   
5814b0d9-8aef-8a3a-6d26-963abd6faabf              0.0              0.0   
8dc15452-ce57-21ae-ac8b-6270ff8bb627              0.0              0.0   
61ccd47f-a40f-30e8-4cc2-3baf43e98093              0.0              0.0   
...                                               ...              ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20              0.0              0.0   
0737fe45-47f5-8001-37f9-9e2a742e7072              3.0              1.0   
4f80c2a6-491a-b835-385b-6c3dfc672a70              0.0              0.0   
4616010d-edb9-0e16-451b-2139727c86df              0.0              0.0   
4cd27cf5-5b0c-a432-9453-b899bde86913              0.0              0.0   

                                                                              \
                                                                               
                                     work_experience_avg work_experience_sum   
instance names                                                                 
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b            0.000000                 0.0   
13f53f5c-fad1-5936-6f5c-68907a011036            0.000000                 0.0   
5814b0d9-8aef-8a3a-6d26-963abd6faabf            0.000000                 0.0   
8dc15452-ce57-21ae-ac8b-6270ff8bb627            0.000000                 0.0   
61ccd47f-a40f-30e8-4cc2-3baf43e98093            0.000000                 0.0   
...                                                  ...                 ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20            0.000000                 0.0   
0737fe45-47f5-8001-37f9-9e2a742e7072         1130.333333              3391.0   
4f80c2a6-491a-b835-385b-6c3dfc672a70            0.000000                 0.0   
4616010d-edb9-0e16-451b-2139727c86df            0.000000                 0.0   
4cd27cf5-5b0c-a432-9453-b899bde86913            0.000000                 0.0   

                                                         ...                   \
                                                         ...                    
                                     education_time_avg  ... unknown_founders   
instance names                                           ...                    
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b                0.0  ...              0.0   
13f53f5c-fad1-5936-6f5c-68907a011036                0.0  ...              0.0   
5814b0d9-8aef-8a3a-6d26-963abd6faabf                0.0  ...              0.0   
8dc15452-ce57-21ae-ac8b-6270ff8bb627                0.0  ...              0.0   
61ccd47f-a40f-30e8-4cc2-3baf43e98093                0.0  ...              0.0   
...                                                 ...  ...              ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20                0.0  ...              0.0   
0737fe45-47f5-8001-37f9-9e2a742e7072                0.0  ...              2.0   
4f80c2a6-491a-b835-385b-6c3dfc672a70                0.0  ...              0.0   
4616010d-edb9-0e16-451b-2139727c86df                0.0  ...              0.0   
4cd27cf5-5b0c-a432-9453-b899bde86913                0.0  ...              0.0   

                                                                        \
                                                                         
                                     total_num_founders category_list1   
instance names                                                           
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b                3.0          329.0   
13f53f5c-fad1-5936-6f5c-68907a011036                5.0          608.0   
5814b0d9-8aef-8a3a-6d26-963abd6faabf                1.0           -1.0   
8dc15452-ce57-21ae-ac8b-6270ff8bb627                1.0          449.0   
61ccd47f-a40f-30e8-4cc2-3baf43e98093                4.0           25.0   
...                                                 ...            ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20                5.0          229.0   
0737fe45-47f5-8001-37f9-9e2a742e7072                8.0          425.0   
4f80c2a6-491a-b835-385b-6c3dfc672a70                1.0          313.0   
4616010d-edb9-0e16-451b-2139727c86df                4.0          104.0   
4cd27cf5-5b0c-a432-9453-b899bde86913                1.0          204.0   

                                                                    \
                                                                     
                                     category_list2 category_list3   
instance names                                                       
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b          548.0        24600.0   
13f53f5c-fad1-5936-6f5c-68907a011036           -1.0           -1.0   
5814b0d9-8aef-8a3a-6d26-963abd6faabf           -1.0           -1.0   
8dc15452-ce57-21ae-ac8b-6270ff8bb627           -1.0           -1.0   
61ccd47f-a40f-30e8-4cc2-3baf43e98093           22.0        24120.0   
...                                             ...            ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20          453.0           -1.0   
0737fe45-47f5-8001-37f9-9e2a742e7072          550.0        24665.0   
4f80c2a6-491a-b835-385b-6c3dfc672a70          409.0        17913.0   
4616010d-edb9-0e16-451b-2139727c86df          100.0        24120.0   
4cd27cf5-5b0c-a432-9453-b899bde86913          246.0        11634.0   

                                                           \
                                                            
                                     category_group_list1   
instance names                                              
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b                 42.0   
13f53f5c-fad1-5936-6f5c-68907a011036                 38.0   
5814b0d9-8aef-8a3a-6d26-963abd6faabf                 -1.0   
8dc15452-ce57-21ae-ac8b-6270ff8bb627                 11.0   
61ccd47f-a40f-30e8-4cc2-3baf43e98093                 12.0   
...                                                   ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20                 15.0   
0737fe45-47f5-8001-37f9-9e2a742e7072                  7.0   
4f80c2a6-491a-b835-385b-6c3dfc672a70                 22.0   
4616010d-edb9-0e16-451b-2139727c86df                 21.0   
4cd27cf5-5b0c-a432-9453-b899bde86913                  6.0   

                                                           \
                                                            
                                     category_group_list2   
instance names                                              
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b                 -1.0   
13f53f5c-fad1-5936-6f5c-68907a011036                 -1.0   
5814b0d9-8aef-8a3a-6d26-963abd6faabf                 -1.0   
8dc15452-ce57-21ae-ac8b-6270ff8bb627                 26.0   
61ccd47f-a40f-30e8-4cc2-3baf43e98093                 16.0   
...                                                   ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20                 30.0   
0737fe45-47f5-8001-37f9-9e2a742e7072                 26.0   
4f80c2a6-491a-b835-385b-6c3dfc672a70                 -1.0   
4616010d-edb9-0e16-451b-2139727c86df                 22.0   
4cd27cf5-5b0c-a432-9453-b899bde86913                  6.0   

                                                           \
                                                            
                                     category_group_list3   
instance names                                              
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b                 -1.0   
13f53f5c-fad1-5936-6f5c-68907a011036                 -1.0   
5814b0d9-8aef-8a3a-6d26-963abd6faabf                 -1.0   
8dc15452-ce57-21ae-ac8b-6270ff8bb627                 -1.0   
61ccd47f-a40f-30e8-4cc2-3baf43e98093               5614.0   
...                                                   ...   
cf68f7a1-f32d-005f-9f67-28c4ec39ca20                 -1.0   
0737fe45-47f5-8001-37f9-9e2a742e7072               6297.0   
4f80c2a6-491a-b835-385b-6c3dfc672a70                 -1.0   
4616010d-edb9-0e16-451b-2139727c86df               5513.0   
4cd27cf5-5b0c-a432-9453-b899bde86913                294.0   

                                                          labels  
                                      protected attribute         
                                     mostly_male_founders         
instance names                                                    
7ad1d0d8-7611-4c87-b8bc-4f123a6f619b                  1.0    1.0  
13f53f5c-fad1-5936-6f5c-68907a011036                  1.0    1.0  
5814b0d9-8aef-8a3a-6d26-963abd6faabf                  1.0    0.0  
8dc15452-ce57-21ae-ac8b-6270ff8bb627                  0.0    0.0  
61ccd47f-a40f-30e8-4cc2-3baf43e98093                  1.0    1.0  
...                                                   ...    ...  
cf68f7a1-f32d-005f-9f67-28c4ec39ca20                  1.0    1.0  
0737fe45-47f5-8001-37f9-9e2a742e7072                  1.0    1.0  
4f80c2a6-491a-b835-385b-6c3dfc672a70                  1.0    1.0  
4616010d-edb9-0e16-451b-2139727c86df                  1.0    1.0  
4cd27cf5-5b0c-a432-9453-b899bde86913                  1.0    0.0  

[3694 rows x 23 columns], dtype=object) cannot be considered a valid collection.

With gender variables


Unnamed: 0,f1 score,accuracy,recall,precision
0,0.87117,0.79399,0.95615,0.800062


In [None]:
fit_gender = grid_search.fit(X_train_gen, y_train)
y_pred_gender = grid_search.best_estimator_.predict(X_test_gen)

fair_metrics(dataset_gen, y_pred_gender)


RW = Reweighing(unprivileged_groups=unprivileged_groups,
               privileged_groups=privileged_groups)
RW.fit(dataset_orig_train)
dataset_transf_train = RW.transform(dataset_orig_train)

In [None]:
gm1.overall, gm1.by_group

In [None]:
fairlearn.metrics.demographic_parity_difference(y_true=y_test, y_pred=y_pred_normal, sensitive_features = X_test.mostly_male_founders)

In [None]:


gm1 = MetricFrame(metrics=accuracy_score, y_true=y_test, y_pred=y_pred, sensitive_features = X_test.mostly_male_founders)
print("Accuracy overall \n", gm1.overall, "\n")
print("Accuracy by group",
      "\n 0 mostly female, 1 mostly male \n",
      "\n", gm1.by_group)






In [None]:
mod = grid_search.best_estimator_.named_steps.logreg.coef_
mod

In [None]:
data_x = data.drop('status', axis = 1)
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': mod[0]
})
importances = importances.sort_values(by='Importance', ascending=False)
plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from coefficients', size=20)
plt.xticks(rotation='vertical')
plt.show()
print(importances)




In [None]:
[0.001, 0.01, 0.1, 1, 10, 100, 1000]

In [None]:

param_grid = {
    "pca__n_components": [5, 15, 30, 45, 60],
    "logistic__C": np.logspace(-4, 4, 4),
}
search = GridSearchCV(pipe, param_grid, n_jobs=2)
search.fit(X_digits, y_digits)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

