In [8]:
import pandas as pd
df = pd.read_csv('bank.csv', header = None, 
                 names=['age','job','marital','education','default','balance','housing','loan','contact','day','month','duration','campaign','pdays','previous','poutcome','y'])


In [9]:
#drop campaign data
df.drop(df.iloc[:, 8:16], inplace = True, axis = 1) 

In [10]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [11]:
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

X_categoric = df.iloc[:, [1, 2, 3, 4, 6, 7]].values

encoded_data = ohe.fit_transform(X_categoric).toarray()
encoded_df = pd.DataFrame(encoded_data)
encoded_df.columns = ohe.get_feature_names()

from sklearn.preprocessing import StandardScaler
age_std_scale = StandardScaler()
balance_std_scale = StandardScaler()

numeric_data = df.iloc[:, [0, 5]].values
numeric_df = pd.DataFrame(numeric_data, dtype=object)
numeric_df.columns = ['age', 'balance']

numeric_df['age'] = age_std_scale.fit_transform(numeric_df[['age']])
numeric_df['balance'] = balance_std_scale.fit_transform(numeric_df[['balance']])

numeric_df.head()




Unnamed: 0,age,balance
0,-1.05627,0.121072
1,-0.772583,1.118644
2,-0.583458,-0.024144
3,-1.05627,0.017726
4,1.686036,-0.472753


In [12]:
X_final = pd.concat([numeric_df, encoded_df], axis = 1)

In [13]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=0)

In [14]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [15]:
y_pred = rfc.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[775,  18],
       [104,   8]], dtype=int64)

In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8651933701657458

In [17]:
print(X_train.columns)

Index(['age', 'balance', 'x0_admin.', 'x0_blue-collar', 'x0_entrepreneur',
       'x0_housemaid', 'x0_management', 'x0_retired', 'x0_self-employed',
       'x0_services', 'x0_student', 'x0_technician', 'x0_unemployed',
       'x0_unknown', 'x1_divorced', 'x1_married', 'x1_single', 'x2_primary',
       'x2_secondary', 'x2_tertiary', 'x2_unknown', 'x3_no', 'x3_yes', 'x4_no',
       'x4_yes', 'x5_no', 'x5_yes'],
      dtype='object')


In [18]:
import numpy as np
x_age = np.array([30])
x_age = np.reshape(x_age, (1, -1))
x_age = np.array(age_std_scale.transform(x_age))

x_balance = np.array([1000])
x_balance = np.reshape(x_balance, (1, -1))
x_balance = np.array(balance_std_scale.transform(x_balance))

x_categoric = np.array(["student", "single", "tertiary", "no", "yes", "no" ])
x_categoric = np.reshape(x_categoric, (1, -1))
x_categoric = ohe.transform(x_categoric).toarray()

x_final = np.column_stack((x_age, x_balance, x_categoric))
x_final = pd.DataFrame(x_final, dtype=object)

x_final.head()




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,-1.05627,-0.14045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [19]:
rfc.predict(x_final)



array(['no'], dtype=object)

In [20]:
df.groupby('y').count()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
no,4000,4000,4000,4000,4000,4000,4000,4000
yes,521,521,521,521,521,521,521,521


In [21]:
y_fair = rfc.predict(X_final)
y_fair = pd.DataFrame(y_fair, dtype=object)
y_fair.head()

Unnamed: 0,0
0,no
1,no
2,no
3,no
4,no


In [22]:
csv_data = y_fair.to_csv('bankmarekting_fairness.csv', index = False) 
print(csv_data) 

None
