In [1]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder

In [45]:
# Load the dataset
df = pd.read_csv('dataset/credit_customers.csv')
# df.head().T
# df.info()
# print(df['class'].value_counts())
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
credit_amount,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
installment_commitment,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
residence_since,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
existing_credits,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
num_dependents,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0


In [46]:
# Scale the data
df['duration'] = df['duration'] / 100
df['credit_amount'] = df['credit_amount'] / 100_000
df['age'] = df['age'] / 100

In [47]:
# Ordinal encoding
ord_enc = OrdinalEncoder()

df['checking_status'] = ord_enc.fit_transform(df[['checking_status']])
df['credit_history'] = ord_enc.fit_transform(df[['credit_history']])
df['purpose'] = ord_enc.fit_transform(df[['purpose']])
df['savings_status'] = ord_enc.fit_transform(df[['savings_status']])
df['employment'] = ord_enc.fit_transform(df[['employment']])
df['personal_status'] = ord_enc.fit_transform(df[['personal_status']])
df['other_parties'] = ord_enc.fit_transform(df[['other_parties']])
df['property_magnitude'] = ord_enc.fit_transform(df[['property_magnitude']])
df['other_payment_plans'] = ord_enc.fit_transform(df[['other_payment_plans']])
df['housing'] = ord_enc.fit_transform(df[['housing']])
df['job'] = ord_enc.fit_transform(df[['job']])
df['own_telephone'] = ord_enc.fit_transform(df[['own_telephone']])
df['foreign_worker'] = ord_enc.fit_transform(df[['foreign_worker']])
df['class'] = ord_enc.fit_transform(df[['class']])

df['class'] = df['class'].astype(int)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('class', axis=1), df['class'], test_size=0.2, random_state=42)

In [50]:
# Classification model
model = XGBClassifier(
    learning_rate=0.15,
    n_estimators=200,
    max_depth=4,
)

model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=50,
    eval_metric='error',
)

# Score
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions) * 100.0))

[0]	validation_0-error:0.24000
[1]	validation_0-error:0.24500
[2]	validation_0-error:0.23500
[3]	validation_0-error:0.23000
[4]	validation_0-error:0.23500
[5]	validation_0-error:0.23000
[6]	validation_0-error:0.23500
[7]	validation_0-error:0.23000
[8]	validation_0-error:0.22000
[9]	validation_0-error:0.23500
[10]	validation_0-error:0.22000
[11]	validation_0-error:0.23000
[12]	validation_0-error:0.23500
[13]	validation_0-error:0.21500
[14]	validation_0-error:0.22000
[15]	validation_0-error:0.22500
[16]	validation_0-error:0.23000
[17]	validation_0-error:0.23000
[18]	validation_0-error:0.22500
[19]	validation_0-error:0.22500
[20]	validation_0-error:0.22500
[21]	validation_0-error:0.22000
[22]	validation_0-error:0.21500
[23]	validation_0-error:0.20000
[24]	validation_0-error:0.21000
[25]	validation_0-error:0.21500
[26]	validation_0-error:0.20500
[27]	validation_0-error:0.21000
[28]	validation_0-error:0.21000
[29]	validation_0-error:0.20500
[30]	validation_0-error:0.20500
[31]	validation_0-

`eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead.
`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.


[66]	validation_0-error:0.20000
[67]	validation_0-error:0.20000
[68]	validation_0-error:0.19500
[69]	validation_0-error:0.20000
[70]	validation_0-error:0.19500
[71]	validation_0-error:0.19500
[72]	validation_0-error:0.20500
[73]	validation_0-error:0.21000
[74]	validation_0-error:0.20000
[75]	validation_0-error:0.19500
[76]	validation_0-error:0.20000
[77]	validation_0-error:0.20000
[78]	validation_0-error:0.20500
[79]	validation_0-error:0.21000
[80]	validation_0-error:0.21000
[81]	validation_0-error:0.20500
[82]	validation_0-error:0.20000
[83]	validation_0-error:0.21500
[84]	validation_0-error:0.22000
[85]	validation_0-error:0.21000
[86]	validation_0-error:0.20000
Accuracy: 80.50%


In [51]:
# Classificaton using random forest
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions) * 100.0))

Accuracy: 74.50%


In [64]:
from evolutionary_forest.forest import EvolutionaryForestRegressor
from evolutionary_forest.utils import get_feature_importance, plot_feature_importance, feature_append

# Train Evolutionary Forest
r = EvolutionaryForestRegressor(max_height=5, select='AutomaticLexicase',
                                gene_num=10, boost_size=100, n_gen=20, n_pop=200, cross_pb=1,
                                base_learner='Random-DT', n_process=1)
r.fit(X_train, y_train)

# Predict
y_pred = r.predict(X_test)
predictions = [round(value) for value in y_pred]

print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions) * 100.0))

Accuracy: 73.50%


In [54]:
import random
import xgboost as xgb
from deap import creator, base, tools, algorithms
import numpy as np

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Attribute generator
toolbox.register("attr_float", random.random)

# Structure initializers
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, n=X_train.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

def evalOneMax(individual):
    # Apply weights to features
    weighted_train = X_train * np.array(individual)

    # Train XGBoost on weighted features
    clf = xgb.XGBClassifier()
    clf.fit(weighted_train, y_train)

    # Apply weights to validation features and score
    weighted_val = X_test * np.array(individual)
    score = clf.score(weighted_val, y_test)

    return score,

# Operator registering
toolbox.register("evaluate", evalOneMax)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.2, indpb=0.1)
toolbox.register("select", tools.selTournament, tournsize=3)


pop = toolbox.population(n=100)
hof = tools.HallOfFame(10)
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.3, ngen=20, 
                                stats=stats, halloffame=hof, verbose=True)

ga_best_weights = hof[0]
print(ga_best_weights)

A class named 'FitnessMax' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.
A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.


gen	nevals	avg  	std        	min  	max  
0  	100   	0.835	2.22045e-16	0.835	0.835
1  	65    	0.835	2.22045e-16	0.835	0.835
2  	69    	0.835	2.22045e-16	0.835	0.835
3  	56    	0.835	2.22045e-16	0.835	0.835
4  	74    	0.835	2.22045e-16	0.835	0.835
5  	57    	0.835	2.22045e-16	0.835	0.835
6  	64    	0.835	2.22045e-16	0.835	0.835
7  	74    	0.835	2.22045e-16	0.835	0.835
8  	61    	0.835	2.22045e-16	0.835	0.835
9  	69    	0.835	2.22045e-16	0.835	0.835
10 	66    	0.835	2.22045e-16	0.835	0.835
11 	53    	0.835	2.22045e-16	0.835	0.835
12 	56    	0.835	2.22045e-16	0.835	0.835
13 	66    	0.835	2.22045e-16	0.835	0.835
14 	78    	0.835	2.22045e-16	0.835	0.835
15 	64    	0.835	2.22045e-16	0.835	0.835
16 	64    	0.835	2.22045e-16	0.835	0.835
17 	61    	0.835	2.22045e-16	0.835	0.835
18 	60    	0.835	2.22045e-16	0.835	0.835
19 	64    	0.835	2.22045e-16	0.835	0.835
20 	66    	0.835	2.22045e-16	0.835	0.835
[0.02062175472247918, 0.3663328723872634, 0.24087760975981654, 0.35524038146365144, 0.083759026371

In [61]:
def get_confussion_matrix(y_test, y_pred):
    # Generate confusion matrix and classification report
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix: ")
    print(cm)

    cr = classification_report(y_test, y_pred)
    print("Classification Report: ")
    print(cr)

In [62]:
from sklearn.metrics import confusion_matrix, classification_report

# Apply weights to features
weighted_train = X_train * np.array(ga_best_weights)

# Weighted features
X_train_weighted = X_train * ga_best_weights
X_test_weighted = X_test * ga_best_weights

# Train XGBoost on weighted features
model_xgb = xgb.XGBClassifier()
model_xgb.fit(weighted_train, y_train)

# Evaluate on the validation set
accuracy = model_xgb.score(X_test_weighted, y_test)
print('Validation accuracy:', accuracy)

# Generate predictions
y_pred = model_xgb.predict(X_test_weighted)
get_confussion_matrix(y_test, y_pred)

Validation accuracy: 0.835
Confusion Matrix: 
[[ 37  22]
 [ 11 130]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.77      0.63      0.69        59
           1       0.86      0.92      0.89       141

    accuracy                           0.83       200
   macro avg       0.81      0.77      0.79       200
weighted avg       0.83      0.83      0.83       200



In [65]:
from evolutionary_forest.forest import EvolutionaryForestRegressor
from evolutionary_forest.utils import get_feature_importance, plot_feature_importance, feature_append

# Train Evolutionary Forest
r = EvolutionaryForestRegressor(max_height=5, select='AutomaticLexicase',
                                gene_num=10, boost_size=100, n_gen=20, n_pop=200, cross_pb=1,
                                base_learner='Random-DT', n_process=1, verbose=True)

# Weighted features
X_train_weighted = X_train * ga_best_weights
X_test_weighted = X_test * ga_best_weights

r.fit(X_train_weighted, y_train)

# Predict
y_pred = r.predict(X_test_weighted)
predictions = [round(value) for value in y_pred]

print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions) * 100.0))

get_confussion_matrix(y_test, predictions)

data shape (800, 20) (800,)
   	      	                                                          fitness                                                           	                                  size                                  
   	      	----------------------------------------------------------------------------------------------------------------------------	------------------------------------------------------------------------
gen	nevals	25%         	75%          	avg         	gen	max          	median       	min          	nevals	std         	25%	75%	avg 	gen	max	median	min	nevals	std     
0  	200   	[-0.8186002]	[-0.69241161]	[-0.7543765]	0  	[-0.47269502]	[-0.75773276]	[-1.05464708]	200   	[0.09704033]	3  	4  	3.79	0  	5  	4     	3  	200   	0.621208
defaultdict(<class 'int'>, {'1': 200})
P value of different population 0.19073033347156776
Mul(1, checking_status)
AQ(existing_credits, other_parties)
Sub(AQ(duration, residence_since), AQ(installment_commitment, employment)