### Libraries

In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Dataset

#### Loading dataset:

In [3]:
df = pd.read_csv('load_breast_cancer.csv')

#### Data Analysis:

In [4]:
df.shape

(569, 31)

In [5]:
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
df.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [8]:
df.dtypes

diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst

In [9]:
# Set of all unique classes in the dataset
classes = set(df.diagnosis)
print("classes:", classes)

# Calculating the number of unique classes in the dataset
ny = len(classes)
print("Number of outputs (per example): ny =", ny)
print("It's a", ("binary" if ny==2 else "multi-class"), "classification problem.")

classes: {'M', 'B'}
Number of outputs (per example): ny = 2
It's a binary classification problem.


### Data Preprocessing

In [10]:
# X = features
X = df.drop('diagnosis', axis=1)
# y = labels
Y = df['diagnosis']
print("X:", X.shape)
print("y:", Y.shape)

X: (569, 30)
y: (569,)


In [11]:
#splitting the model into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y, test_size=0.30, 
                                                    random_state=101)

### Model

In [12]:
#training a logistics regression model
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
Accuracy_logistics_regression =  (round(accuracy_score(y_test,predictions),2))
print("Accuracy(in %):", Accuracy_logistics_regression)

Accuracy(in %): 0.92


defining various steps required for the genetic algorithm

In [13]:
def initilization_of_population(size,n_feat):
    population = []
    for i in range(size):
        chromosome = np.ones(n_feat,dtype=np.bool)
        chromosome[:int(0.3*n_feat)]=False
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population

In [14]:
def fitness_score(population):
    scores = []
    for chromosome in population:
        logmodel.fit(X_train.iloc[:,chromosome],y_train)
        predictions = logmodel.predict(X_test.iloc[:,chromosome])
        scores.append(accuracy_score(y_test,predictions))
    scores, population = np.array(scores), np.array(population) 
    inds = np.argsort(scores)
    return list(scores[inds][::-1]), list(population[inds,:][::-1])

In [15]:
def selection(pop_after_fit,n_parents):
    population_nextgen = []
    for i in range(n_parents):
        population_nextgen.append(pop_after_fit[i])
    return population_nextgen

In [16]:
def crossover(pop_after_sel):
    population_nextgen=pop_after_sel
    for i in range(len(pop_after_sel)):
        child=pop_after_sel[i]
        child[3:7]=pop_after_sel[(i+1)%len(pop_after_sel)][3:7]
        population_nextgen.append(child)
    return population_nextgen

In [17]:
def mutation(pop_after_cross,mutation_rate):
    population_nextgen = []
    for i in range(0,len(pop_after_cross)):
        chromosome = pop_after_cross[i]
        for j in range(len(chromosome)):
            if random.random() < mutation_rate:
                chromosome[j]= not chromosome[j]
        population_nextgen.append(chromosome)
    #print(population_nextgen)
    return population_nextgen

In [18]:
def generations(size,n_feat,n_parents,mutation_rate,n_gen,X_train,
                                   X_test, y_train, y_test):
    best_chromo= []
    best_score= []
    population_nextgen=initilization_of_population(size,n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        print(scores[:2])
        pop_after_sel = selection(pop_after_fit,n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross,mutation_rate)
        best_chromo.append(pop_after_fit[0])
        best_score.append(scores[0])
    return best_chromo,best_score

In [19]:
chromo,score=generations(size=200,n_feat=30,n_parents=100,mutation_rate=0.10,
                     n_gen=38,X_train=X_train,X_test=X_test,y_train=y_train,y_test=y_test)
logmodel.fit(X_train.iloc[:,chromo[-1]],y_train)
predictions = logmodel.predict(X_test.iloc[:,chromo[-1]])

[0.9532163742690059, 0.9532163742690059]
[0.9532163742690059, 0.9532163742690059]
[0.9590643274853801, 0.9590643274853801]
[0.9590643274853801, 0.9590643274853801]
[0.9532163742690059, 0.9532163742690059]
[0.9532163742690059, 0.9532163742690059]
[0.9707602339181286, 0.9707602339181286]
[0.9590643274853801, 0.9590643274853801]
[0.9532163742690059, 0.9532163742690059]
[0.9532163742690059, 0.9532163742690059]
[0.9532163742690059, 0.9532163742690059]
[0.9590643274853801, 0.9590643274853801]
[0.9707602339181286, 0.9707602339181286]
[0.9649122807017544, 0.9649122807017544]
[0.9532163742690059, 0.9532163742690059]
[0.9590643274853801, 0.9590643274853801]
[0.9707602339181286, 0.9707602339181286]
[0.9590643274853801, 0.9590643274853801]
[0.9590643274853801, 0.9590643274853801]
[0.9532163742690059, 0.9532163742690059]
[0.9532163742690059, 0.9532163742690059]
[0.9590643274853801, 0.9590643274853801]
[0.9590643274853801, 0.9590643274853801]
[0.9532163742690059, 0.9532163742690059]
[0.9590643274853

In [20]:
Accuracy_Genetic_logistics_regression=round(accuracy_score(y_test,predictions),2)
print("Accuracy score after genetic algorithm is(in %): ",Accuracy_Genetic_logistics_regression)

Accuracy score after genetic algorithm is(in %):  0.95


# Results

In [1]:
#!pip install prettytable

Defaulting to user installation because normal site-packages is not writeable
Collecting prettytable
  Using cached prettytable-3.5.0-py3-none-any.whl (26 kB)
Installing collected packages: prettytable
Successfully installed prettytable-3.5.0


In [22]:
import prettytable as ptbl
tbl = ptbl.PrettyTable(field_names=["--","logistics regression","Genetic logistics regression"])
data1 = [Accuracy_logistics_regression]

data2 = [Accuracy_Genetic_logistics_regression]

names = ["Accuracy"]

for d2,d1, n in zip(data2,data1, names):
    tbl.add_row([n, d1, d2])
    
tbl.align["--","logistics regression","Genetic logistics regression"] = "1"

print(tbl)

+----------+----------------------+------------------------------+
|    --    | logistics regression | Genetic logistics regression |
+----------+----------------------+------------------------------+
| Accuracy |         0.92         |             0.95             |
+----------+----------------------+------------------------------+
