# Artificial and Computational Intelligence Assignment 1 - Q12 - Genetic Algorithm

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math

from sklearn.model_selection import cross_val_score
from sklearn import svm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
#Load the data as given in the problem statement
def get_data():
    """Build the raw sample table given in the problem statement.

    Returns:
        pandas.DataFrame: 15 rows of string values with the columns
        'cough', 'fever', 'Red-eye', 'body' and, last, the 'Divoc' label.
    """
    header = ['cough', 'fever', 'Red-eye', 'body', 'Divoc']

    # One record per line, values separated by whitespace in `header` order.
    table = """
        High  No    Yes  No    No
        No    High  No   No    No
        No    High  Yes  High  Yes
        Low   High  Yes  High  Yes
        Low   High  Yes  Low   Yes
        No    High  No   Low   Yes
        High  Low   Yes  Low   Yes
        No    No    No   High  No
        No    Low   Yes  High  Yes
        High  Low   No   High  No
        Low   No    No   No    Yes
        Low   Low   No   High  No
        High  No    Yes  Low   Yes
        High  No    No   High  Yes
        No    No    No   No    No
    """
    records = [line.split() for line in table.strip().splitlines()]
    return pd.DataFrame(records, columns=header)

In [3]:
#Pre-process the data into the one-hot encoded form used by the chromosomes
def pre_processing(df_raw=None):
    """One-hot encode the symptom columns and binarise the target column.

    Args:
        df_raw: Optional raw DataFrame whose last column is the 'Divoc' label
            and whose other columns are categorical symptoms. When omitted,
            the table from ``get_data()`` is used. The input is not modified.

    Returns:
        tuple: ``(df, feature_list, target)`` where ``df`` holds the encoded
        feature columns (integer 0/1) followed by the target column
        (1 for 'Yes', 0 otherwise), ``feature_list`` is the ordered list of
        encoded feature names and ``target`` is the target column name.

    Raises:
        KeyError: If an expected encoded column is absent, e.g. because a
            category never occurs in ``df_raw``.
    """
    if df_raw is None:
        df_raw = get_data()

    # One-hot encode every column except the last one (the label).
    # dtype=int keeps the indicators as 0/1 integers on every pandas version
    # (newer versions default to booleans).
    symptom_columns = list(df_raw.columns[:-1])
    encoded = pd.get_dummies(df_raw, columns=symptom_columns, dtype=int)

    # Order of the columns corresponding to the encoding provided in the problem
    ordered_columns = ['cough_No', 'cough_Low', 'cough_High',
                       'fever_No', 'fever_Low', 'fever_High',
                       'Red-eye_No', 'Red-eye_Yes',
                       'body_No', 'body_Low', 'body_High',
                       'Divoc']

    # Every entry but the last is a feature; the last entry is the target.
    feature_list = ordered_columns[:-1]
    target = ordered_columns[-1]

    # Take an explicit copy so the assignment below writes to an independent
    # frame instead of a column-subset of `encoded` (avoids SettingWithCopyWarning).
    df = encoded[ordered_columns].copy()

    # Convert the target variable into binary: 'Yes' -> 1, anything else -> 0
    df[target] = df[target].eq('Yes').astype(int)

    return df, feature_list, target

In [4]:
#Define the initial hypothesis for population P0
def init_population():
    """Return the fixed initial population P0.

    Returns:
        numpy.ndarray: Array of shape (4, 12) whose entries are the single
        characters '0' and '1', one chromosome per row.
    """
    p0_bitstrings = (
        '010010110011',
        '101011001101',
        '101010101001',
        '010111101001',
    )
    return np.array([list(bits) for bits in p0_bitstrings])

In [5]:
#Crossover operation with mask
def mask_crossover(population):
    # mask is given in the problem
    mask=list('000011101100') 
    
    #get number of chromosomes in the population
    r = population.shape[0]
    
    # select a pair of chromosomes at a time from the population to perform crossover
    for i in range(0,r,2):                
        for c in range(len(mask)):
            
            # if mask has 1 in a given position then flip the gene between 2 chromosomes
            if mask[c] == '1': 
                p = population[i][c]
                population[i][c] = population[i+1][c]
                population[i+1][c]=p
    
    #return the crossed over population
    return population 

In [6]:
#Fitness Function
def predictive_model(X, y, cv=6):
    """Fitness function: mean cross-validated score of an SVM on ``X``.

    A support vector classifier with a degree-2 polynomial kernel is scored
    with ``cross_val_score``; cross validation is used instead of a single
    train/test split because the sample is small.

    Args:
        X: 2-D feature table (DataFrame or array), one row per sample.
        y: Target labels, one per row of ``X``.
        cv: Cross-validation setting passed to ``cross_val_score``
            (defaults to 6 folds).

    Returns:
        numpy.float64: Mean of the per-fold scores, or 0.0 when ``X`` has no
        feature columns.
    """
    # A chromosome that selects no features cannot be fitted; give it the
    # worst fitness instead of letting the classifier raise and abort the run.
    shape = getattr(X, 'shape', None)
    if shape is not None and len(shape) == 2 and shape[1] == 0:
        return np.float64(0.0)

    clf = svm.SVC(kernel='poly', degree=2, random_state=0)

    # scores holds one value per cross-validation fold
    scores = cross_val_score(clf, X, y, cv=cv)

    # return the mean value of the fold scores
    return scores.mean()

In [7]:
#Get the fitness of a given population 
def get_fitness(data, feature_list, target, population):
    """Score every chromosome of the population with the predictive model.

    Args:
        data: DataFrame holding the encoded feature columns and the target.
        feature_list: Feature column names, in gene order.
        target: Name of the target column in ``data``.
        population: 2-D array of '0'/'1' characters, one chromosome per row.
            The last gene of each chromosome is not used for feature selection.

    Returns:
        list: One fitness value per chromosome, in population order.
    """
    scores = []

    for chromosome in population:
        # keep the features whose gene is switched on ('1'), ignoring the last gene
        selected = [
            feature_list[pos]
            for pos, gene in enumerate(chromosome[:-1])
            if gene == '1'
        ]

        # fitness of the chromosome = model score on the selected columns
        score = predictive_model(data[selected], data[target])
        scores.append(score)

    return scores

In [8]:
#Main function that controls the genetic algorithm.
def ga(data, feature_list, target, max_iter):
    """Run the genetic algorithm and report the initial and the best solution.

    The population starts as P0 from ``init_population()``. Each generation
    applies ``mask_crossover`` to the population and re-scores it with
    ``get_fitness``; the best chromosome seen so far is retained. No
    selection or mutation step is performed here.

    Args:
        data: Encoded DataFrame with the feature columns and the target column.
        feature_list: Feature column names, in gene order.
        target: Name of the target column.
        max_iter: Number of generations (crossover rounds) to evaluate.

    Returns:
        tuple: ``(init_solution, init_fitness, optimal_solution, optimal_value)``
        with the fittest chromosome of P0 and its fitness, followed by the
        fittest chromosome over all generations and its fitness. On ties the
        first chromosome with the highest fitness is chosen.
    """
    # initiate the population to the given population P0 in the problem
    population = init_population()

    # get the fitness scores of P0
    fitness = get_fitness(data, feature_list, target, population)

    # Pick the fittest chromosome by index. argmax works for any numeric
    # fitness values, unlike comparing the fitness list against a scalar.
    # The chromosome is copied because crossover later mutates `population`
    # in place and a plain row would change with it.
    best_idx = int(np.argmax(fitness))
    optimal_value = fitness[best_idx]
    optimal_solution = population[best_idx].copy()

    # assign the baseline values to init variables
    init_fitness = optimal_value
    init_solution = optimal_solution

    # test the fitness for several generations
    for _ in range(max_iter):
        # each crossover is one generation
        population = mask_crossover(population)
        fitness = get_fitness(data, feature_list, target, population)

        # keep the generation's fittest chromosome only if it beats the best so far
        best_idx = int(np.argmax(fitness))
        if fitness[best_idx] > optimal_value:
            optimal_value = fitness[best_idx]
            optimal_solution = population[best_idx].copy()

    return init_solution, init_fitness, optimal_solution, optimal_value

In [9]:
#Display the sample data given in the problem statement
#(the bare call is the last expression, so the DataFrame renders as the cell output)
get_data()

Unnamed: 0,cough,fever,Red-eye,body,Divoc
0,High,No,Yes,No,No
1,No,High,No,No,No
2,No,High,Yes,High,Yes
3,Low,High,Yes,High,Yes
4,Low,High,Yes,Low,Yes
5,No,High,No,Low,Yes
6,High,Low,Yes,Low,Yes
7,No,No,No,High,No
8,No,Low,Yes,High,Yes
9,High,Low,No,High,No


In [10]:
#Get the encoded data, the list of feature names and the name of the target variable to predict
data, feature_list, target=pre_processing()
#Show the feature names as a one-column table
pd.DataFrame(feature_list,columns=['Features_List'])

Unnamed: 0,Features_List
0,cough_No
1,cough_Low
2,cough_High
3,fever_No
4,fever_Low
5,fever_High
6,Red-eye_No
7,Red-eye_Yes
8,body_No
9,body_Low


In [11]:
#Show the name of the target column as a one-row table
pd.DataFrame([target],columns=['Target'])

Unnamed: 0,Target
0,Divoc


In [12]:
#Print a caption, then display the encoded dataset as the cell output
print('Sample Input Data :')
data

Sample Input Data :


Unnamed: 0,cough_No,cough_Low,cough_High,fever_No,fever_Low,fever_High,Red-eye_No,Red-eye_Yes,body_No,body_Low,body_High,Divoc
0,0,0,1,1,0,0,0,1,1,0,0,0
1,1,0,0,0,0,1,1,0,1,0,0,0
2,1,0,0,0,0,1,0,1,0,0,1,1
3,0,1,0,0,0,1,0,1,0,0,1,1
4,0,1,0,0,0,1,0,1,0,1,0,1
5,1,0,0,0,0,1,1,0,0,1,0,1
6,0,0,1,0,1,0,0,1,0,1,0,1
7,1,0,0,1,0,0,1,0,0,0,1,0
8,1,0,0,0,1,0,0,1,0,0,1,1
9,0,0,1,0,1,0,1,0,0,0,1,0


In [13]:
# Run the genetic algorithm for one generation. It returns the fittest chromosome
# of P0 and the fittest chromosome found overall, each with its fitness score.
init_chromosome, init_score, best_chromosome, optimized_score = ga(data, feature_list, target, 1)

# Translate each chromosome into the names of the features whose gene is '1'
init_features = [name for pos, name in enumerate(feature_list) if init_chromosome[pos] == '1']
optimized_features = [name for pos, name in enumerate(feature_list) if best_chromosome[pos] == '1']

# Report both feature sets with their score as a whole-number percentage
print('Initial Feature Set\n', init_features, '\nInitial Accuracy =', round(init_score*100), '%')
print('\nOptimal Feature Set\n', optimized_features, '\nOptimal Accuracy =', round(optimized_score*100), '%')

Initial Feature Set
 ['cough_No', 'cough_High', 'fever_Low', 'Red-eye_No', 'body_No'] 
Initial Accuracy = 75 %

Optimal Feature Set
 ['cough_No', 'cough_High', 'fever_Low', 'Red-eye_No'] 
Optimal Accuracy = 89 %


##### Done