<a href="https://colab.research.google.com/github/pascalpap20/PSO_SISDAS/blob/main/PSO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import modules
import numpy as np
import seaborn as sns
import pandas as pd

# Import PySwarms
import pyswarms as ps

In [138]:
from sklearn.datasets import make_classification
# Build a small synthetic 3-class dataset; the fixed random_state makes it repeatable
X, y = make_classification(
    n_samples=100,
    n_features=15,
    n_classes=3,
    n_informative=4,
    n_redundant=1,
    n_repeated=2,
    random_state=1,
)
X

array([[ 6.55083667e-01, -2.12896325e+00, -6.85016261e-01, ...,
         8.84752891e-01, -6.01172980e-01,  9.18193198e-01],
       [-3.03401215e-01, -1.37832774e+00, -5.66001920e-01, ...,
        -4.94118281e-01, -5.12463879e-01,  1.41817410e-01],
       [ 7.24705998e-01, -9.86520969e-01,  3.89249109e-01, ...,
        -2.71892333e-01, -2.73139846e+00,  9.36141731e-01],
       ...,
       [ 6.71384895e-01, -3.36406540e-01,  5.63320696e-01, ...,
         5.72541137e-01, -2.33925006e+00, -6.85242411e-02],
       [ 1.91731864e+00, -3.06017562e-01,  1.84991652e+00, ...,
         2.97657017e-01,  3.07610220e-01, -7.30575562e-04],
       [ 1.47750104e+00,  1.62713025e+00, -1.23400574e+00, ...,
        -4.94581809e-01,  4.82936485e-01, -2.00930106e+00]])

In [99]:
# Put the toy features and their class labels into one frame for inspection
df = pd.DataFrame(X).assign(labels=y)

# sns.pairplot(df, hue='labels');  # optional pairwise plot, left disabled
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,labels
0,0.655084,-2.128963,-0.685016,1.922747,0.645288,-2.224576,1.198653,-1.442570,0.225801,1.922747,2.454673,1.198653,0.884753,-0.601173,0.918193,0
1,-0.303401,-1.378328,-0.566002,0.114298,-0.239843,-0.310322,1.501327,-0.044018,-0.472958,0.114298,0.492101,1.501327,-0.494118,-0.512464,0.141817,0
2,0.724706,-0.986521,0.389249,-1.895799,0.042914,0.472371,-0.318019,0.951178,0.644809,-1.895799,0.418651,-0.318019,-0.271892,-2.731398,0.936142,0
3,0.049146,-0.588923,0.326328,0.922614,1.131195,-0.983339,0.549084,1.621353,1.479451,0.922614,-0.951512,0.549084,-0.470488,0.787868,-0.696832,2
4,0.378306,-1.327988,0.647906,0.877558,-1.091076,-1.513970,1.208688,-0.459604,1.786182,0.877558,-1.650467,1.208688,-0.393068,1.295161,-0.512984,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.177226,2.043693,1.136525,0.267609,-0.840022,2.573524,-0.365685,-1.487846,-0.479539,0.267609,-0.702859,-0.365685,1.137251,-1.456116,0.617562,2
96,0.540818,-0.568785,0.470516,2.667254,-0.087918,-3.171549,-1.952123,-1.275209,-0.458240,2.667254,1.099393,-1.952123,-0.022824,-0.656279,-1.181429,0
97,0.671385,-0.336407,0.563321,1.780646,0.784192,-0.075812,0.302305,0.317170,-0.217989,1.780646,-0.837611,0.302305,0.572541,-2.339250,-0.068524,0
98,1.917319,-0.306018,1.849917,-0.225949,0.609191,1.034543,1.791212,0.073941,0.314464,-0.225949,-0.253615,1.791212,0.297657,0.307610,-0.000731,1


In [None]:
from sklearn import linear_model

# Create an instance of the classifier.
# This single module-level estimator is the one f_per_particle refits on every call.
classifier = linear_model.LogisticRegression()

# Define objective function
def f_per_particle(m, alpha):
    """Compute the feature-selection cost of a single particle.

    Reads the module-level ``X``, ``y`` and ``classifier``; the classifier
    is refit on every call.

    Inputs
    ------
    m : numpy.ndarray
        Binary mask that can be obtained from BinaryPSO; entries equal
        to 1 mark the feature columns to keep.
    alpha : float
        Constant weight for trading-off classifier performance
        and number of features. There is no default here; the caller
        ``f`` supplies the value.

    Returns
    -------
    float
        ``alpha * (1 - P) + (1 - alpha) * (1 - n_selected / n_total)``,
        where ``P`` is the accuracy measured on the same rows the
        classifier was fitted on (no hold-out set is used).
    """
    # Derive the feature count from the data instead of hard-coding it
    total_features = X.shape[1]
    # Get the subset of the features from the binary mask.
    # An all-zero mask falls back to the full feature set.
    if np.count_nonzero(m) == 0:
        X_subset = X
    else:
        X_subset = X[:,m==1]
    # Perform classification and store performance in P
    classifier.fit(X_subset, y)
    P = (classifier.predict(X_subset) == y).mean()
    # Compute for the objective function.
    # NOTE(review): the second term shrinks as more features are kept, so
    # minimising j favours larger subsets -- confirm this is intended.
    j = (alpha * (1.0 - P)
        + (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))

    return j

In [None]:
def f(x, alpha=0.88):
    """Evaluate the objective for every particle of the swarm.

    Inputs
    ------
    x: numpy.ndarray of shape (n_particles, dimensions)
        Current swarm positions; each row is passed to
        ``f_per_particle`` as one mask.
    alpha: float (default is 0.88)
        Trade-off weight forwarded unchanged to ``f_per_particle``.

    Returns
    -------
    numpy.ndarray of shape (n_particles, )
        The computed loss for each particle, in row order
    """
    costs = []
    for particle in x:
        costs.append(f_per_particle(particle, alpha))
    return np.array(costs)

In [None]:
# Initialize swarm, arbitrary hyper-parameters passed to BinaryPSO
options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}

# Call instance of PSO.
# One dimension per feature, read from the data so it cannot drift out of sync.
dimensions = X.shape[1]
optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)

# Perform optimization: `cost` is the best objective value found,
# `pos` the corresponding binary feature mask
cost, pos = optimizer.optimize(f, iters=100, verbose=2)

In [None]:
# Create a fresh LogisticRegression for the final fit
classifier = linear_model.LogisticRegression()

# Get the selected features from the final positions
X_selected_features = X[:,pos==1]  # subset

# Fit on the selected subset; the fitted estimator is this cell's output
classifier.fit(X_selected_features, y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Pairwise plot of the PSO-selected features, coloured by class label
df1 = pd.DataFrame(X_selected_features).assign(labels=y)

sns.pairplot(data=df1, hue='labels')

# PSO DENGAN DATASET

In [None]:
# Import modules
import numpy as np
import seaborn as sns
import pandas as pd

# Import PySwarms
import pyswarms as ps

In [59]:
# Absolute path of the raw obesity CSV
# NOTE(review): looks like a Colab Google Drive mount path -- adjust when running elsewhere
csv_path = '/content/drive/MyDrive/SISDAS_PSO/ObesityDataSet_raw_and_data_sinthetic.csv'
data = pd.read_csv(csv_path)

data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


Bagi atribut dan labelnya

In [49]:

# The first 16 columns are the attributes; the remaining column (NObeyesdad) is the label.
# .copy() makes this an independent frame, so the column-wise encoding applied to
# it later does not operate on a slice of `data` (avoids SettingWithCopyWarning).
dataAtribut = data.iloc[:,0:16].copy()
dataAtribut

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation


In [76]:
# Keep the target column as a one-column DataFrame
label = data[['NObeyesdad']].copy()
label

Unnamed: 0,NObeyesdad
0,Normal_Weight
1,Normal_Weight
2,Normal_Weight
3,Overweight_Level_I
4,Overweight_Level_II
...,...
2106,Obesity_Type_III
2107,Obesity_Type_III
2108,Obesity_Type_III
2109,Obesity_Type_III


# Praproses data

Label Encoding Atribut family_history_with_overweight, FAVC, SMOKE, SCC <- ordinal


In [43]:
# Label Encoding Atribut family_history_with_overweight, FAVC, SMOKE, SCC <- ordinal
from sklearn.preprocessing import LabelEncoder

# One shared encoder, refit for each yes/no column in turn
l1 = LabelEncoder()

for column in ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']:
    # fit() returns the encoder itself, so fit and transform can be chained
    dataAtribut[column] = l1.fit(dataAtribut[column]).transform(dataAtribut[column])

dataAtribut

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Female,21.000000,1.620000,64.000000,1,0,2.0,3.0,Sometimes,0,2.000000,0,0.000000,1.000000,no,Public_Transportation
1,Female,21.000000,1.520000,56.000000,1,0,3.0,3.0,Sometimes,1,3.000000,1,3.000000,0.000000,Sometimes,Public_Transportation
2,Male,23.000000,1.800000,77.000000,1,0,2.0,3.0,Sometimes,0,2.000000,0,2.000000,1.000000,Frequently,Public_Transportation
3,Male,27.000000,1.800000,87.000000,0,0,3.0,3.0,Sometimes,0,2.000000,0,2.000000,0.000000,Frequently,Walking
4,Male,22.000000,1.780000,89.800000,0,0,2.0,1.0,Sometimes,0,2.000000,0,0.000000,0.000000,Sometimes,Public_Transportation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,1,1,3.0,3.0,Sometimes,0,1.728139,0,1.676269,0.906247,Sometimes,Public_Transportation
2107,Female,21.982942,1.748584,133.742943,1,1,3.0,3.0,Sometimes,0,2.005130,0,1.341390,0.599270,Sometimes,Public_Transportation
2108,Female,22.524036,1.752206,133.689352,1,1,3.0,3.0,Sometimes,0,2.054193,0,1.414209,0.646288,Sometimes,Public_Transportation
2109,Female,24.361936,1.739450,133.346641,1,1,3.0,3.0,Sometimes,0,2.852339,0,1.139107,0.586035,Sometimes,Public_Transportation


One-Hot Encoding atribut kategorikal Gender, CAEC, CALC, MTRANS (non-ordinal) dengan `pd.get_dummies`

In [54]:
# Expand the remaining string-valued columns into indicator (dummy) columns
transformed_dataset = pd.get_dummies(dataAtribut)
transformed_dataset

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,family_history_with_overweight_no,family_history_with_overweight_yes,FAVC_no,FAVC_yes,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,SMOKE_no,SMOKE_yes,SCC_no,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21.000000,1.620000,64.000000,2.0,3.0,2.000000,0.000000,1.000000,1,0,0,1,1,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0
1,21.000000,1.520000,56.000000,3.0,3.0,3.000000,3.000000,0.000000,1,0,0,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0
2,23.000000,1.800000,77.000000,2.0,3.0,2.000000,2.000000,1.000000,0,1,0,1,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0
3,27.000000,1.800000,87.000000,3.0,3.0,2.000000,2.000000,0.000000,0,1,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1
4,22.000000,1.780000,89.800000,2.0,1.0,2.000000,0.000000,0.000000,0,1,1,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,20.976842,1.710730,131.408528,3.0,3.0,1.728139,1.676269,0.906247,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,1,0
2107,21.982942,1.748584,133.742943,3.0,3.0,2.005130,1.341390,0.599270,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,1,0
2108,22.524036,1.752206,133.689352,3.0,3.0,2.054193,1.414209,0.646288,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,1,0
2109,24.361936,1.739450,133.346641,3.0,3.0,2.852339,1.139107,0.586035,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,1,0


In [63]:
# Inspect the column names produced by the encoding step
transformed_dataset.columns

Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender_Female', 'Gender_Male', 'family_history_with_overweight_no',
       'family_history_with_overweight_yes', 'FAVC_no', 'FAVC_yes',
       'CAEC_Always', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no',
       'SMOKE_no', 'SMOKE_yes', 'SCC_no', 'SCC_yes', 'CALC_Always',
       'CALC_Frequently', 'CALC_Sometimes', 'CALC_no', 'MTRANS_Automobile',
       'MTRANS_Bike', 'MTRANS_Motorbike', 'MTRANS_Public_Transportation',
       'MTRANS_Walking'],
      dtype='object')

In [148]:
from sklearn.preprocessing import StandardScaler

# Take the column list from the encoded frame itself so this cell stays in
# sync with whatever the encoding cells produced; a hand-written list raises
# KeyError as soon as the set of dummy columns changes.
col_names = list(transformed_dataset.columns)

# StandardScaler standardises every column to zero mean and unit variance
scaler = StandardScaler().fit(transformed_dataset[col_names].values)
features = scaler.transform(transformed_dataset[col_names].values)

# Rebuild a labelled frame around the scaled values, keeping the original index
scaled_features = pd.DataFrame(features, columns=col_names, index=transformed_dataset.index)

normalized_data = pd.DataFrame(scaled_features)
normalized_data

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,family_history_with_overweight_no,family_history_with_overweight_yes,FAVC_no,FAVC_yes,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,SMOKE_no,SMOKE_yes,SCC_no,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,-0.522124,-0.875589,-0.862558,-0.785019,0.404153,-0.013073,-1.188039,0.561997,1.011914,-1.011914,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,-1.404720,1.517761,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
1,-0.522124,-1.947599,-1.168077,1.088342,0.404153,1.618759,2.339750,-1.080625,1.011914,-1.011914,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,-6.853997,6.853997,-4.581439,4.581439,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
2,-0.206889,1.054029,-0.366090,-0.785019,0.404153,-0.013073,1.163820,0.561997,-0.988227,0.988227,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,5.399735,-1.404720,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
3,0.423582,1.054029,0.015808,1.088342,0.404153,-0.013073,1.163820,-1.080625,-0.988227,0.988227,2.117337,-2.117337,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,5.399735,-1.404720,-0.658865,-0.525642,-0.05768,-0.072375,-1.724969,6.057758
4,-0.364507,0.839627,0.122740,-0.785019,-2.167023,-0.013073,-1.188039,-1.080625,-0.988227,0.988227,2.117337,-2.117337,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,-0.525774,0.097045,1.711763,1.088342,0.404153,-0.456705,0.783135,0.407996,1.011914,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
2107,-0.367195,0.502844,1.800914,1.088342,0.404153,-0.004702,0.389341,-0.096251,1.011914,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
2108,-0.281909,0.541672,1.798868,1.088342,0.404153,0.075361,0.474971,-0.019018,1.011914,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
2109,0.007776,0.404927,1.785780,1.088342,0.404153,1.377801,0.151471,-0.117991,1.011914,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078


In [149]:
# Re-attach the target column to the scaled attributes (aligned on the row index)
preprocessed_data = normalized_data.join(other=label, how='left')
preprocessed_data

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,family_history_with_overweight_no,family_history_with_overweight_yes,FAVC_no,FAVC_yes,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,SMOKE_no,SMOKE_yes,SCC_no,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,NObeyesdad
0,-0.522124,-0.875589,-0.862558,-0.785019,0.404153,-0.013073,-1.188039,0.561997,1.011914,-1.011914,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,-1.404720,1.517761,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Normal_Weight
1,-0.522124,-1.947599,-1.168077,1.088342,0.404153,1.618759,2.339750,-1.080625,1.011914,-1.011914,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,-6.853997,6.853997,-4.581439,4.581439,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Normal_Weight
2,-0.206889,1.054029,-0.366090,-0.785019,0.404153,-0.013073,1.163820,0.561997,-0.988227,0.988227,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,5.399735,-1.404720,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Normal_Weight
3,0.423582,1.054029,0.015808,1.088342,0.404153,-0.013073,1.163820,-1.080625,-0.988227,0.988227,2.117337,-2.117337,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,5.399735,-1.404720,-0.658865,-0.525642,-0.05768,-0.072375,-1.724969,6.057758,Overweight_Level_I
4,-0.364507,0.839627,0.122740,-0.785019,-2.167023,-0.013073,-1.188039,-1.080625,-0.988227,0.988227,2.117337,-2.117337,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,-0.525774,0.097045,1.711763,1.088342,0.404153,-0.456705,0.783135,0.407996,1.011914,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Obesity_Type_III
2107,-0.367195,0.502844,1.800914,1.088342,0.404153,-0.004702,0.389341,-0.096251,1.011914,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Obesity_Type_III
2108,-0.281909,0.541672,1.798868,1.088342,0.404153,0.075361,0.474971,-0.019018,1.011914,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Obesity_Type_III
2109,0.007776,0.404927,1.785780,1.088342,0.404153,1.377801,0.151471,-0.117991,1.011914,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.02177,-0.185194,0.711885,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Obesity_Type_III


In [150]:
# Plain NumPy arrays of the scaled attributes and of the target labels
test = normalized_data.values
labelTest = label.values.flatten()
labelTest.shape

(2111,)

In [155]:
from sklearn import linear_model

# Create an instance of the classifier.
# max_iter=1000 raises the solver's iteration cap; this module-level estimator
# is the one f_per_particle refits on every call.
classifier = linear_model.LogisticRegression(max_iter=1000)

# Define objective function
def f_per_particle(m, alpha):
    """Compute the feature-selection cost of a single particle.

    Reads the module-level ``normalized_data``, ``label`` and
    ``classifier``; the classifier is refit on every call.

    Inputs
    ------
    m : numpy.ndarray
        Binary mask that can be obtained from BinaryPSO; entries equal
        to 1 mark the feature columns to keep.
    alpha : float
        Constant weight for trading-off classifier performance
        and number of features. There is no default here; the caller
        ``f`` supplies the value.

    Returns
    -------
    float
        ``alpha * (1 - P) + (1 - alpha) * (1 - n_selected / n_total)``,
        where ``P`` is the accuracy measured on the same rows the
        classifier was fitted on (no hold-out set is used).
    """
    # Convert the frames once per call instead of once per use
    features = normalized_data.to_numpy()
    targets = label.to_numpy().flatten()
    # Derive the feature count from the data instead of hard-coding it
    total_features = features.shape[1]
    # Get the subset of the features from the binary mask.
    # An all-zero mask falls back to the full feature set.
    if np.count_nonzero(m) == 0:
        X_subset = features
    else:
        X_subset = features[:,m==1]
    # Perform classification and store performance in P
    classifier.fit(X_subset, targets)
    P = (classifier.predict(X_subset) == targets).mean()
    # Compute for the objective function.
    # NOTE(review): the second term shrinks as more features are kept, so
    # minimising j favours larger subsets -- confirm this is intended.
    j = (alpha * (1.0 - P)
        + (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))

    return j

In [156]:
def f(x, alpha=0.88):
    """Evaluate the objective for every particle of the swarm.

    Inputs
    ------
    x: numpy.ndarray of shape (n_particles, dimensions)
        Current swarm positions; each row is passed to
        ``f_per_particle`` as one mask.
    alpha: float (default is 0.88)
        Trade-off weight forwarded unchanged to ``f_per_particle``.

    Returns
    -------
    numpy.ndarray of shape (n_particles, )
        The computed loss for each particle, in row order
    """
    costs = []
    for particle in x:
        costs.append(f_per_particle(particle, alpha))
    return np.array(costs)

In [159]:
# Initialize swarm, arbitrary hyper-parameters passed to BinaryPSO
options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}

# Call instance of PSO.
# One dimension per feature column, read from the data so the mask length
# always matches the number of columns in normalized_data.
dimensions = normalized_data.shape[1]
optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)

# Perform optimization: `cost` is the best objective value found,
# `pos` the corresponding binary feature mask
cost, pos = optimizer.optimize(f, iters=100, verbose=2)

2021-06-13 11:54:57,276 - pyswarms.discrete.binary - INFO - Optimize for 100 iters with {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 30, 'p': 2}

















pyswarms.discrete.binary:   0%|          |0/100[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















pyswarms.discrete.binary:   0%|          |0/100, best_cost=0.122[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















pyswarms.discrete.binary:   1%|          |1/100, best_cost=0.122[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















pyswarms.discrete.binary:   1%|          |1/100, best_cost=0.111[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















pyswarms.discrete.binary:   2%|▏         |2/100, best_cost=0.111[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















pyswarms.discrete.binary:   2%|▏         |2/100, best_cost=0.111[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















pyswarms.discrete.binary:   3%|▎         |3/100, best_cost=0.111[A[A

In [162]:
# Create a fresh LogisticRegression for the final fit; max_iter=1000 matches
# the configuration of the estimator used inside the objective function
classifier = linear_model.LogisticRegression(max_iter=1000)

# Get the selected features from the final positions
X_selected_features = normalized_data.to_numpy()[:,pos==1]  # subset

# Fit on the selected subset; the fitted estimator is this cell's output
classifier.fit(X_selected_features, label.to_numpy().flatten())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [169]:
# Best binary mask returned by optimizer.optimize (1 = feature kept)
pos

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1])

In [181]:
# Column names of the scaled frame, in the order the mask entries refer to
normalized_data.columns

Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender_Female', 'Gender_Male', 'family_history_with_overweight_no',
       'family_history_with_overweight_yes', 'FAVC_no', 'FAVC_yes',
       'CAEC_Always', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no',
       'SMOKE_no', 'SMOKE_yes', 'SCC_no', 'SCC_yes', 'CALC_Always',
       'CALC_Frequently', 'CALC_Sometimes', 'CALC_no', 'MTRANS_Automobile',
       'MTRANS_Bike', 'MTRANS_Motorbike', 'MTRANS_Public_Transportation',
       'MTRANS_Walking'],
      dtype='object')

In [182]:
# Names of the columns whose mask entry is 1, in their original order
selected_column = [
    normalized_data.columns[idx]
    for idx, bit in enumerate(pos)
    if bit == 1
]

selected_column

['Height',
 'Weight',
 'FCVC',
 'NCP',
 'CH2O',
 'FAF',
 'TUE',
 'Gender_Male',
 'family_history_with_overweight_no',
 'family_history_with_overweight_yes',
 'FAVC_no',
 'FAVC_yes',
 'CAEC_Always',
 'CAEC_Frequently',
 'CAEC_Sometimes',
 'CAEC_no',
 'SMOKE_no',
 'SMOKE_yes',
 'SCC_no',
 'SCC_yes',
 'CALC_Frequently',
 'CALC_no',
 'MTRANS_Automobile',
 'MTRANS_Bike',
 'MTRANS_Motorbike',
 'MTRANS_Public_Transportation',
 'MTRANS_Walking']

In [184]:
# Keep every row, but only the columns picked by the PSO mask
final_dataset_attribute = normalized_data.loc[:, selected_column]
final_dataset_attribute

Unnamed: 0,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Male,family_history_with_overweight_no,family_history_with_overweight_yes,FAVC_no,FAVC_yes,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,SMOKE_no,SMOKE_yes,SCC_no,SCC_yes,CALC_Frequently,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,-0.875589,-0.862558,-0.785019,0.404153,-0.013073,-1.188039,0.561997,-1.011914,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,1.517761,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
1,-1.947599,-1.168077,1.088342,0.404153,1.618759,2.339750,-1.080625,-1.011914,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,-6.853997,6.853997,-4.581439,4.581439,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
2,1.054029,-0.366090,-0.785019,0.404153,-0.013073,1.163820,0.561997,0.988227,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,5.399735,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
3,1.054029,0.015808,1.088342,0.404153,-0.013073,1.163820,-1.080625,0.988227,2.117337,-2.117337,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,5.399735,-0.658865,-0.525642,-0.05768,-0.072375,-1.724969,6.057758
4,0.839627,0.122740,-0.785019,-2.167023,-0.013073,-1.188039,-1.080625,0.988227,2.117337,-2.117337,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0.097045,1.711763,1.088342,0.404153,-0.456705,0.783135,0.407996,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
2107,0.502844,1.800914,1.088342,0.404153,-0.004702,0.389341,-0.096251,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
2108,0.541672,1.798868,1.088342,0.404153,0.075361,0.474971,-0.019018,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078
2109,0.404927,1.785780,1.088342,0.404153,1.377801,0.151471,-0.117991,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078


In [186]:
# Selected attributes plus the target column (aligned on the row index)
final_dataset = final_dataset_attribute.join(other=label, how='left')
final_dataset

Unnamed: 0,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Male,family_history_with_overweight_no,family_history_with_overweight_yes,FAVC_no,FAVC_yes,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,SMOKE_no,SMOKE_yes,SCC_no,SCC_yes,CALC_Frequently,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,NObeyesdad
0,-0.875589,-0.862558,-0.785019,0.404153,-0.013073,-1.188039,0.561997,-1.011914,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,1.517761,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Normal_Weight
1,-1.947599,-1.168077,1.088342,0.404153,1.618759,2.339750,-1.080625,-1.011914,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,-6.853997,6.853997,-4.581439,4.581439,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Normal_Weight
2,1.054029,-0.366090,-0.785019,0.404153,-0.013073,1.163820,0.561997,0.988227,-0.472291,0.472291,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,5.399735,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Normal_Weight
3,1.054029,0.015808,1.088342,0.404153,-0.013073,1.163820,-1.080625,0.988227,2.117337,-2.117337,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,5.399735,-0.658865,-0.525642,-0.05768,-0.072375,-1.724969,6.057758,Overweight_Level_I
4,0.839627,0.122740,-0.785019,-2.167023,-0.013073,-1.188039,-1.080625,0.988227,2.117337,-2.117337,2.759769,-2.759769,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0.097045,1.711763,1.088342,0.404153,-0.456705,0.783135,0.407996,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Obesity_Type_III
2107,0.502844,1.800914,1.088342,0.404153,-0.004702,0.389341,-0.096251,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Obesity_Type_III
2108,0.541672,1.798868,1.088342,0.404153,0.075361,0.474971,-0.019018,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Obesity_Type_III
2109,0.404927,1.785780,1.088342,0.404153,1.377801,0.151471,-0.117991,-1.011914,-0.472291,0.472291,-0.362349,0.362349,-0.160478,-0.359835,0.442757,-0.157344,0.145900,-0.145900,0.218272,-0.218272,-0.185194,-0.658865,-0.525642,-0.05768,-0.072375,0.579721,-0.165078,Obesity_Type_III


Klasifikasi Logistic Regression dengan seleksi fitur

In [239]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Hold out 30% of the rows for testing; the fixed seed keeps the partition
# repeatable. NOTE(review): final_dataset_attribute is presumably the
# feature-selected matrix, going by this section's heading -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    final_dataset_attribute,
    label,
    test_size=0.3,
    random_state=11,
)

# Logistic regression capped at 1000 solver iterations. The training labels
# are converted to a flat 1-D array before being handed to fit().
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X=X_train, y=y_train.to_numpy().flatten())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [240]:
# Predict once and reuse the result for the accuracy figure:
# logreg.score(X_test, y_test) would run predict() on X_test a second time
# and leave y_pred unused in this cell. `metrics` is the sklearn.metrics
# module imported in the training cell above.
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(metrics.accuracy_score(y_test, y_pred)))

Accuracy of logistic regression classifier on test set: 0.88


In [222]:
from sklearn.metrics import confusion_matrix
# Keep the result under its own name. Assigning it back to `confusion_matrix`
# would rebind the imported function to an array, so any later call to
# confusion_matrix(...) without a fresh import would raise a TypeError.
cm_with_selection = confusion_matrix(y_test, y_pred)
print(cm_with_selection)

[[ 86   4   0   0   0   0   0]
 [ 11  68   0   0   0   5   3]
 [  0   0  92   3   5   0   3]
 [  0   0   0  90   0   0   0]
 [  0   0   0   1 100   0   0]
 [  0   8   1   0   0  62  10]
 [  0   1   8   0   1   9  63]]


In [200]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 and support for the held-out predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

                     precision    recall  f1-score   support

Insufficient_Weight       0.89      0.96      0.92        90
      Normal_Weight       0.84      0.78      0.81        87
     Obesity_Type_I       0.91      0.89      0.90       103
    Obesity_Type_II       0.96      1.00      0.98        90
   Obesity_Type_III       0.94      0.99      0.97       101
 Overweight_Level_I       0.82      0.77      0.79        81
Overweight_Level_II       0.80      0.77      0.78        82

           accuracy                           0.88       634
          macro avg       0.88      0.88      0.88       634
       weighted avg       0.88      0.88      0.88       634



Klasifikasi Logistic Regression tanpa seleksi fitur

In [241]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Same 70/30 split and seed as the feature-selection section, but applied to
# normalized_data. NOTE(review): presumably the full normalized feature set,
# going by this section's heading -- confirm.
# This rebinds X_train / X_test / y_train / y_test and logreg, so the
# evaluation cells below report on whichever training cell ran last.
X_train, X_test, y_train, y_test = train_test_split(
    normalized_data,
    label,
    test_size=0.3,
    random_state=11,
)

# Logistic regression capped at 1000 solver iterations, fitted on the
# training labels converted to a flat 1-D array.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X=X_train, y=y_train.to_numpy().flatten())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [242]:
# Predict once and reuse the result for the accuracy figure:
# logreg.score(X_test, y_test) would run predict() on X_test a second time
# and leave y_pred unused in this cell. `metrics` is the sklearn.metrics
# module imported in the training cell above.
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(metrics.accuracy_score(y_test, y_pred)))

Accuracy of logistic regression classifier on test set: 0.86


In [215]:
from sklearn.metrics import confusion_matrix
# Keep the result under its own name. Assigning it back to `confusion_matrix`
# would rebind the imported function to an array, so any later call to
# confusion_matrix(...) without a fresh import would raise a TypeError.
cm_without_selection = confusion_matrix(y_test, y_pred)
print(cm_without_selection)

[[ 71   3   0   0   0   0   0]
 [ 16  58   0   0   0  13   5]
 [  0   0  90   6   1   1   2]
 [  0   0   3 105   0   0   0]
 [  0   0   1   1 100   0   0]
 [  0   6   0   0   0  60   8]
 [  0   3   8   3   0   9  61]]


In [216]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 and support for the held-out predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

                     precision    recall  f1-score   support

Insufficient_Weight       0.82      0.96      0.88        74
      Normal_Weight       0.83      0.63      0.72        92
     Obesity_Type_I       0.88      0.90      0.89       100
    Obesity_Type_II       0.91      0.97      0.94       108
   Obesity_Type_III       0.99      0.98      0.99       102
 Overweight_Level_I       0.72      0.81      0.76        74
Overweight_Level_II       0.80      0.73      0.76        84

           accuracy                           0.86       634
          macro avg       0.85      0.85      0.85       634
       weighted avg       0.86      0.86      0.86       634

