In [1]:
import pandas as pd
from sklearn.svm import SVC
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score
import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels as pk

In [2]:
# Kernells definition
def linearKernell(X,Y):
    return pk(X,Y,metric='linear')

def gaussianKernel(X, Y):
    return pk(X,Y,metric='rbf')

def LaplaceKernel(X, Y):
    return pk(X,Y,metric='laplacian')

def tanhKernel(X, Y): 
    return pk(X,Y,metric='sigmoid')

def polynomial_kernel(X, Y):
    return pk(X,Y,metric='polynomial')

def cosine_kernel(X, Y):
    return pk(X,Y,metric='cosine')

In [3]:
def CrearTablaFrec(Datos,bins):
    frec=[]
    dataCol = pd.cut(Datos,bins=bins)
    for i in range(len(np.unique(dataCol))):
        frec.append(len(np.where(dataCol==np.unique(dataCol)[i])[0]))
        frecacum=np.cumsum(frec)
    TF=pd.DataFrame(np.transpose([np.unique(dataCol),frec,frecacum,frecacum/sum(frec)]),columns=['Intervalos','Frecuencias','frecuencias Acumuladas','Frecuencias Relativas'])
    return TF,dataCol

In [4]:
def datasim(data,bins,sim):
    from tqdm import tqdm
    empT=[]
    for j in tqdm(range(data.shape[1])): 
        emp=[]   
        for i in data.iloc[:,j]:
            emp.append(np.sum(data.iloc[:,j]<=i)/len(data.iloc[:,j]))
        empT.append(emp)
    TablasFrec=[]
    DataInt=[]
    for i in tqdm(range(data.shape[1])):
        T,D=CrearTablaFrec(data.iloc[:,i],bins)
        TablasFrec.append(T)
        DataInt.append(D)
    DataInt=np.transpose(np.array(DataInt))
    XT=[]
    for i in tqdm(range(sim)):
        X=[]
        PosSim=int(np.random.uniform(0,len(data)-1))
        for i in range(data.shape[1]):
            inter=TablasFrec[i][TablasFrec[i]['Frecuencias Relativas']>=np.transpose(np.array(empT))[PosSim][i]].reset_index()['Intervalos'][0]
            X.append(np.random.uniform(low=inter.left, high=inter.right))
        XT.append(X)
    return XT

In [5]:
# df = pd.read_csv("Bajo peso al nacer.csv",sep=';')
# df = df.dropna()
# df = df[df['peso_nacer']!='SD']
# df['peso_nacer']=pd.to_numeric(df['peso_nacer'])
# df['talla_nacer']=df['talla_nacer'].apply(lambda x: float(x.replace(',','.')))
# df['pac_hos_']=(df['pac_hos_']-2)*(-1)

In [6]:
# #splitting the dataset into independent and dependent datasets
# X = df.drop(['pac_hos_'],axis=1).values
# X[:,6]=pd.to_numeric(X[:,6])
# Y = np.array(df['pac_hos_'])

In [7]:
df = pd.read_csv("gender_classification_v7.csv",sep=',')

In [8]:
#splitting the dataset into independent and dependent datasets
X = df.iloc[:,:-1].values
Y = df.loc[:,'gender'].values
Y[Y=='Male']=1
Y[Y=='Female']=0
Y=Y.astype('int')

In [9]:
len(Y[Y==1]),len(Y[Y==0])

(2500, 2501)

In [10]:
#splitting the dataset into training(75%) and testing(25%)
from sklearn.model_selection import train_test_split
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size=0.25)

In [11]:
#Scaling the data(feature scaling)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [12]:
def kernel_bootstrap(X, y, kernells_fun, nb=100):
    from tqdm import tqdm
    F1 = np.zeros([nb, len(kernells_fun)])
    for i in tqdm(range(nb)):

        idx = np.array(list(np.random.choice(np.where(y==1)[0], 250, replace=True)) + list(np.random.choice(np.where(y==0)[0], 250, replace=True)))
    
        X_sample = X[idx]
        y_sample = y[idx]
        X_train, X_test, Y_train, Y_test = train_test_split(X_sample,y_sample,test_size=0.25)

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.fit_transform(X_test)

        for j, kernell in enumerate(kernells_fun):
            clf = SVC(kernel=kernell)
            clf.fit(X_train, Y_train)
            y_pred = clf.predict(X_test)
            F1[i, j]= (f1_score(Y_test, y_pred))   
    return F1

In [13]:
kernells_fun=[gaussianKernel,LaplaceKernel,tanhKernel,polynomial_kernel,cosine_kernel]
F1=kernel_bootstrap(X,Y,kernells_fun,1000)

100%|██████████| 1000/1000 [01:04<00:00, 15.54it/s]


In [14]:
F1_median = (np.median(F1, axis=0))/(np.sum(np.median(F1, axis=0)))
pond=[]
for i in range(len(F1_median)):
    if np.logical_or(np.where(np.sort(F1_median)==F1_median[i])[0]==len(F1_median)-1,np.where(np.sort(F1_median)==F1_median[i])[0]==len(F1_median)-2):
        pond.append(i)
        
# F1_median=F1_median/np.sum(F1_median)

In [26]:
kern=np.zeros(len(F1_median))
kern[pond]=0.5
kern

array([0.5, 0.5, 0. , 0. , 0. ])

In [27]:
F1_median=kern

In [28]:
def combined(X,Y):
    com = F1_median[3]*polynomial_kernel(X, Y)+F1_median[0]*gaussianKernel(X,Y) + F1_median[1] * LaplaceKernel(X, Y) +  F1_median[2] * tanhKernel(X, Y) + F1_median[4]*cosine_kernel(X, Y)
    return com

In [29]:
F1_median

array([0.5, 0.5, 0. , 0. , 0. ])

In [30]:
clf = SVC(kernel=combined)
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
print(classification_report(Y_test, y_pred),f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       650
           1       0.98      0.96      0.97       601

    accuracy                           0.97      1251
   macro avg       0.97      0.97      0.97      1251
weighted avg       0.97      0.97      0.97      1251
 0.9689336691855585


In [31]:
clf = SVC(kernel=gaussianKernel)
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
print(classification_report(Y_test, y_pred),f1_score(Y_test, y_pred))

In [None]:
clf = SVC(kernel=LaplaceKernel)
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
print(classification_report(Y_test, y_pred),f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       650
           1       0.98      0.96      0.97       601

    accuracy                           0.97      1251
   macro avg       0.97      0.97      0.97      1251
weighted avg       0.97      0.97      0.97      1251
 0.9680672268907563


In [17]:
X1_sim=datasim(pd.DataFrame(X[Y==1]),50,1000)
X0_sim=datasim(pd.DataFrame(X[Y==0]),50,1000)

100%|██████████| 8/8 [01:33<00:00, 11.70s/it]
100%|██████████| 8/8 [00:10<00:00,  1.35s/it]
100%|██████████| 1000/1000 [01:07<00:00, 14.76it/s]
100%|██████████| 8/8 [00:23<00:00,  2.96s/it]
100%|██████████| 8/8 [00:05<00:00,  1.37it/s]
100%|██████████| 1000/1000 [00:48<00:00, 20.65it/s]


In [42]:
sc = StandardScaler()
X1_test = sc.fit_transform(X1_sim)
X0_test = sc.fit_transform(X0_sim)

In [43]:
y_pred = clf.predict(np.vstack((X0_test,X1_test)))
f1_score(np.hstack((np.zeros(len(X0_test)),np.ones(len(X1_test)))), y_pred)

0.6867167919799498

In [44]:
y_pred = clf.predict(np.vstack((X0_test,X1_test)))
print(classification_report(np.hstack((np.zeros(len(X0_test)),np.ones(len(X1_test)))), y_pred))
# f1_score(np.zeros(len(y_pred)), y_pred)

              precision    recall  f1-score   support

         0.0       0.80      0.17      0.28      1000
         1.0       0.53      0.96      0.69      1000

    accuracy                           0.56      2000
   macro avg       0.67      0.56      0.48      2000
weighted avg       0.67      0.56      0.48      2000



In [46]:
clf = SVC(kernel=gaussianKernel)
clf.fit(X_train, Y_train)
y_pred = clf.predict(np.vstack((X0_test,X1_test)))
print(classification_report(np.hstack((np.zeros(len(X0_test)),np.ones(len(X1_test)))), y_pred))
# f1_score(np.hstack((np.zeros(len(X0_test)),np.ones(len(X1_test)))), y_pred)

              precision    recall  f1-score   support

         0.0       0.83      0.18      0.30      1000
         1.0       0.54      0.96      0.69      1000

    accuracy                           0.57      2000
   macro avg       0.68      0.57      0.50      2000
weighted avg       0.68      0.57      0.50      2000



In [50]:
Xprueba=np.vstack((np.array(X1_sim),np.array(X0_sim)))
Yprueba=np.vstack((np.ones(len(X1_sim)).reshape(len(X1_sim),1),np.zeros(len(X0_sim)).reshape(len(X0_sim),1)))

In [53]:
#splitting the dataset into training(75%) and testing(25%)
from sklearn.model_selection import train_test_split
X_train, X_test,Y_train, Y_test = train_test_split(Xprueba,Yprueba,test_size=0.25)
#Scaling the data(feature scaling)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)
X_test_or= sc.fit_transform(X)

In [56]:
clf = SVC(kernel=combined)
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.52      0.52      0.52        25
         1.0       0.52      0.52      0.52        25

    accuracy                           0.52        50
   macro avg       0.52      0.52      0.52        50
weighted avg       0.52      0.52      0.52        50



  y_ = column_or_1d(y, warn=True)


Kernel lineal

In [None]:
F1=[]

kernells_fun=[linearKernell,gaussianKernel,LaplaceKernel,tanhKernel,polynomial_kernel,cosine_kernel]

for kernell in kernells_fun:
    clf = SVC(kernel=kernell)
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(Y_test, y_pred))
    F1.append(f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00       818
           1       0.64      1.00      0.78      1456

    accuracy                           0.64      2274
   macro avg       0.82      0.50      0.39      2274
weighted avg       0.77      0.64      0.50      2274

              precision    recall  f1-score   support

           0       0.63      0.27      0.38       818
           1       0.69      0.91      0.78      1456

    accuracy                           0.68      2274
   macro avg       0.66      0.59      0.58      2274
weighted avg       0.67      0.68      0.64      2274

              precision    recall  f1-score   support

           0       0.64      0.35      0.45       818
           1       0.71      0.89      0.79      1456

    accuracy                           0.70      2274
   macro avg       0.67      0.62      0.62      2274
weighted avg       0.68      0.70      0.67      2274

              preci

In [None]:
Ponderaciones=F1/sum(F1)
Ponderaciones

array([0.17265485, 0.17337196, 0.1743782 , 0.13216438, 0.17477576,
       0.17265485])

In [None]:
def combined(X,Y):
    com = Ponderaciones[0]*linearKernell(X,Y) + Ponderaciones[1]*gaussianKernel(X,Y) 
    + Ponderaciones[2]*LaplaceKernel(X, Y) + Ponderaciones[3]*tanhKernel(X, Y)
    + Ponderaciones[4]*polynomial_kernel(X, Y)+ Ponderaciones[5]*cosine_kernel(X, Y)
    return com

In [None]:
clf = SVC(kernel=combined)
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.24      0.35       818
           1       0.69      0.94      0.79      1456

    accuracy                           0.68      2274
   macro avg       0.68      0.59      0.57      2274
weighted avg       0.68      0.68      0.63      2274



In [None]:
def kernel_bootstrap(X, y, kernells_fun, nb=100):
    from tqdm import tqdm
    F1 = np.zeros([nb, len(kernells_fun)])
    for i in tqdm(range(nb)):

        idx = np.array(list(np.random.choice(np.where(y==1)[0], 250, replace=True)) + list(np.random.choice(np.where(y==0)[0], 250, replace=True)))

        X_sample = X[idx]
        y_sample = y[idx]
        X_train, X_test, Y_train, Y_test = train_test_split(X_sample,y_sample,test_size=0.25)

        # clf = SVC(kernel=kernells_fun[0])
        # clf.fit(X_train, Y_train)
        # y_pred = clf.predict(X_test)
        # F1[i, 0]= (f1_score(Y_test, y_pred))

        # clf = SVC(kernel=kernells_fun[1])
        # clf.fit(X_train, Y_train)
        # y_pred = clf.predict(X_test)
        # F1[i, 1]= (f1_score(Y_test, y_pred))

        # clf = SVC(kernel=kernells_fun[2])
        # clf.fit(X_train, Y_train)
        # y_pred = clf.predict(X_test)
        # F1[i, 2]= (f1_score(Y_test, y_pred))

        # clf = SVC(kernel=kernells_fun[3])
        # clf.fit(X_train, Y_train)
        # y_pred = clf.predict(X_test)
        # F1[i, 3]= (f1_score(Y_test, y_pred))

        clf = SVC(kernel=kernells_fun[4])
        clf.fit(X_train, Y_train)
        y_pred = clf.predict(X_test)
        F1[i, 4]= (f1_score(Y_test, y_pred))

        clf = SVC(kernel=kernells_fun[5])
        clf.fit(X_train, Y_train)
        y_pred = clf.predict(X_test)
        F1[i, 5]= (f1_score(Y_test, y_pred))

        # for j, kernell in tqdm(enumerate(kernells_fun)):
        #     clf = SVC(kernel=kernell)
        #     clf.fit(X_train, Y_train)
        #     y_pred = clf.predict(X_test)
        #     F1[i, j]= (f1_score(Y_test, y_pred))   
    return F1

In [None]:
kernells_fun=[linearKernell,gaussianKernel,LaplaceKernel,tanhKernel,polynomial_kernel,cosine_kernel]
kernel_bootstrap(X,Y,kernells_fun,10)

  0%|          | 0/10 [00:00<?, ?it/s]

In [15]:
F1=[]
clf = SVC(kernel="linear")
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
print(classification_report(Y_test, y_pred))
F1.append(f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.01      0.01       769
           1       0.66      1.00      0.80      1505

    accuracy                           0.66      2274
   macro avg       0.83      0.50      0.40      2274
weighted avg       0.78      0.66      0.53      2274



Kernel custom

In [16]:
from sklearn.gaussian_process.kernels import ConstantKernel, RBF

In [17]:
clf_custom = SVC(kernel='rbf')
clf_custom.fit(X_train, Y_train)
y_pred = clf_custom.predict(X_test)
print(classification_report(Y_test, y_pred))
F1.append(f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.29      0.40       769
           1       0.72      0.91      0.80      1505

    accuracy                           0.70      2274
   macro avg       0.67      0.60      0.60      2274
weighted avg       0.69      0.70      0.67      2274



In [18]:
clf_custom = SVC(kernel='poly')
clf_custom.fit(X_train, Y_train)
y_pred = clf_custom.predict(X_test)
print(classification_report(Y_test, y_pred))
F1.append(f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.20      0.31       769
           1       0.70      0.95      0.81      1505

    accuracy                           0.70      2274
   macro avg       0.69      0.58      0.56      2274
weighted avg       0.69      0.70      0.64      2274



In [19]:
clf_custom = SVC(kernel='sigmoid')
clf_custom.fit(X_train, Y_train)
y_pred = clf_custom.predict(X_test)
print(classification_report(Y_test, y_pred))
F1.append(f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.31      0.33      0.32       769
           1       0.65      0.63      0.64      1505

    accuracy                           0.53      2274
   macro avg       0.48      0.48      0.48      2274
weighted avg       0.54      0.53      0.53      2274



In [20]:
Ponderaciones=F1/sum(F1)
Ponderaciones

array([0.26182423, 0.26365142, 0.26476778, 0.20975657])

Kernel combination

In [21]:
F1 = []

In [14]:
clf_custom = SVC(kernel=polynomial_kernel)
clf_custom.fit(X_train, Y_train)
y_pred = clf_custom.predict(X_test)
print(classification_report(Y_test, y_pred))
F1.append(f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.25      0.37       823
           1       0.69      0.95      0.80      1451

    accuracy                           0.69      2274
   macro avg       0.71      0.60      0.58      2274
weighted avg       0.70      0.69      0.64      2274



NameError: name 'F1' is not defined

In [14]:
clf_custom = SVC(kernel=polynomial_kernel)
clf_custom.fit(X_train, Y_train)
y_pred = clf_custom.predict(X_test)
print(classification_report(Y_test, y_pred))
F1.append(f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.25      0.37       814
           1       0.69      0.94      0.80      1460

    accuracy                           0.69      2274
   macro avg       0.69      0.59      0.58      2274
weighted avg       0.69      0.69      0.64      2274



NameError: name 'F1' is not defined

In [52]:
clf_custom = SVC(kernel=gaussianKernel)
clf_custom.fit(X_train, Y_train)
y_pred = clf_custom.predict(X_test)
print(classification_report(Y_test, y_pred))
# F1.append(f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.28      0.38       812
           1       0.69      0.91      0.79      1462

    accuracy                           0.68      2274
   macro avg       0.66      0.59      0.58      2274
weighted avg       0.67      0.68      0.64      2274



In [26]:
clf_custom = SVC(kernel=tanhKernel)
clf_custom.fit(X_train, Y_train)
y_pred = clf_custom.predict(X_test)
print(classification_report(Y_test, y_pred))
F1.append(f1_score(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.31      0.34      0.32       769
           1       0.64      0.61      0.63      1505

    accuracy                           0.52      2274
   macro avg       0.48      0.48      0.48      2274
weighted avg       0.53      0.52      0.52      2274



In [27]:
F1

[0.7971139971139971,
 0.7973509933774834,
 0.8007170600537795,
 0.6268350973028337]

In [41]:
Ponderaciones=np.ones(4)*1/4
Ponderaciones

array([0.25, 0.25, 0.25, 0.25])

In [31]:
def combined(X,Y):
    com = Ponderaciones[0]*polynomial_kernel(X,Y) + Ponderaciones[1]*linearKernell(X,Y) 
    + Ponderaciones[2]*gaussianKernel2(X, Y) + Ponderaciones[3]*tanhKernel(X, Y)
    return com

In [32]:
clf_custom = SVC(kernel=combined)
clf_custom.fit(X_train, Y_train)
y_pred = clf_custom.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.26      0.36       769
           1       0.70      0.90      0.79      1505

    accuracy                           0.69      2274
   macro avg       0.64      0.58      0.58      2274
weighted avg       0.66      0.69      0.65      2274



In [33]:
f1_score(Y_test, y_pred)

0.7925516438754728

In [40]:
def kernel_bootstrap(X, y, nb=100):
    from tqdm import tqdm
    F1 = np.zeros([nb, 4])
    for i in tqdm(range(nb)):
        idx = np.array(list(np.random.choice(np.where(y==1)[0], 250, replace=True)) + list(np.random.choice(np.where(y==0)[0], 250, replace=True)))

        X_sample = X[idx]
        y_sample = y[idx]
        X_train, X_test, Y_train, Y_test = train_test_split(X_sample,y_sample,test_size=0.25)
        
        clf_linear = SVC(kernel='linear')
        clf_linear.fit(X_train, Y_train)
        y_pred_linear = clf_linear.predict(X_test)
        F1[i, 0] = f1_score(Y_test, y_pred_linear)
        
        clf_gaussian = SVC(kernel='rbf')
        clf_gaussian.fit(X_train, Y_train)
        y_pred_gaussian = clf_gaussian.predict(X_test) 
        F1[i, 1] = f1_score(Y_test, y_pred_gaussian)
        
        clf_tan = SVC(kernel='sigmoid')
        clf_tan.fit(X_train, Y_train)
        y_pred_tan = clf_tan.predict(X_test)
        F1[i, 2] = f1_score(Y_test, y_pred_tan)

        clf_pol = SVC(kernel='poly')
        clf_pol.fit(X_train, Y_train)
        y_pred_pol = clf_pol.predict(X_test)
        F1[i, 3] = f1_score(Y_test, y_pred_pol)

    return F1

In [41]:
F1 = pd.DataFrame(kernel_bootstrap(X_train, Y_train, 1000))

100%|██████████| 1000/1000 [00:32<00:00, 30.92it/s]


In [42]:
F1

Unnamed: 0,0,1,2,3
0,0.637681,0.631579,0.623188,0.645161
1,0.560748,0.504505,0.595041,0.581081
2,0.597015,0.651163,0.545455,0.583333
3,0.538462,0.701493,0.527132,0.651852
4,0.566372,0.615385,0.551724,0.625000
...,...,...,...,...
995,0.578512,0.640625,0.538462,0.648649
996,0.550000,0.557377,0.434783,0.635135
997,0.455285,0.526316,0.481481,0.593103
998,0.597015,0.545455,0.519685,0.637500


In [43]:
F1_median = np.median(F1, axis=0)/np.sum(np.median(F1, axis=0))

In [44]:
F1_median

array([0.24953774, 0.26033599, 0.23032625, 0.25980002])

In [45]:
def combined(X,Y):
    com = F1_median[0]*linearKernell(X,Y) + F1_median[1]*gaussianKernel2(X,Y) 
    + F1_median[2]*tanhKernel(X, Y) + F1_median[3]*polynomial_kernel(X, Y)
    return com

In [None]:
lf_intent = SVC(kernel='sigmoid')
lf_intent.fit(X_train, Y_train)
y_pred_tan = lf_intent.predict(X_test)
F1[i, 2] = f1_score(Y_test, y_pred_tan)

ValueError: Expected 2D array, got 1D array instead:
array=[1. 1. 1. ... 1. 1. 1.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [46]:
clf_custom = SVC(kernel=combined)
clf_custom.fit(X_train, Y_train)
y_pred = clf_custom.predict(X_test)
print(classification_report(Y_test, y_pred))

KeyboardInterrupt: 