In [1]:
import pandas as pd
from statistics import mean
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from imblearn.under_sampling import RandomUnderSampler

from sklearn import svm

from sklearn.metrics import roc_auc_score

#from sklearn.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [2]:
# Read the training table from the CSV in the working directory and preview
# its first five rows.
train = pd.read_csv(filepath_or_buffer="train.csv")
train.head(n=5)

Unnamed: 0,Pair,Source,Sink,NCA,Exist,CN,AA,RA,JC,PA,KI,PR_s1,PR_s2
0,"(0, 356)",0,356,14,1,7,2.899858,0.628968,0.7,72,0.079962,0.00022,0.00024
1,"(0, 1236)",0,1236,14,1,6,2.471649,0.531746,0.428571,96,0.075137,0.00022,0.000302
2,"(356, 1236)",356,1236,14,1,7,2.812086,0.587302,0.5,108,0.074232,0.00024,0.000302
3,"(0, 1655)",0,1655,9,1,7,2.976054,0.668651,0.466667,112,0.083302,0.00022,0.000376
4,"(0, 1797)",0,1797,4,1,7,2.899858,0.628968,0.7,72,0.081045,0.00022,0.000245


In [3]:
# Columns used as model inputs; 'Exist' is the 0/1 label to predict.
feature_columns = ['NCA', 'CN', 'AA', 'RA', 'JC', 'PA', 'KI', 'PR_s1', 'PR_s2']
X = train[feature_columns]
y = train['Exist']

In [4]:
# Balance the dataset by randomly undersampling the majority class.
# A fixed random_state pins which majority-class rows are kept, so the
# resampled data (and every score computed from it) is reproducible after
# a kernel restart.
rus = RandomUnderSampler(sampling_strategy="majority", random_state=42)
X, y = rus.fit_resample(X, y)

In [5]:
# Check the per-class row counts after undersampling.
y.value_counts()

0    16034
1    16034
Name: Exist, dtype: int64

In [6]:
# Perform 5-fold cross validation of an SVM classifier and collect the
# ROC AUC of each held-out fold in `auc`.
n_splits = 5
# Seed the shuffle so that fold membership is identical on every run.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# probability=True is required for predict_proba; the probability estimates
# involve random shuffling inside SVC, so the classifier is seeded as well.
# max_iter caps the number of solver iterations.
clf = svm.SVC(probability=True, max_iter=15000, random_state=42)
auc = []
for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, so rows are selected with .iloc on
    # both X and y instead of relying on y's index labels matching positions.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Refit on this fold's training rows; column 1 of the result is the
    # predicted probability of the positive class (Exist == 1).
    y_pred = clf.fit(X_train, y_train).predict_proba(X_test)
    auc.append(roc_auc_score(y_test, y_pred[:, 1]))

In [7]:
# Inspect the predicted probabilities. y_pred is reassigned on every loop
# iteration above, so this shows the last cross-validation fold only.
y_pred

array([[0.81721598, 0.18278402],
       [0.7936248 , 0.2063752 ],
       [0.04286627, 0.95713373],
       ...,
       [0.679241  , 0.320759  ],
       [0.8007747 , 0.1992253 ],
       [0.04280751, 0.95719249]])

In [8]:
# Summarise the cross validation by averaging the per-fold AUC scores.
mean_auc = mean(auc)
print("The average AUC is:", mean_auc)

The average AUC is: 0.8659121093519531


In [9]:
# feature selection
# sbs = SFS(clf, k_features=4, forward=False, floating=False, cv=0)
# sbs.fit(X, y)
# sbs.k_feature_names_

# output: ('NCA', 'CN', 'AA', 'RA')

In [10]:
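# fit a model with selected features
# NOTE: if this cell is re-enabled, take the selected columns from the
# undersampled X rather than from train; train still has all rows, so its
# rows would no longer line up with the resampled y.
# X = train[['NCA', 'CN', 'AA', 'RA']]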

# auc=[]
# for train_index, test_index in kf.split(y):
    # split the train data and test data
    #X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    #y_train, y_test = y[train_index], y[test_index]
    
    #y_pred = clf.fit(X_train, y_train).predict_proba(X_test)
    #auc.append(roc_auc_score(y_test, y_pred[:,1]))

In [11]:
#y_pred

In [12]:
#print("The average AUC after feature selection is:", mean(auc))