In [42]:
import pandas as pd
from statistics import mean
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold

from imblearn.under_sampling import RandomUnderSampler

from sklearn import svm

from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [43]:
# Load the labelled training pairs from train.csv and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a row
# index that was saved into the CSV — confirm, and consider index_col=0 if so.
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,Pair,Source,Sink,NCA,Exist,CN,AA,RA,JC,PA,...,n2v_11,n2v_12,n2v_13,n2v_14,n2v_15,n2v_16,n2v_17,n2v_18,n2v_19,n2v_20
0,"(0, 356)",0,356,14,1,7,2.899858,0.628968,0.7,72,...,-0.496998,-1.134546,0.00798,0.987995,0.972931,0.094051,-0.802513,-0.525578,-0.045362,-0.167199
1,"(0, 1236)",0,1236,14,1,6,2.471649,0.531746,0.428571,96,...,-0.416841,-1.014621,-0.016673,1.101111,0.970056,-0.027354,-0.736977,-0.429805,-0.006636,-0.157095
2,"(356, 1236)",356,1236,14,1,7,2.812086,0.587302,0.5,108,...,-0.38117,-1.030291,0.043748,1.10478,0.930035,-0.076307,-0.804798,-0.361534,0.00762,-0.119622
3,"(0, 1655)",0,1655,9,1,7,2.976054,0.668651,0.466667,112,...,-0.502501,-0.887524,-0.000751,0.69217,0.94091,0.338876,-0.834912,-0.660168,-0.042238,-0.252496
4,"(0, 1797)",0,1797,4,1,7,2.899858,0.628968,0.7,72,...,-0.535481,-1.112927,-0.095884,0.932938,0.996588,0.147873,-0.821845,-0.635077,-0.034613,-0.177429


## Similarity-based Features

In [44]:
# Names of the similarity-based feature columns used as model inputs.
sbF = [
    'NCA', 'CN', 'AA', 'RA', 'JC',
    'PA', 'KI', 'PR_s1', 'PR_s2',
]
# Feature matrix and the 'Exist' target, both taken from the training frame.
X, y = train[sbF], train['Exist']

In [45]:
# Balance the dataset by randomly undersampling the majority class.
# The sampler is seeded so the same rows are kept on every run; without a seed
# the balanced set (and every score computed from it) changes on each execution.
rus = RandomUnderSampler(sampling_strategy="majority", random_state=0)
X, y = rus.fit_resample(X, y)

In [46]:
# Class counts after resampling, to check that the two classes are balanced.
y.value_counts()

0    16036
1    16036
Name: Exist, dtype: int64

In [34]:
# Evaluate the SVM on 5 stratified random train/test splits. This is a
# shuffle-split, not k-fold: with the default sizes each split holds out 10% of
# the rows, and the held-out sets of different iterations may overlap.
n_splits = 5
sss = StratifiedShuffleSplit(n_splits=n_splits, random_state=0)

# probability=True enables predict_proba; the internal probability calibration
# shuffles the data, so the estimator is seeded to keep scores reproducible.
clf = svm.SVC(probability=True, max_iter=20000, random_state=0)
auc = []
for train_index, test_index in sss.split(X, y):
    # split() yields positional indices, so rows are selected by position on
    # both X and y (label-based y[...] is only correct while y happens to carry
    # a default 0..n-1 index).
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Refit on this split and score it with ROC AUC on the class-1 probability.
    y_pred = clf.fit(X_train, y_train).predict_proba(X_test)
    auc.append(roc_auc_score(y_test, y_pred[:, 1]))

In [35]:
# Predicted class probabilities from the most recent fit only
# (y_pred is overwritten on every iteration of the evaluation loop).
y_pred

array([[0.74709956, 0.25290044],
       [0.80173427, 0.19826573],
       [0.11428419, 0.88571581],
       ...,
       [0.51375145, 0.48624855],
       [0.08688966, 0.91311034],
       [0.40715006, 0.59284994]])

In [36]:
# Report the mean ROC AUC over the scores collected in `auc`.
average_auc = mean(auc)
print("The average AUC is:", average_auc)

The average AUC is: 0.8562133864217263


## Node2Vec Features

In [47]:
# Node2Vec feature columns n2v_1 .. n2v_d.
d = 20
n2vF = ["n2v_" + str(i + 1) for i in range(d)]
X = train[n2vF]
y = train['Exist']

# Balance the dataset by randomly undersampling the majority class.
# The sampler is seeded so the same rows are kept on every run; without a seed
# the balanced set (and every score computed from it) changes on each execution.
rus = RandomUnderSampler(sampling_strategy="majority", random_state=0)
X, y = rus.fit_resample(X, y)

# Class counts after resampling, to check that the two classes are balanced.
y.value_counts()

0    16036
1    16036
Name: Exist, dtype: int64

In [48]:
# Fit and evaluate a model with all Node2Vec features on 3 stratified random
# train/test splits (shuffle-split, not k-fold: with the default sizes each
# split holds out 10% of the rows).
# NOTE(review): a near-perfect AUC here would be worth a leakage check, e.g.
# whether the n2v_* embeddings were learned on a graph that already contained
# the edges being predicted — confirm how these features were built.
n_splits = 3
sss = StratifiedShuffleSplit(n_splits=n_splits, random_state=0)

# probability=True enables predict_proba; the internal probability calibration
# shuffles the data, so the estimator is seeded to keep scores reproducible.
clf = svm.SVC(probability=True, max_iter=20000, random_state=0)
auc = []
for train_index, test_index in sss.split(X, y):
    # split() yields positional indices, so rows are selected by position on
    # both X and y (label-based y[...] is only correct while y happens to carry
    # a default 0..n-1 index).
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Refit on this split and score it with ROC AUC on the class-1 probability.
    y_pred = clf.fit(X_train, y_train).predict_proba(X_test)
    auc.append(roc_auc_score(y_test, y_pred[:, 1]))

In [49]:
# Predicted class probabilities from the most recent fit only
# (y_pred is overwritten on every iteration of the evaluation loop).
y_pred

array([[9.99999672e-01, 3.28326624e-07],
       [3.62779907e-07, 9.99999637e-01],
       [9.99999900e-01, 1.00000010e-07],
       ...,
       [3.88199098e-08, 9.99999961e-01],
       [9.99999900e-01, 1.00000010e-07],
       [1.33994639e-05, 9.99986601e-01]])

In [50]:
# Report the mean ROC AUC over the scores collected in `auc`.
average_auc = mean(auc)
print("The average AUC is:", average_auc)

The average AUC is: 0.9985653851655152


## Prediction on test-public

In [53]:
# Score the test pairs with the SVM refitted on the full balanced training set,
# using the Node2Vec features.
test = pd.read_csv("test.csv", index_col='Id')
X_test = test[n2vF]

# X and y are rebound by several cells in this notebook; fail fast (before the
# expensive fit) if X does not currently hold the Node2Vec columns, so that the
# training and test features are guaranteed to line up.
assert list(X.columns) == n2vF, "X does not hold the Node2Vec features; re-run the Node2Vec cells first"

y_pred = clf.fit(X, y).predict_proba(X_test)
# Second column of predict_proba: probability of the class labelled 1.
y_pred[:, 1]

array([1.04126208e-02, 1.00000010e-07, 1.00000010e-07, ...,
       5.35222658e-05, 9.99997160e-01, 1.00000010e-07])

In [54]:
# Attach the class-1 probability to the test frame as the "Predicted" column
# and preview the result.
positive_proba = y_pred[:, 1]
test["Predicted"] = positive_proba
test.head()

Unnamed: 0_level_0,Source,Sink,Pair,CN,AA,RA,JC,PA,KI,PR_s1,...,n2v_12,n2v_13,n2v_14,n2v_15,n2v_16,n2v_17,n2v_18,n2v_19,n2v_20,Predicted
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,2917,"(0, 2917)",0,0.0,0.0,0.0,56,-0.000552,0.00022,...,-0.989594,0.31825,1.025849,1.072989,-0.084099,-0.696392,-0.338227,-0.114178,-0.235864,0.01041262
2,0,2956,"(0, 2956)",0,0.0,0.0,0.0,24,-8.8e-05,0.00022,...,-1.120674,0.378106,0.360288,1.130361,0.871391,-0.439986,-0.701735,-0.003102,-0.748156,1e-07
3,1,4038,"(1, 4038)",0,0.0,0.0,0.0,496,0.009896,0.000291,...,0.112067,0.327012,0.046739,0.370075,0.209226,0.686743,-1.708898,-0.471961,-0.802962,1e-07
4,2,1848,"(2, 1848)",2,1.24267,0.4,0.08,72,0.057307,0.000549,...,-0.774672,-0.249583,0.097737,1.224068,0.025562,-0.401903,-0.879856,-1.382236,-1.114386,0.9961261
5,3,513,"(3, 513)",0,0.0,0.0,0.0,391,-0.138788,0.000362,...,-0.071756,0.850157,0.056224,0.033808,0.243672,0.351791,-0.461942,-0.055407,-0.76175,1e-07


In [55]:
# Write only the "Predicted" column (plus the frame's index) to the result CSV.
submission = test[['Predicted']]
submission.to_csv('result_SVM.csv')

In [12]:
# feature selection (disabled): sequential backward selection down to 4 features
# sbs = SFS(clf, k_features=4, forward=False, floating=False, cv=0)
# sbs.fit(X, y)
# sbs.k_feature_names_

# output: ('NCA', 'CN', 'AA', 'RA')

In [13]:
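# fit a model with selected features (disabled; `kf` used below is not defined in the visible cells and must be created before re-enabling)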
# X = train[['NCA', 'CN', 'AA', 'RA']]

# auc=[]
# for train_index, test_index in kf.split(y):
    # split the train data and test data
    #X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    #y_train, y_test = y[train_index], y[test_index]
    
    #y_pred = clf.fit(X_train, y_train).predict_proba(X_test)
    #auc.append(roc_auc_score(y_test, y_pred[:,1]))

In [14]:
#y_pred

In [15]:
#print("The average AUC after feature selection is:", mean(auc))