In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import networkx as nx
import scipy.sparse as sp
from scipy.stats import randint as sp_randint
from sklearn  import preprocessing
from sklearn.preprocessing  import StandardScaler
from sklearn.metrics import accuracy_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from time import time
from sklearn.decomposition import PCA
from sklearn import svm

### train-validation

In [2]:
# Train/validation split: read both feature tables and report their row counts.
x_train_final = pd.read_csv("features/X_train_train_10.csv")
x_test_final = pd.read_csv("features/X_train_valid_10.csv")
print("train： %s, test： %s" % (x_train_final.shape[0], x_test_final.shape[0]))

# Pull out the 'link' target before it is removed from the feature frames.
y_train_final, y_test_final = x_train_final['link'], x_test_final['link']

# These four columns are not passed to the models as features.
non_feature_cols = ['node1', 'node2', 'link', 'shortest_path']
x_train_final = x_train_final.drop(columns=non_feature_cols)
x_test_final = x_test_final.drop(columns=non_feature_cols)

train： 22910, test： 5728


### train-test

In [3]:
# Train/test split: read both feature tables and report their row counts.
x_train_final2 = pd.read_csv("features/X_train_10.csv")
x_test_final2 = pd.read_csv("features/X_test_10.csv")
print("train： %s, test： %s" % (x_train_final2.shape[0], x_test_final2.shape[0]))

# Pull out the 'link' target before it is removed from the feature frames.
y_train_final2, y_test_final2 = x_train_final2['link'], x_test_final2['link']

# These four columns are not passed to the models as features.
non_feature_cols = ['node1', 'node2', 'link', 'shortest_path']
x_train_final2 = x_train_final2.drop(columns=non_feature_cols)
x_test_final2 = x_test_final2.drop(columns=non_feature_cols)

train： 28638, test： 12276


### train-predict

In [4]:
# Full training table plus the table to predict on; report their row counts.
x_train_final3 = pd.read_csv("features/train_all_10.csv")
x_pred_final = pd.read_csv("features/predict_all_10.csv")
print("train： %s, predict： %s" % (x_train_final3.shape[0], x_pred_final.shape[0]))

# Only the training table's 'link' column is kept as a target.
y_train_final3 = x_train_final3['link']

# The same four columns are removed from both tables before modelling.
non_feature_cols = ['node1', 'node2', 'link', 'shortest_path']
x_train_final3 = x_train_final3.drop(columns=non_feature_cols)
x_pred_final = x_pred_final.drop(columns=non_feature_cols)

train： 40914, predict： 10231


# model (random forest)

In [5]:
# Randomized hyper-parameter search for the random forest, timed end to end.
start_time = time()

# Search space: n_estimators drawn from randint(100, 150) and
# max_depth from randint(10, 20).
param_dist = dict(
    n_estimators=sp_randint(100, 150),
    max_depth=sp_randint(10, 20),
)

clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=25)
rf_random = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=5,            # number of sampled parameter settings
    cv=10,               # 10-fold cross-validation per setting
    scoring='accuracy',
    random_state=25,
)
rf_random.fit(x_train_final, y_train_final)

# One mean cross-validated accuracy per sampled setting, then the wall time.
print('mean test scores', '\n', rf_random.cv_results_['mean_test_score'])
elapsed_seconds = time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

mean test scores 
 [0.98092536 0.98118725 0.98101266 0.98096901 0.9812309 ]
--- 62.24066996574402 seconds ---


In [6]:
# Continue with the estimator chosen by the search. This is the search's own
# object (not a copy), so later clf.fit(...) calls refit that same object.
clf = rf_random.best_estimator_
print(clf)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=14, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=105,
                       n_jobs=-1, oob_score=True, random_state=25, verbose=0,
                       warm_start=False)


In [7]:
# Fit on the training split and show the out-of-bag score.
clf.fit(x_train_final, y_train_final)
print(clf.oob_score_)

# Accuracy on the held-out validation split.
y_pred = clf.predict(x_test_final)
validation_accuracy = accuracy_score(y_test_final, y_pred)
print("Validation accuracy score: ", validation_accuracy)

0.9813182016586643
Validation accuracy score:  0.9725907821229051


In [8]:
# Refit the same estimator on the train/test training table and show its
# out-of-bag score.
clf.fit(x_train_final2, y_train_final2)
print(clf.oob_score_)

# Accuracy on the test table.
y_pred = clf.predict(x_test_final2)
testing_accuracy = accuracy_score(y_test_final2, y_pred)
print('Testing accuracy score: ', testing_accuracy)

0.9804804804804805
Testing accuracy score:  0.9625285109156076


In [9]:
# Final refit on the full training table, with its out-of-bag score.
clf.fit(x_train_final3, y_train_final3)
print(clf.oob_score_)

# Predictions for the rows that have to be answered.
y_pred = clf.predict(x_pred_final)

0.9834042137165763


In [10]:
# Total of the predicted labels, i.e. how many rows were predicted as 1.
y_pred.sum()

5580.0

In [11]:
# Load the example answer file, overwrite its 'ans' column with the
# random-forest predictions, and save the result.
answer = pd.read_csv("ans_example.csv")
answer['ans'] = y_pred.astype(int)  # predictions are floats; store integers
answer.to_csv("ans_30_rf_node2vec.csv", index=False)

# model (kernel SVM using PCA feature extraction)

In [12]:
def train_test_std(x_train_final, x_test_final):
    """Standardise two feature tables using statistics from the first one.

    A ``StandardScaler`` is fitted on ``x_train_final`` only and then applied
    to both frames, so ``x_test_final`` is scaled with the training statistics.

    Parameters
    ----------
    x_train_final : pandas.DataFrame
        Feature table the scaler is fitted on.
    x_test_final : pandas.DataFrame
        Feature table scaled with the scaler fitted on ``x_train_final``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train_final_std, x_test_final_std)``: new frames holding the
        scaled values, labelled with the columns of their respective inputs.
    """
    scaler = preprocessing.StandardScaler().fit(x_train_final)

    # Pass the DataFrames themselves to transform(), not ``.values``. The
    # scaler was fitted on a DataFrame, and scikit-learn >= 1.0 warns
    # ("X does not have valid feature names") when such an estimator is later
    # given a bare array. Keeping the labels also lets scikit-learn check that
    # both frames carry the same columns as the fitted data.
    train_scaled = scaler.transform(x_train_final)
    test_scaled = scaler.transform(x_test_final)

    x_train_final_std = pd.DataFrame(train_scaled, columns=x_train_final.columns)
    x_test_final_std = pd.DataFrame(test_scaled, columns=x_test_final.columns)
    return x_train_final_std, x_test_final_std

##### train-validation

In [13]:
# Standardise the train/validation features; the scaler is fitted on the
# training frame only.
scaled_frames = train_test_std(x_train_final, x_test_final)
x_train_final_std, x_test_final_std = scaled_frames

In [14]:
# PCA variant: reduce the standardised features to 6 components (learned on
# the training frame), then fit an RBF-kernel SVM on the reduced data.
pca = PCA(n_components=6)
x_train_pca = pca.fit_transform(x_train_final_std)
x_test_pca = pca.transform(x_test_final_std)

clf = svm.SVC(kernel='rbf', gamma=2).fit(x_train_pca, y_train_final)
svm_pred = clf.predict(x_test_pca)

validation_accuracy = accuracy_score(y_test_final, svm_pred)
print('Validation accuracy score: ', validation_accuracy)

Validation accuracy score:  0.9626396648044693


In [15]:
# Standardised-features variant: same RBF-kernel SVM, no PCA step.
clf = svm.SVC(kernel='rbf', gamma=2).fit(x_train_final_std, y_train_final)
svm_pred = clf.predict(x_test_final_std)

validation_accuracy = accuracy_score(y_test_final, svm_pred)
print('Validation accuracy score: ', validation_accuracy)

Validation accuracy score:  0.9612430167597765


##### train-test

In [16]:
# Standardise the train/test features; this rebinds x_train_final_std and
# x_test_final_std, replacing the train/validation versions.
scaled_frames = train_test_std(x_train_final2, x_test_final2)
x_train_final_std, x_test_final_std = scaled_frames

In [17]:
# PCA variant on the train/test data: 6 components learned on the training
# frame, followed by an RBF-kernel SVM.
pca = PCA(n_components=6)
x_train_pca = pca.fit_transform(x_train_final_std)
x_test_pca = pca.transform(x_test_final_std)

clf = svm.SVC(kernel='rbf', gamma=2).fit(x_train_pca, y_train_final2)
svm_pred = clf.predict(x_test_pca)

test_accuracy = accuracy_score(y_test_final2, svm_pred)
print('Test accuracy score: ', test_accuracy)

Test accuracy score:  0.9625285109156076


In [18]:
# Standardised-features variant on the train/test data (no PCA step).
clf = svm.SVC(kernel='rbf', gamma=2).fit(x_train_final_std, y_train_final2)
svm_pred = clf.predict(x_test_final_std)

test_accuracy = accuracy_score(y_test_final2, svm_pred)
print('Test accuracy score: ', test_accuracy)

Test accuracy score:  0.9625285109156076


##### train_all-predict

In [19]:
# Standardise the full training table and the prediction table; the scaler is
# fitted on the training table only.
scaled_frames = train_test_std(x_train_final3, x_pred_final)
x_train_final_std, x_pred_final_std = scaled_frames

In [20]:
# PCA variant for the final predictions: 6 components learned on the full
# training table, followed by an RBF-kernel SVM.
pca = PCA(n_components=6)
x_train_pca = pca.fit_transform(x_train_final_std)
x_pred_pca = pca.transform(x_pred_final_std)

clf = svm.SVC(kernel='rbf', gamma=2).fit(x_train_pca, y_train_final3)
svm_pred = clf.predict(x_pred_pca)

# No labels to score against here, so report the size and the number of 1s.
n_rows = len(svm_pred)
n_positive = sum(svm_pred)
print("# of rows: %d, # of 1 predictions: %d" % (n_rows, n_positive))

# of rows: 10231, # of 1 predictions: 5544


In [21]:
# Standardised-features variant for the final predictions (no PCA step).
# This overwrites svm_pred from the PCA cell above.
clf = svm.SVC(kernel='rbf', gamma=2).fit(x_train_final_std, y_train_final3)
svm_pred = clf.predict(x_pred_final_std)

n_rows = len(svm_pred)
n_positive = sum(svm_pred)
print("# of rows: %d, # of 1 predictions: %d" % (n_rows, n_positive))

# of rows: 10231, # of 1 predictions: 5626


In [22]:
# Load the example answer file, overwrite its 'ans' column with the SVM
# predictions, and save the result.
# NOTE: svm_pred holds the output of whichever SVM cell ran last; in notebook
# order that is the model trained on all standardised features, not the PCA one.
answer = pd.read_csv("ans_example.csv")
answer['ans'] = svm_pred.astype(int)  # predictions are floats; store integers
answer.to_csv("ans_31_ksvm.csv", index=False)