In [None]:
import ast
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split


In [None]:
# Load the labelled training pairs and carve out a 70/30 train/validation split.
feature_cols = ['common_interest', 'common_friends', 'distance', 'source_degree', 'sink_degree']

# set_index returns a new frame; the result has to be kept, otherwise 'Id'
# never becomes the index (the bare call was a silent no-op).
train_model = pd.read_csv('rank_train_set.csv').set_index('Id')

# Working frame: the five features plus the target column.
train_f5 = train_model[feature_cols + ['label']].copy()
X = train_f5[feature_cols].to_numpy()
y = train_f5['label'].to_numpy()

# Report shapes only; printing the full arrays floods the cell output.
print('X shape:', X.shape, '| y shape:', y.shape)

# Fixed random_state keeps the split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Baseline: logistic regression with L2 regularization, fitted on the training
# split and checked on the held-out 30% validation split.
# accuracy_score must be imported from sklearn.metrics in the notebook's import
# cell; without it this cell raises NameError on a fresh kernel.
clf = LogisticRegression(penalty='l2')
clf.fit(X_train, y_train)

# Hard 0/1 predictions on the validation split.
y_pred = clf.predict(X_test)
print('validation accuracy is', accuracy_score(y_test, y_pred))

In [None]:
# Read the public test file, then keep just the five model inputs, listed in
# the same order as the training feature columns.
df = pd.read_csv('rank_test_public.csv')
test = df[[
    'common_interest',
    'common_friends',
    'distance',
    'source_degree',
    'sink_degree',
]]

In [None]:
# Score the test set with the logistic-regression model and write the
# per-row probabilities to a submission CSV.

# predict_proba is evaluated once and reused below. The model was fitted on a
# plain ndarray, so the test frame is converted to an ndarray as well.
proba_lr = clf.predict_proba(test.to_numpy())
prob = list(proba_lr)

# Column 1 is the probability the notebook treats as "label 1".
l_lr = list(proba_lr[:, 1])

# find the number of positive edges at the 0.5 threshold (an earlier run gave 1027)
n_logistic = int((proba_lr[:, 1] >= 0.5).sum())
print(n_logistic, 'out of', len(test), 'are classified as label 1')

# Submission Ids are 1-based row positions. They are sized from the data so the
# cell keeps working if the test file is not exactly 2000 rows long.
index = list(range(1, len(test) + 1))
data = {'Id': index, 'Predicted': l_lr}
logistic_try_v1 = pd.DataFrame(data).set_index('Id')
logistic_try_v1.to_csv('logistic_try_v1.csv')


In [None]:
# Gradient Boosting classifier with default hyper-parameters (seeded for
# reproducibility), validated on the hold-out split and applied to the test set.
# GradientBoostingClassifier and accuracy_score come from the notebook's
# import cell.
GB = GradientBoostingClassifier(random_state=10)
GB.fit(X_train, y_train)
y_pred_GB = GB.predict(X_test)

print('validation accuracy is', accuracy_score(y_test, y_pred_GB))

# predict_proba is evaluated once and reused. The model was fitted on a plain
# ndarray, so the test frame is converted to an ndarray as well.
proba_GB = GB.predict_proba(test.to_numpy())
prob_GB = list(proba_GB)

# Column 1 is the probability the notebook treats as "label 1".
l_GB = list(proba_GB[:, 1])

# Rows at or above the 0.5 threshold (an earlier run gave 1077).
n_GB = int((proba_GB[:, 1] >= 0.5).sum())
print(n_GB, 'of', len(test), 'are classified as label 1')

# Submission Ids are 1-based row positions, sized from the data rather than a
# hard-coded 2000.
index = list(range(1, len(test) + 1))
data_GB = {'Id': index, 'Predicted': l_GB}
GB_v1 = pd.DataFrame(data_GB).set_index('Id')
# Writing the file is intentionally left disabled, as in the original cell:
#GB_v1.to_csv('GB_v1.csv')
    

In [None]:
# tuning GB parameters, step 1: grid-search n_estimators (10, 20, ..., 80) with
# 10-fold cross-validation scored by ROC AUC, other hyper-parameters held fixed.
# GridSearchCV and GradientBoostingClassifier come from the notebook's import cell.

# Column names used by every grid-search cell in this notebook.
predictors = ['common_interest', 'common_friends', 'distance', 'source_degree', 'sink_degree']
target = ['label']

param_test1 = {'n_estimators': range(10, 81, 10)}

# The `iid` argument was removed from GridSearchCV in scikit-learn 0.24 and
# passing it now raises TypeError, so it is no longer supplied.
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        min_samples_split=150,
        min_samples_leaf=50,
        max_depth=5,
        max_features='sqrt',
        random_state=10,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    cv=10,
    refit=True,
)

# train_f5[target] is a one-column frame; ravel() hands sklearn the 1-D label
# vector it expects instead of an (n, 1) column.
gsearch1.fit(train_f5[predictors], train_f5[target].values.ravel())

In [None]:
# Winning n_estimators from the first search, with its cross-validated ROC AUC.
(gsearch1.best_params_, gsearch1.best_score_)

In [None]:
# Step 2: grid-search max_depth (1..5) together with min_samples_split
# (20, 40, ..., 240), keeping n_estimators=40.
# min_samples_split normally is 0.5-1% of total values
param_test2 = {'max_depth': range(1, 6, 1), 'min_samples_split': range(20, 251, 20)}

# The `iid` argument was removed from GridSearchCV in scikit-learn 0.24 and
# passing it now raises TypeError, so it is no longer supplied.
gsearch2 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=40,
        max_features='sqrt',
        random_state=10,
    ),
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    cv=10,
    refit=True,
)

# ravel() turns the one-column label frame into the 1-D vector sklearn expects.
gsearch2.fit(train_f5[predictors], train_f5[target].values.ravel())



In [None]:
# Winning max_depth / min_samples_split pair, with its cross-validated ROC AUC.
(gsearch2.best_params_, gsearch2.best_score_)

In [None]:
# Step 3: grid-search min_samples_split (20, 40, ..., 240) together with
# min_samples_leaf (1, 11, ..., 61) at max_depth=5, with row subsampling at 0.8.
param_test3 = {'min_samples_split': range(20, 251, 20), 'min_samples_leaf': range(1, 71, 10)}

# The `iid` argument was removed from GridSearchCV in scikit-learn 0.24 and
# passing it now raises TypeError, so it is no longer supplied.
gsearch3 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=40,
        max_depth=5,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=10,
)

# ravel() turns the one-column label frame into the 1-D vector sklearn expects.
gsearch3.fit(train_f5[predictors], train_f5[target].values.ravel())


In [None]:
# Winning min_samples_split / min_samples_leaf pair, with its cross-validated ROC AUC.
(gsearch3.best_params_, gsearch3.best_score_)

In [None]:
# Take the best configuration found by gsearch3, fit it on the training split,
# check it on the hold-out split, then score the test set.

# clone() returns an unfitted copy with the same hyper-parameters, so the
# estimator stored on gsearch3 is not refitted in place on the smaller split.
GB_tune = clone(gsearch3.best_estimator_)
GB_tune.fit(X_train, y_train)
y_pred_GB_tune = GB_tune.predict(X_test)

# The original line printed a ROC AUC under the label "accuracy". Report both
# metrics, each under its own name.
print('validation accuracy is', accuracy_score(y_test, y_pred_GB_tune))
# ROC AUC is a ranking metric, so it is computed from the column-1
# probabilities rather than from hard 0/1 predictions.
print('validation ROC AUC is', roc_auc_score(y_test, GB_tune.predict_proba(X_test)[:, 1]))

# predict_proba is evaluated once and reused. The model was fitted on a plain
# ndarray, so the test frame is converted to an ndarray as well.
proba_GB_tune = GB_tune.predict_proba(test.to_numpy())
prob_GB_tune = list(proba_GB_tune)

# Column 1 is the probability the notebook treats as "label 1".
l_GB_tune = list(proba_GB_tune[:, 1])

# Rows at or above the 0.5 threshold (an earlier run gave 1032).
n_GB_tune = int((proba_GB_tune[:, 1] >= 0.5).sum())
print(n_GB_tune, 'of', len(test), 'are classified as label 1')

# Submission Ids are 1-based row positions, sized from the data rather than a
# hard-coded 2000.
index = list(range(1, len(test) + 1))
data_GB_tune = {'Id': index, 'Predicted': l_GB_tune}
# NOTE(review): this rebinds GB_v1, replacing the frame built from the untuned
# model; the name is kept so any later cell that reads GB_v1 is unaffected.
GB_v1 = pd.DataFrame(data_GB_tune).set_index('Id')
# Writing the file is intentionally left disabled, as in the original cell:
#GB_v1.to_csv('GB_v2.csv')

In [None]:
# Step 4: grid-search max_features over 1, 3 and 5 with the other
# hyper-parameters held fixed.
param_test4 = {'max_features': range(1, 6, 2)}

# The `iid` argument was removed from GridSearchCV in scikit-learn 0.24 and
# passing it now raises TypeError, so it is no longer supplied.
gsearch4 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=40,
        max_depth=5,
        min_samples_split=20,
        min_samples_leaf=1,
        random_state=10,
    ),
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=10,
)

# ravel() turns the one-column label frame into the 1-D vector sklearn expects.
gsearch4.fit(train_f5[predictors], train_f5[target].values.ravel())


In [None]:
# Winning max_features value, with its cross-validated ROC AUC.
(gsearch4.best_params_, gsearch4.best_score_)

In [None]:
########### try this GB ####################
# Hand-picked configuration, fitted on the training split, checked on the
# hold-out split and then applied to the test set.
# NOTE(review): min_samples_split=140 differs from the 20 used in the
# max_features search cell -- confirm which value is intended.
gbm_tuned_1 = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=40,
    max_depth=5,
    min_samples_split=140,
    min_samples_leaf=1,
    random_state=10,
    max_features=5,
)
gbm_tuned_1.fit(X_train, y_train)
y_pred_GB_1 = gbm_tuned_1.predict(X_test)

# The original line printed a ROC AUC under the label "accuracy". ROC AUC is a
# ranking metric, so it is computed from the column-1 probabilities rather
# than from hard 0/1 predictions, and labelled as what it is.
print('validation ROC AUC is', roc_auc_score(y_test, gbm_tuned_1.predict_proba(X_test)[:, 1]))

# predict_proba is evaluated once and reused. The model was fitted on a plain
# ndarray, so the test frame is converted to an ndarray as well.
proba_gbm_1 = gbm_tuned_1.predict_proba(test.to_numpy())
prob_gbm_1 = list(proba_gbm_1)

# Rows at or above the 0.5 threshold (an earlier run gave 1055).
n_gbm_1 = int((proba_gbm_1[:, 1] >= 0.5).sum())
print(n_gbm_1, 'of', len(test), 'are classified as label 1')