In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import random
import lightgbm as lgbm
from tqdm import tqdm
from sklearn.model_selection import train_test_split


In [2]:
result_list = []
sink_list = []
edges = {}
with open("train.txt", 'r') as f:
    for data in f:
        converted_data = data.split()
        
        for i in range(len(converted_data)-1):
            result_list.append([converted_data[0], converted_data[i+1]])
            edges[(converted_data[0], converted_data[i+1])] = 1
tw_df = pd.DataFrame(result_list, columns=["Source", "Sink"])

In [3]:
pos_sample = tw_df.sample(n = 200000)
tw_df_temp = tw_df.drop(index = pos_sample.index.values)
train_graph=nx.from_pandas_edgelist(tw_df_temp, "Source", "Sink", create_using=nx.DiGraph())

In [4]:

missing_edges = []
while (len(missing_edges)<23946602):
    if (len(missing_edges) >= 200000):
        break
    a=random.randint(1, 4867136)
    b=random.randint(1, 4867136)
    tmp = edges.get((a,b),-1)
    if tmp == -1 and a!=b:
       
        try:
            # adding points who less likely to be friends
            if nx.shortest_path_length(whole_graph,source=a,target=b) > 2: 

                missing_edges.append([a,b])
            else:
                continue  
        except:  
             missing_edges.append([a,b])           
    else:
        continue

In [5]:
tw_df_neg = pd.DataFrame(missing_edges, columns=['Source', 'Sink'])
neg_sample = tw_df_neg.sample(n = 200000)
pos_sample["Linked"] = np.ones(200000)
neg_sample["Linked"] = np.zeros(200000)
all_sample = pos_sample.append(neg_sample)

In [6]:
all_sample = all_sample.reset_index(drop=True)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(all_sample[["Source", "Sink"]], all_sample["Linked"], test_size = 0.1, random_state = 0)

From here, we start to generate some features for our models.

In [25]:
def calc_adar_in(a,b):
    sum=0
    try:
        n=list(set(train_graph.successors(a)).intersection(set(train_graph.successors(b))))
        if len(n)!=0:
            for i in n:
                sum=sum+(1/np.log10(len(list(train_graph.predecessors(i)))))
            return sum
        else:
            return 0
    except:
        return 0

In [26]:
X_train["num_followers_d"] = ""
X_train['num_followees_s'] = ""
X_train['num_followees_d'] = ""
X_train['inter_followers'] = ""
X_train['inter_followees'] = ""
X_test['num_followers_s'] = ""
X_test['num_followers_d'] = ""
X_test['num_followees_s'] = ""
X_test['num_followees_d'] = ""
X_test['inter_followers'] = ""
X_test['inter_followees'] = ""

In [27]:
def compute_features_stage1(df_final):
    #calculating no of followers followees for source and destination
    #calculating intersection of followers and followees for source and destination
    num_followers_s=[]
    num_followees_s=[]
    num_followers_d=[]
    num_followees_d=[]
    inter_followers=[]
    inter_followees=[]
    for i,row in df_final.iterrows():
        try:
            s1=set(train_graph.predecessors(row['Source']))
            s2=set(train_graph.successors(row['Source']))
        except:
            s1 = set()
            s2 = set()
        try:
            d1=set(train_graph.predecessors(row['Sink']))
            d2=set(train_graph.successors(row['Sink']))
        except:
            d1 = set()
            d2 = set()
        num_followers_s.append(len(s1))
        num_followees_s.append(len(s2))

        num_followers_d.append(len(d1))
        num_followees_d.append(len(d2))

        inter_followers.append(len(s1.intersection(d1)))
        inter_followees.append(len(s2.intersection(d2)))
    
    return num_followers_s, num_followers_d, num_followees_s, num_followees_d, inter_followers, inter_followees
X_train['num_followers_s'], X_train['num_followers_d'], X_train['num_followees_s'], X_train['num_followees_d'], X_train['inter_followers'], X_train['inter_followees']= compute_features_stage1(X_train)
X_test['num_followers_s'], X_test['num_followers_d'], X_test['num_followees_s'], X_test['num_followees_d'], X_test['inter_followers'], X_test['inter_followees']= compute_features_stage1(X_test)

In [28]:
#mapping adar index on train
X_train['adar_index'] = X_train.apply(lambda row: calc_adar_in(row['Source'],row['Sink']),axis=1)
#mapping adar index on test
X_test['adar_index'] = X_test.apply(lambda row: calc_adar_in(row['Source'],row['Sink']),axis=1)

In [29]:
X_train.drop(['Source', 'Sink'],axis=1,inplace=True)
X_test.drop(['Source', 'Sink'],axis=1,inplace=True)
d_train = lgbm.Dataset(X_train, label=y_train)

d_test=lgbm.Dataset(X_test,label=y_test)

We also try a lightGBM model.

In [31]:
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'num_threads' : 2,
    'seed' : 76
}

parameters1 = {
                'max_depth':10, # crtical parameter
            'num_leaves': 800, # critical parameter, must be < 2^max_depth
            'min_data_in_leaf': 3000, # critical parameter, avoid over-fitting
    
            'max_bin': 1000,  
            'learning_rate': 0.1, # small rate with large iteration
            'num_iterations': 1000,
    
            'objective': 'binary', # don't change
            'feature_fraction': 0.9, # don't change, avoid over-fitting
            'verbose': -1, # don't' change
            'metric': 'auc', # don't change
}

clf_lightgbm = lgbm.train(parameters, d_train, valid_sets = d_test, num_boost_round=1000,
                   early_stopping_rounds=20)


[LightGBM] [Info] Number of positive: 179853, number of negative: 180147
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5879
[LightGBM] [Info] Number of data points in the train set: 360000, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499592 -> initscore=-0.001633
[LightGBM] [Info] Start training from score -0.001633
[1]	valid_0's auc: 0.940016
Training until validation scores don't improve for 20 rounds
[2]	valid_0's auc: 0.940016
[3]	valid_0's auc: 0.940016
[4]	valid_0's auc: 1
[5]	valid_0's auc: 1
[6]	valid_0's auc: 1
[7]	valid_0's auc: 1
[8]	valid_0's auc: 1
[9]	valid_0's auc: 1
[10]	valid_0's auc: 1
[11]	valid_0's auc: 1
[12]	valid_0's auc: 1
[13]	valid_0's auc: 1
[14]	valid_0's auc: 1
[15]	valid_0's auc: 1
[16]	valid_0's auc: 1
[17]	valid_0's auc: 1
[18]	valid_0's auc: 1
[19]	valid_0's auc: 1
[20]	valid_0's auc: 1
[21]	valid_0's auc: 1
[22]	vali