In [None]:
# Imports
import networkx as nx
import pandas as pd
import numpy as np
from numba import autojit, prange
import time
import pickle
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', 40)
import hyperopt
from hyperopt import hp, tpe, STATUS_OK, Trials
from sklearn.metrics import roc_auc_score,auc,roc_curve
import xgboost as xgb
from sklearn.model_selection import KFold,StratifiedKFold
from collections import Counter
import sys
from os.path import dirname
import lightgbm as lgb

In [None]:
# Load the training edge list (read from train.csv in the working directory).
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Load the test edge list (read from test.csv in the working directory).
dff = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Stack the train and test node pairs (train rows first) so a single graph can
# be built over both. `DataFrame.append` is gone from current pandas, so use
# `pd.concat`; `ignore_index=True` gives the stacked frame unique row labels
# instead of repeating the labels of the two source frames.
df = pd.concat(
    [df[['node1_id', 'node2_id']], dff[['node1_id', 'node2_id']]],
    ignore_index=True,
)

In [None]:
# Create an empty undirected graph and register every node id that appears in
# either endpoint column of the combined frame.
g = nx.Graph()
for endpoint_column in ('node1_id', 'node2_id'):
    g.add_nodes_from(df[endpoint_column])

In [None]:
# Turn each (node1_id, node2_id) row into a record and add all of them as edges.
# `edges` keeps the row order of `df`; later cells iterate it to build
# per-row features.
edge_records = df[['node1_id', 'node2_id']].to_records(index=False)
edges = list(edge_records)
g.add_edges_from(edges)

In [None]:
# Sanity check: compare counts derived from the frame with counts reported by
# the graph, then fetch the per-node degree view used by the next cell.
unique_node_ids = set(df.node1_id).union(df.node2_id)
print('Number of unique questions:', len(unique_node_ids), g.number_of_nodes())
print('Number of rows in the data:', len(df), g.number_of_edges())

d = g.degree()

In [None]:
# Plain dict {node id: degree} built from the (node, degree) pairs in `d`,
# for fast per-node lookups.
dd = {entry[0]: entry[1] for entry in d}

In [None]:
# Start the feature frame with the degree (neighbour count) of each endpoint.
# The columns are assigned as Series so `comb` inherits the row labels of `df`.
comb = pd.DataFrame()
for side in ('node1', 'node2'):
    comb[side + '_neighbor_count'] = df[side + '_id'].apply(lambda node: dd[node])

In [None]:
# Absolute difference between the two endpoints' neighbour counts.
degree_gap = comb['node1_neighbor_count'] - comb['node2_neighbor_count']
comb['diff'] = degree_gap.abs()

In [None]:
# Ratio of node1's neighbour count to node2's neighbour count.
comb['div'] = comb['node1_neighbor_count'].div(comb['node2_neighbor_count'])

In [None]:
# Carry both endpoint ids into the feature frame; later cells use them for
# per-node lookups and as merge keys.
for id_column in ('node1_id', 'node2_id'):
    comb[id_column] = df[id_column]

In [None]:
# get common neighbours
def get_intersection_count(row):
    """Return how many neighbours the row's two endpoints share in graph `g`.

    `row` must expose `node1_id` and `node2_id`; both are looked up in the
    module-level graph `g`.
    """
    first_neighbors = set(g.neighbors(row.node1_id))
    second_neighbors = set(g.neighbors(row.node2_id))
    return len(first_neighbors & second_neighbors)

# Apply the function to every row and print the elapsed wall-clock seconds.
start = time.time()
comb['common'] = df.apply(get_intersection_count, axis=1)
end = time.time()
print(end - start)

In [None]:
# get pagerank features
# Compute PageRank once for the whole graph and print how long it took.
start = time.time()
pg = nx.pagerank(g)
end = time.time()
print(end - start)
# Look the score up per endpoint id with Series.map. The previous row-wise
# DataFrame.apply(axis=1) built a Series for every row just to read one id,
# which is far slower and routes the id through the row's common dtype.
comb['pr_node1'] = comb['node1_id'].map(pg)
comb['pr_node2'] = comb['node2_id'].map(pg)

In [None]:
# get clustering features
# Compute the clustering coefficient of every node once and print the elapsed
# seconds.
start = time.time()
cl = nx.clustering(g)
end = time.time()
print(end - start)
# Per-endpoint lookup via Series.map instead of a row-wise apply(axis=1),
# which constructs a Series per row only to read a single id.
comb['cluster_node1'] = comb['node1_id'].map(cl)
comb['cluster_node2'] = comb['node2_id'].map(cl)

In [None]:
# get resource allocation index to use as a feature
# `ra` yields one (u, v, score) triple per pair in `edges`, in the same order.
ra = nx.resource_allocation_index(g, edges)
lss = []

def parallel_sum(A, parallel=True):
    """Append the score (third item) of every triple in `A` to `lss`.

    Parameters
    ----------
    A : iterable of (u, v, score) triples
    parallel : bool
        Unused; kept so existing calls keep working.

    Returns
    -------
    list
        The module-level list `lss`, extended in place.

    The original body iterated an undefined name (`ad`) instead of `A`, so the
    call raised NameError. The numba decorator was dropped as well: the loop
    consumes a Python generator and appends to a Python list, which is not
    work numba can compile to native code.
    """
    lss.extend(triple[2] for triple in A)
    return lss

# Consuming `ra` is what runs the computation, so this timing covers it.
start = time.time()
parallel_sum(ra)
end = time.time()
print(end - start)
comb['ra'] = lss

In [None]:
# Shortest-path length in `g` between the two endpoints of every row.
# NOTE(review): every pair in `edges` was itself added to `g` as an edge, so
# this length is 1 for distinct endpoints (0 when both ids are equal). Confirm
# whether the intent was the distance with the direct edge removed.
start = time.time()
ls = [nx.shortest_path_length(g, pair[0], pair[1]) for pair in edges]
end = time.time()
print(end - start)
comb['dist'] = ls

In [None]:
# calculate average degree for every node
# Average degree of each node's neighbours, computed once for the graph;
# the elapsed seconds are printed.
start = time.time()
avgdegree = nx.average_neighbor_degree(g)
end = time.time()
print(end - start)
# Per-endpoint lookup via Series.map instead of a row-wise apply(axis=1),
# which constructs a Series per row only to read a single id.
comb['avgdeg_node1'] = comb['node1_id'].map(avgdegree)
comb['avgdeg_node2'] = comb['node2_id'].map(avgdegree)

In [None]:
# calculate centrality for every node
# Degree centrality of every node, computed once for the graph; the elapsed
# seconds are printed.
start = time.time()
cen = nx.degree_centrality(g)
end = time.time()
print(end - start)
# Per-endpoint lookup via Series.map instead of a row-wise apply(axis=1),
# which constructs a Series per row only to read a single id.
comb['degcen_node1'] = comb['node1_id'].map(cen)
comb['degcen_node2'] = comb['node2_id'].map(cen)

In [None]:
# Neighbour-based similarity scores built from the two degrees and the number
# of common neighbours.
# NOTE(review): `totalfriends` subtracts the common count twice, so `jaccard`
# divides by the number of non-shared neighbours rather than the union size.
# Confirm this is intended.
deg1 = comb['node1_neighbor_count']
deg2 = comb['node2_neighbor_count']
shared = comb['common']

comb['mul'] = deg1 * deg2                       # product of the two degrees
comb['totalfriends'] = deg1 + deg2 - 2 * shared
comb['jaccard'] = shared / comb['totalfriends']
comb['SI'] = shared / (deg1 + deg2)             # common / sum of degrees
comb['SC'] = shared / np.sqrt(deg1 * deg2)      # common / geometric mean of degrees
comb['HP'] = shared / np.minimum(deg1, deg2)    # common / smaller degree
comb['HD'] = shared / np.maximum(deg1, deg2)    # common / larger degree
comb['PD'] = shared / comb['mul']               # common / product of degrees

In [None]:
# add features based on user chat history
# Use dedicated names for the two inputs. The cell used to rebind `df`/`dff`,
# replacing the combined edge list that the graph-feature cells read, so
# re-running any of those cells afterwards operated on the wrong data.
user_features_raw = pd.read_csv('user_features.csv')
train_edges = pd.read_csv('train.csv')

# Nodes need more than this many chats before their chat stats are kept.
MIN_CHATS = 3

# Keep the training rows whose node1_id has a user_features entry.
prep1 = pd.merge(user_features_raw, train_edges, left_on='node_id', right_on='node1_id')

# Per node: number of chats ("chat") and number of rows ("conn"), counted
# separately for the node1 and node2 positions.
df1 = prep1.groupby(by=['node1_id'])['is_chat'].agg(['sum', 'count']).reset_index()
df1.columns = ['node_id', "chat", "conn"]
df2 = prep1.groupby(by=['node2_id'])['is_chat'].agg(['sum', 'count']).reset_index()
df2.columns = ['node_id', "chat", "conn"]

# Add the two positions together to get one row per node.
final = pd.concat([df1, df2], axis=0)
final = final.groupby(by=['node_id']).sum().reset_index()
final['chat_conn'] = final.chat / final.conn

# Zero out both stats for nodes at or below the chat threshold.
final['chat_activity'] = np.where(final.chat > MIN_CHATS, final.chat_conn, 0)
final['chat_cnt'] = np.where(final.chat > MIN_CHATS, final.chat, 0)
prepped_feat = final[['node_id', 'chat_activity', 'chat_cnt']]

# NOTE(review): these stats are derived from the training labels (`is_chat`)
# and later joined back onto the same training rows; confirm this leakage is
# acceptable.
userfeat = user_features_raw.merge(prepped_feat, on='node_id', how='outer')
userfeat.to_csv('user_features_new.csv', index=False)

In [None]:
# data prep
# Attach the user features of both endpoints to every training row. Overlapping
# column names get pandas' default _x (node1) / _y (node2) suffixes.
train = pd.read_csv('train.csv')
user_feats = pd.read_csv('user_features_new.csv')
prep1 = pd.merge(train, user_feats, left_on='node1_id', right_on='node_id')
prep2 = pd.merge(prep1, user_feats, left_on='node2_id', right_on='node_id')

# The graph-feature frame is `comb`; the name `combo` used here before is never
# defined in this notebook, so the cell raised NameError on a fresh kernel.
# `comb` was built from train rows followed by test rows, so the first
# len(train) rows are the training part.
combo_tr = comb[0:train.shape[0]]
fin = combo_tr.merge(prep2, on=['node1_id', 'node2_id'])

In [None]:
# balance classes
# Keep every positive row and a random subset of the negatives. Each negative
# is kept with probability (#positives / #rows), as before.
BALANCE_SEED = 42  # fixed seed so the sampled training set is reproducible

train_pos = fin[fin.is_chat == 1]
train_neg = fin[fin.is_chat != 1]

keep_prob = train_pos.shape[0] / fin.shape[0]
# Draw the random numbers into a standalone mask rather than writing a helper
# column onto a filtered slice and dropping it again in place.
keep_mask = np.random.RandomState(BALANCE_SEED).rand(len(train_neg)) <= keep_prob
train_neg = train_neg[keep_mask]

train_data = pd.concat([train_pos, train_neg], axis=0)

In [None]:
# Target vector: the is_chat label of every row in the balanced training set.
target_column = 'is_chat'
y = train_data[target_column]

In [None]:
# default parameters
# LightGBM booster settings shared by the cross-validation and final-training
# cells. `verbose_eval` was removed from this dict: it is an argument of the
# training functions (both cells already pass it there), not a booster
# parameter.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'device': 'cpu',
    'boost_from_average': False,
}

In [None]:
# Model input columns, in order: the 13 user features of node1 (_x) and node2
# (_y), the graph features, then the chat-history features of both nodes.
user_feature_cols = ['f%d_%s' % (i, side) for side in ('x', 'y') for i in range(1, 14)]
graph_feature_cols = [
    'node1_neighbor_count', 'node2_neighbor_count', 'diff', 'div', 'dist',
    'common', 'mul', 'totalfriends', 'jaccard', 'SI', 'SC', 'HP', 'HD', 'PD',
    'cluster_node1', 'cluster_node2', 'pr_node1', 'pr_node2', 'ra',
    'avgdeg_node1', 'avgdeg_node2', 'degcen_node1', 'degcen_node2',
]
chat_feature_cols = [
    '%s_%s' % (stat, side)
    for stat in ('chat_activity', 'chat_cnt')
    for side in ('x', 'y')
]
cll = user_feature_cols + graph_feature_cols + chat_feature_cols

In [None]:
# Training matrix: the balanced training rows restricted to the model columns.
data = train_data.loc[:, cll]

In [None]:
# Columns handed to LightGBM as categorical: feature f13 of node1 (_x) and of
# node2 (_y).
cat_feat = ['f13_' + side for side in ('x', 'y')]

In [None]:
# Wrap the training matrix and labels in a LightGBM Dataset, declaring the
# categorical columns.
lgtrain = lgb.Dataset(
    data=data,
    label=y,
    categorical_feature=cat_feat,
)

In [None]:
# 5-fold stratified, shuffled cross-validation scored by AUC: up to 1500
# rounds, stopping after 50 rounds without improvement, logging every 10.
# NOTE(review): newer LightGBM releases take early stopping and logging via
# callbacks; confirm these keyword arguments against the installed version.
cv = lgb.cv(
    params,
    lgtrain,
    num_boost_round=1500,
    nfold=5,
    stratified=True,
    shuffle=True,
    metrics='auc',
    early_stopping_rounds=50,
    verbose_eval=10,
)

In [None]:
# Fit the final booster on the full training Dataset for a fixed 1250 rounds,
# logging every 10 rounds.
# NOTE(review): the round count is hard-coded, presumably read off the
# cross-validation output. Confirm it still matches.
model = lgb.train(
    params,
    lgtrain,
    verbose_eval=10,
    num_boost_round=1250,
)

In [None]:
# test data prep
# Attach the user features of both endpoints to every test row, the same way
# as for the training rows.
test = pd.read_csv('test.csv')
prep1_test = pd.merge(test, user_feats, left_on='node1_id', right_on='node_id')
prep2_test = pd.merge(prep1_test, user_feats, left_on='node2_id', right_on='node_id')

# The graph-feature frame is `comb`; the name `combo` used here before is never
# defined in this notebook, so the cell raised NameError on a fresh kernel.
# Rows after the first len(train) rows of `comb` are the test part.
combo_te = comb[train.shape[0]:]
fin_test = combo_te.merge(prep2_test, on=['node1_id', 'node2_id'])

In [None]:
# Keep the row identifiers of the prepared test frame for the submission file.
ids = fin_test.loc[:, 'id']

In [None]:
# Score the test rows with the trained model, using the same columns it was
# trained on.
test_features = fin_test[cll]
preds = model.predict(test_features)

In [None]:
# Submission frame: one row per test id with its predicted score.
op = pd.DataFrame({'id': ids, 'is_chat': preds})

In [None]:
# Write the submission to pr.csv without the index column.
op.to_csv(path_or_buf='pr.csv', index=False)