In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.colors import LogNorm
from matplotlib.backends.backend_pdf import PdfPages
import copy
import matplotlib
from scipy.stats import entropy
import scipy.stats as stats
import glob, os
from matplotlib.ticker import MaxNLocator
from statistics import mean, stdev

In [3]:
df = pd.read_csv('Motif_counts/trans.txt_86400_2_2', skiprows=3, header=None, dtype=str)
df1 = pd.read_csv('Motif_counts/trans.txt_86400_2_3', skiprows=3, header=None, dtype=str)
df2 = pd.read_csv('Motif_counts/trans.txt_86400_3_2', skiprows=3, header=None, dtype=str)
df3 = pd.read_csv('Motif_counts/trans.txt_86400_3_3', skiprows=3, header=None, dtype=str)
df = pd.concat([df,df1,df2,df3])

In [4]:
df.columns = ['u','v','t','motif','count']
df['t'] = df['t'].astype(int)

In [5]:
label = pd.read_csv('Data/transactions.csv')

In [6]:
df = pd.merge(df,label,how="left",on=['U','V','t'])

In [7]:
print(df[df['friend']==0].shape)
print(df[df['friend']==1].shape)

(27530, 7)
(201356, 7)


In [8]:
F = df[df['friend']==1].reset_index(drop=True)
N = df[df['friend']==0].reset_index(drop=True)

In [9]:
motifs = pd.read_csv('Motif_counts/motifs.csv', dtype=str)
digit = motifs['motif'].values
ep = motifs['label'].values

# classification

In [9]:
digit_2e = digit[:6]
ep_2e = ep[:6]
digit_3e = digit[6:]
ep_3e = ep[6:]

In [10]:
# pivot to feature table
df = df[df['motif'].isin(digit)]
df = df.dropna().drop_duplicates().reset_index(drop=True)
df = df.iloc[df[['u','v','t','motif']].drop_duplicates().index].reset_index(drop=True)
df = df.pivot(index=["u","v","t"],columns=["motif"],values="count")
df.index = df.index.set_names(['u', 'v', 't'])
df.reset_index(inplace=True)
df = df.fillna(0)

# transactions data
trans = label.iloc[label[['u','v','t']].drop_duplicates().index].reset_index(drop=True)
trans = trans.dropna().reset_index(drop=True)

In [11]:
print(df.shape)
print(trans.shape)

(52021, 45)
(105539, 5)


In [12]:
# outer merge
trans = pd.merge(trans, df, how="left")
trans.shape

(105539, 47)

In [13]:
for d in digit:
    if d not in trans.columns:
        trans[d] = np.zeros(trans.shape[0]).astype(int)

In [14]:
# in and out merge data
in_data = trans.dropna().reset_index(drop=True)
out_data = trans.fillna(0)

In [16]:
def process_data(input_data):
    y = input_data['friend'].values
#     y = input_data['friend'].values
    x2 = []
    x3 = []
    xall = []
    for i in range(input_data.shape[0]):
        temp2 = []
        temp3 = []
        temp = []
        for m2 in digit_2e:
            temp2.append(input_data[m2][i])            
        for m3 in digit_3e:
            temp3.append(input_data[m3][i])
        for mall in digit:
            temp.append(input_data[mall][i])
        x2.append(temp2)
        x3.append(temp3)
        xall.append(temp)
    x2 = np.array(x2).astype(int)
    x3 = np.array(x3).astype(int)
    xall = np.array(xall).astype(int)
    return y, x2, x3, xall

In [70]:
Y, X2, X3, X = process_data(in_data)
# Y, X2, X3, X = process_data(out_data)

In [18]:
Y.shape

(52021,)

# Heuristic Scores

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

logisticRegr = LogisticRegression(solver='lbfgs')
logisticRegr.fit(X_train, y_train)
y_predict = logisticRegr.predict(X_test)
print(roc_auc_score(y_test,y_predict))
print(f1_score(y_test,y_predict))
print(precision_score(y_test,y_predict))
print(recall_score(y_test,y_predict))

0.5018625981978931
0.9243245479746783
0.8593629789198338
0.9999104824993286


In [25]:
import networkx as nx
df = in_data[['u','v','friend']].drop_duplicates().reset_index(drop=True)
G = nx.from_pandas_edgelist(df, source='u', target='v')

In [28]:
label = df['friend'].values
E = []
for i in range(df.shape[0]):
    E.append((df['u'][i], df['v'][i]))

In [59]:
jc = nx.jaccard_coefficient(G, E)
aa = nx.adamic_adar_index(G, E)

In [60]:
JC = []
for u, v, p in jc:
    JC.append(p)

AA = []
for u, v, p in aa:
    AA.append(p)

In [68]:
X = np.array(JC).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.25)

logisticRegr = LogisticRegression(solver='lbfgs')
logisticRegr.fit(X_train, y_train)

y_predict = logisticRegr.predict(X_test)
print(precision_score(y_test,y_predict))
print(recall_score(y_test,y_predict))
print(roc_auc_score(y_test,y_predict))
print(f1_score(y_test,y_predict))

0.817591763652641
1.0
0.5
0.8996429011205517


In [75]:
X = np.array(AA).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.25)

logisticRegr = LogisticRegression(solver='lbfgs')
logisticRegr.fit(X_train, y_train)

y_predict = logisticRegr.predict(X_test)
print(precision_score(y_test,y_predict))
print(recall_score(y_test,y_predict))
print(roc_auc_score(y_test,y_predict))
print(f1_score(y_test,y_predict))

0.8303491495076096
1.0
0.5
0.9073123012961604


# XGBoost

In [72]:
from xgboost import XGBClassifier

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.25)
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_predict = xgb.predict(X_test)

In [82]:
print(precision_score(y_test,y_predict))
print(recall_score(y_test,y_predict))
print(roc_auc_score(y_test,y_predict))
print(f1_score(y_test,y_predict))

0.8321364452423698
0.9986533800161594
0.5039624515974838
0.9078222548659567


# LR

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

In [25]:
sum(y_test)/len(y_test)

0.859141934491773

In [26]:
logisticRegr = LogisticRegression(solver='lbfgs')
logisticRegr.fit(X_train, y_train)
y_predict = logisticRegr.predict(X_test)
print(roc_auc_score(y_test,y_predict))
print(f1_score(y_test,y_predict))
print(precision_score(y_test,y_predict))
print(recall_score(y_test,y_predict))

0.5013198820874722
0.9243815669727807
0.8594615384615385
0.999910506533023


# SVM

In [27]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [28]:
clf = SVC(kernel='rbf',gamma='scale').fit(X_train, y_train)
y_predict = clf.predict(X_test)

In [29]:
print(roc_auc_score(y_test,y_predict))
print(f1_score(y_test,y_predict))
print(precision_score(y_test,y_predict))
print(recall_score(y_test,y_predict))

0.5018657336158565
0.9244580506371007
0.8595937836590245
0.999910506533023


# RF

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)

In [32]:
print(roc_auc_score(y_test,y_predict))
print(f1_score(y_test,y_predict))
print(precision_score(y_test,y_predict))
print(recall_score(y_test,y_predict))

0.5107599773491645
0.9220973782771535
0.8617766023646546
0.9914981206371934
