In [1]:
"""
Data Preprocessing
"""

import os
import re
import pandas as pd

# read train data
# Directory holding the data files. It defaults to the original local path;
# set the SML_ASS1_DIR environment variable to run the notebook elsewhere.
file_dir = os.environ.get("SML_ASS1_DIR", '/Users/dnhb/PycharmProjects/SML_Ass1')
# Name of the training file inside file_dir.
file_name = "train.txt"

def parse(data:list)->list: 
    # Python Notation: https://www.python.org/dev/peps/pep-3107/
    """
    Parse each line of the data.

    Args:
        data: iterable of text lines, e.g. the result of ``f.readlines()``.

    Returns:
        A nested list with one entry per input line, such as
        [[id1, id2, id3, ...], ...]. Every field is left as a string.

    A newline is stripped only when it directly follows a digit, and a tab is
    treated as a separator only when it sits between two digits. A line that
    does not match (for example a textual header) therefore comes back as a
    single-element list containing the untouched line.
    """
    # Raw strings keep "\d" a regex escape instead of an invalid string escape;
    # compiling once avoids rebuilding the patterns for every line.
    newline_after_digit = re.compile(r"(?<=\d)\n")
    tab_between_digits = re.compile(r"(?<=\d)\t(?=\d)")

    parsed_data = []
    for line in data:
        # remove \n at the end of each line
        line = newline_after_digit.sub("", line)
        # split IDs by \t
        parsed_data.append(tab_between_digits.split(line))
    return parsed_data

# Load the training file: one parsed row (a list of ID strings) per line.
with open(os.path.join(file_dir, file_name)) as f:
    train_set = parse(f.readlines())

# read test data
test_name = "test-public.txt"
with open(os.path.join(file_dir, test_name)) as f:
    test_set = parse(f.readlines())
# Discard the first parsed row (presumably a header line - confirm against the file).
del test_set[0]

# turn ID into int: only the first field of each test row is converted,
# the remaining fields stay strings.
for row in test_set:
    row[0] = int(row[0])

In [6]:
# Similarity

# Adjacency dict: first field of each training row -> list of the remaining fields.
# A later row with the same first field replaces the earlier entry.
train_dic = {row[0]: row[1:] for row in train_set}

def getSigmaOut(node:str, graph:dict) -> set:
    """
    Return the set of entries stored under ``node`` in ``graph``.

    Args:
        node: key to look up.
        graph: mapping from a node to an iterable of nodes.

    Returns:
        A new set built from ``graph[node]``; an empty set when ``node`` is
        not a key of ``graph``.

    Only the missing-key case is treated as "no entries". Any other error
    (for example a value that is not iterable) is raised instead of being
    silently turned into an empty set.
    """
    try:
        targets = graph[node]
    except KeyError:
        return set()
    return set(targets)

def common_neighbors1(a: str, b: str) -> float:
    """
    Count the nodes that appear in both getSigma(a) and getSigma(b).

    The value returned is the integer size of the intersection.
    """
    neighbours_a = getSigma(a)
    neighbours_b = getSigma(b)
    return len(neighbours_a & neighbours_b)

def jaccard2(a:str, b:str) -> float:
    """
    Average of jaccard1(b, node) over every node in SigmaOut[a].

    Returns the integer 0 when SigmaOut[a] is empty.

    NOTE(review): jaccard1 is not defined in the visible part of this
    notebook - confirm it exists before calling this function.
    """
    out_neighbours = SigmaOut[a]
    if not out_neighbours:
        return 0

    total = 0
    for neighbour in out_neighbours:
        total += jaccard1(b, neighbour)
    return float(total / len(out_neighbours))

def common_neighbors2(a: str, b: str) -> float:
    """
    Average of common_neighbors1(b, node) over every node in SigmaOut[a].

    Returns the integer 0 when SigmaOut[a] is empty.
    """
    out_neighbours = SigmaOut[a]
    if not out_neighbours:
        return 0

    total = sum(common_neighbors1(b, neighbour) for neighbour in out_neighbours)
    return float(total / len(out_neighbours))

def common_neighbors3(a: str, b: str) -> float:
    """
    Average of common_neighbors1(a, node) over every node in SigmaIn[b].

    Returns the integer 0 when SigmaIn[b] is empty.
    """
    in_neighbours = SigmaIn[b]
    if not in_neighbours:
        return 0

    total = sum(common_neighbors1(a, neighbour) for neighbour in in_neighbours)
    return float(total / len(in_neighbours))

In [3]:
from collections import defaultdict

# SigmaOut: node -> set of the nodes listed under it in train_dic.
# SigmaIn:  node -> set of the train_dic keys whose list contains it.
# Both are defaultdict(set), so looking up an unseen node yields an empty set.
SigmaOut = defaultdict(set)
SigmaIn = defaultdict(set)

for follower, followees in train_dic.items():
    SigmaOut[follower] = getSigmaOut(follower, train_dic)
    for followee in followees:
        # the defaultdict creates the empty set on first access
        SigmaIn[followee].add(follower)

def getSigma(node:str) -> set:
    """
    Return a new set holding every member of SigmaIn[node] and SigmaOut[node].

    Both lookups go through the module-level defaultdicts, so querying an
    unseen node adds an empty entry for it to each of them.
    """
    incoming = SigmaIn[node]
    outgoing = SigmaOut[node]
    return incoming | outgoing

In [None]:
## Common neighbors, type 1
# Scores every test record with common_neighbors1 in a process pool, labels the
# higher-scoring half 1 and the rest 0, then writes the labels and the raw
# scores to CSV files in file_dir.
import multiprocessing as mp
import time
import copy as cp

# Adjacency dict: first field of each training row -> list of the remaining fields.
train_dic = {}
for item in train_set:
    train_dic[item[0]] = item[1:]

if __name__ == '__main__':

    def getPro(record:list) -> list:
        """Return [record[0], score], where the score is for the pair (record[1], record[-1])."""
        # type-1 score, so that the CN1 outputs hold what their name says
        node = [record[0], common_neighbors1(record[1], record[-1])]
        print(node)
        return node

    # construct the # of pools corresponding to the cpu_count in ur PC
    pool = mp.Pool(mp.cpu_count())

    startTime = time.time()
    try:
        CN1 = pool.map(getPro, test_set)
    finally:
        # release the worker processes even if scoring raised
        pool.close()
        pool.join()
    endTime = time.time()
    print("Total time:" + str(endTime - startTime))

    # Rank by score, highest first. The deep copy keeps the label overwrite
    # below from touching the [id, score] lists in CN1, which are written out
    # unchanged as the score file.
    CN1_result = cp.deepcopy(sorted(CN1, key=lambda x: x[-1], reverse=True))
    # enumerate supplies each row's rank directly; looking rows up with
    # list.index would be quadratic and would match rows by value.
    half = len(CN1_result) / 2
    for index, item in enumerate(CN1_result):
        item[-1] = 1 if index < half else 0
    # restore ascending id order for the output file
    CN1_result = sorted(CN1_result, key=lambda x: x[0], reverse=False)

    title = ["Id", "Predicted"]
    # file_dir is the data directory defined in the preprocessing cell
    test_pd = pd.DataFrame(columns = title, data = CN1_result)
    test_pd.to_csv(os.path.join(file_dir, 'CN1Label.csv'),encoding='utf-8')
    test_pd = pd.DataFrame(columns = title, data = CN1)
    test_pd.to_csv(os.path.join(file_dir, 'CN1Score.csv'),encoding='utf-8')

In [4]:
## Common neighbors, type 2
# Scores every test record with common_neighbors2 in a process pool, labels the
# higher-scoring half 1 and the rest 0, then writes the labels and the raw
# scores to CSV files in file_dir.
import multiprocessing as mp
import time
import copy as cp


# Adjacency dict: first field of each training row -> list of the remaining fields.
train_dic = {}
for item in train_set:
    train_dic[item[0]] = item[1:]

if __name__ == '__main__':

    def getPro(record:list) -> list:
        """Return [record[0], score], where the score is for the pair (record[1], record[-1])."""
        node = [record[0], common_neighbors2(record[1], record[-1])]
        print(node)
        return node

    # construct the # of pools corresponding to the cpu_count in ur PC
    pool = mp.Pool(mp.cpu_count())

    startTime = time.time()
    try:
        CN2 = pool.map(getPro, test_set)
    finally:
        # release the worker processes even if scoring raised
        pool.close()
        pool.join()
    endTime = time.time()
    print("Total time:" + str(endTime - startTime))

    # Rank by score, highest first. The deep copy keeps the label overwrite
    # below from touching the [id, score] lists in CN2, which are written out
    # unchanged as the score file.
    CN2_result = cp.deepcopy(sorted(CN2, key=lambda x: x[-1], reverse=True))
    # enumerate supplies each row's rank directly; looking rows up with
    # list.index would be quadratic and would match rows by value.
    half = len(CN2_result) / 2
    for index, item in enumerate(CN2_result):
        item[-1] = 1 if index < half else 0
    # restore ascending id order for the output file
    CN2_result = sorted(CN2_result, key=lambda x: x[0], reverse=False)

    title = ["Id", "Predicted"]
    # file_dir is the data directory defined in the preprocessing cell
    test_pd = pd.DataFrame(columns = title, data = CN2_result)
    test_pd.to_csv(os.path.join(file_dir, 'CN2Label.csv'),encoding='utf-8')
    test_pd = pd.DataFrame(columns = title, data = CN2)
    test_pd.to_csv(os.path.join(file_dir, 'CN2Score.csv'),encoding='utf-8')

[379, 14.772727272727273]
[1, 1.2380952380952381]
[442, 0.0603448275862069]
[190, 2.346613545816733]
[2, 0.43661971830985913]
[127, 6.026717557251908]
[3, 1.3414634146341464]
[128, 0.0]
[129, 3.433628318584071]
[380, 29.5]
[381, 7.2]
[316, 0.7641196013289037]
[317, 11.818181818181818]
[318, 11.634920634920634]
[130, 0.26]
[319, 60.08695652173913]
[382, 1.8239366963402572]
[383, 3.6016260162601625]
[4, 3.5197628458498023]
[5, 2.611111111111111]
[384, 32.16299559471366]
[6, 0.0]
[443, 0.19230769230769232]
[385, 5.78021978021978]
[320, 0.63671875]
[444, 70.59574468085107]
[7, 0.42201834862385323]
[445, 0.15977961432506887]
[446, 2.0]
[8, 0.9263803680981595]
[447, 0.0]
[448, 0.5757575757575758]
[191, 0.31208302446256486]
[131, 2.6477732793522266]
[132, 13.045454545454545]
[449, 0.6827956989247311]
[450, 0.6666666666666666]
[451, 2.0606060606060606]
[452, 1.7675438596491229]
[453, 0.7777777777777778]
[454, 14.693877551020408]
[9, 0.20824742268041238]
[10, 1.6161616161616161]
[455, 0.3223684

[206, 3.5294117647058822]
[361, 0.17904993909866018]
[362, 21.11111111111111]
[294, 31.88032]
[295, 4.104166666666667]
[296, 8.486486486486486]
[297, 38.79477611940298]
[207, 4.416184971098266]
[298, 8.068181818181818]
[299, 0.5714285714285714]
[363, 0.33436055469953774]
[364, 2.5576923076923075]
[208, 1.8821138211382114]
[300, 5.061349693251533]
[428, 0.03639933544781755]
[209, 5.914691943127962]
[365, 7.565663036515055]
[489, 263.7150684931507]
[366, 1.7936507936507937]
[367, 0.36220472440944884]
[210, 17.258278145695364]
[211, 15.853658536585366]
[368, 6.717171717171717]
[212, 10.61111111111111]
[301, 0.3266972945380296]
[302, 0.16]
[369, 17.975206611570247]
[370, 2.176470588235294]
[371, 20.31958762886598]
[303, 0.792016806722689]
[304, 0.175]
[305, 0.8818897637795275]
[306, 0.43243243243243246]
[490, 0.9182115594329335]
[491, 0.09523809523809523]
[213, 3.8679409209383144]
[307, 60.49528301886792]
[308, 7.857142857142857]
[492, 4.363636363636363]
[214, 2.517857142857143]
[429, 40.0

[820, 109.89285714285714]
[783, 0.2314410480349345]
[784, 40.81369863013698]
[785, 1.3431372549019607]
[786, 0.1527777777777778]
[557, 4.935348446683459]
[558, 0.3333333333333333]
[559, 0.015625]
[787, 36.73504273504273]
[560, 2.8208955223880596]
[788, 0.3]
[561, 10.554455445544555]
[562, 1.3970588235294117]
[789, 0.1848341232227488]
[563, 4.196969696969697]
[564, 5.878378378378378]
[565, 2.5]
[566, 0.11702127659574468]
[567, 0.9519230769230769]
[790, 2.435064935064935]
[791, 3.3289473684210527]
[821, 8.501239669421487]
[883, 13.23407917383821]
[792, 21.221995926680243]
[102, 2.3419607843137253]
[793, 3.2444444444444445]
[884, 51.411167512690355]
[103, 31.275316455696203]
[104, 3.9047619047619047]
[105, 0.08791208791208792]
[106, 0.5217391304347826]
[107, 0.09302325581395349]
[108, 0.2222222222222222]
[109, 0.7145790554414785]
[794, 4.031124497991968]
[110, 0.5988023952095808]
[822, 0.45614495798319327]
[111, 0.3076923076923077]
[112, 0.0]
[750, 2.036838066001535]
[113, 7.7619047619047

[1094, 1.1153846153846154]
[861, 11.28921078921079]
[983, 6.801431127012522]
[1095, 29.117117117117118]
[984, 89.85051546391753]
[985, 17.333333333333332]
[1096, 3.848605577689243]
[1205, 42.512693156732894]
[986, 11.646017699115044]
[987, 0.2608695652173913]
[862, 22.02906704172527]
[863, 0.047619047619047616]
[1206, 0.033962264150943396]
[1207, 0.0]
[864, 28.923809523809524]
[865, 69.9]
[1097, 29.689845474613687]
[1098, 8.444444444444445]
[866, 0.058823529411764705]
[867, 7.684931506849315]
[1099, 0.018633540372670808]
[1100, 0.042105263157894736]
[1160, 10.421491016727472]
[1161, 0.0]
[1162, 5.590697674418605]
[988, 90.11312700106724]
[1101, 3.016042780748663]
[1102, 0.06666666666666667]
[989, 0.134375]
[1103, 8.940298507462687]
[1104, 0.07017543859649122]
[1163, 23.41891891891892]
[1105, 0.43341404358353514]
[1164, 0.0]
[990, 8.35593220338983]
[1106, 9.456521739130435]
[1208, 32.49108589951378]
[868, 4.600320170757738]
[1165, 19.295566502463053]
[869, 1.175531914893617]
[870, 26.23

[1369, 2.28125]
[1515, 0.13768115942028986]
[1516, 0.013333333333333334]
[1422, 2.3466216216216216]
[1370, 1.1909871244635193]
[1299, 16.89374262101535]
[1371, 6.381909547738694]
[1372, 0.6515151515151515]
[1373, 5.486682808716707]
[1423, 5.419811320754717]
[1517, 87.075]
[1518, 0.4186046511627907]
[1300, 1.782383419689119]
[1301, 163.33333333333334]
[1519, 1.2186732186732188]
[1424, 0.8019169329073482]
[1577, 41.43479558340794]
[1425, 0.0]
[1578, 0.6506024096385542]
[1579, 62.925925925925924]
[1580, 4.758620689655173]
[1302, 36.3325]
[1520, 1.562874251497006]
[1426, 334.0412371134021]
[1427, 41.513513513513516]
[1374, 11.048489666136724]
[1581, 11.762993762993762]
[1428, 0.9026548672566371]
[1375, 36.14832535885167]
[1429, 0.03271028037383177]
[1521, 3.642982676374592]
[1376, 0.07769145394006659]
[1303, 24.805401111993646]
[1377, 0.140625]
[1378, 1.0096153846153846]
[1379, 0.29411764705882354]
[1380, 1.7916666666666667]
[1381, 0.03117505995203837]
[1382, 13.0]
[1582, 40.63034482758621

[1793, 38.258620689655174]
[1691, 1.5159386068476977]
[1692, 1.8541666666666667]
[1794, 0.0125]
[1693, 4.02734375]
[1857, 5.737061479680444]
[1694, 3.5541666666666667]
[1795, 1.7049180327868851]
[1556, 10.132313575525812]
[1695, 4.92962962962963]
[1557, 1.944055944055944]
[1558, 3.4097222222222223]
[1559, 16.811608961303463]
[373, 228.86823770491804]
[1796, 0.006941120153183341]
[1797, 0.25925925925925924]
[1696, 0.0811965811965812]
[1697, 0.10408921933085502]
[1698, 0.6403508771929824]
[1699, 0.24281984334203655]
[374, 63.373684210526314]
[375, 0.025]
[1560, 0.925645872715816]
[1561, 1.9554655870445343]
[1741, 0.5465356405326497]
[1562, 0.525065963060686]
[1742, 5.2272727272727275]
[1743, 0.3181818181818182]
[1563, 0.0]
[1744, 97.0]
[1700, 3.6190825688073396]
[1745, 2.1095890410958904]
[1746, 0.09090909090909091]
[376, 25.428770949720672]
[1701, 0.005405405405405406]
[1891, 0.9811320754716981]
[1892, 0.23529411764705882]
[1893, 4.631578947368421]
[1564, 0.0941346850108617]
[1565, 0.55

In [7]:
## Common neighbors, type 3
# Scores every test record with common_neighbors3 in a process pool, labels the
# higher-scoring half 1 and the rest 0, then writes the labels and the raw
# scores to CSV files in file_dir.
import multiprocessing as mp
import time
import copy as cp


# Adjacency dict: first field of each training row -> list of the remaining fields.
train_dic = {}
for item in train_set:
    train_dic[item[0]] = item[1:]

if __name__ == '__main__':

    def getPro(record:list) -> list:
        """Return [record[0], score], where the score is for the pair (record[1], record[-1])."""
        node = [record[0], common_neighbors3(record[1], record[-1])]
        print(node)
        return node

    # construct the # of pools corresponding to the cpu_count in ur PC
    pool = mp.Pool(mp.cpu_count())

    startTime = time.time()
    try:
        CN3 = pool.map(getPro, test_set)
    finally:
        # release the worker processes even if scoring raised
        pool.close()
        pool.join()
    endTime = time.time()
    print("Total time:" + str(endTime - startTime))

    # Rank by score, highest first. The deep copy keeps the label overwrite
    # below from touching the [id, score] lists in CN3, which are written out
    # unchanged as the score file.
    CN3_result = cp.deepcopy(sorted(CN3, key=lambda x: x[-1], reverse=True))
    # enumerate supplies each row's rank directly; looking rows up with
    # list.index would be quadratic and would match rows by value.
    half = len(CN3_result) / 2
    for index, item in enumerate(CN3_result):
        item[-1] = 1 if index < half else 0
    # restore ascending id order for the output file
    CN3_result = sorted(CN3_result, key=lambda x: x[0], reverse=False)

    title = ["Id", "Predicted"]
    # file_dir is the data directory defined in the preprocessing cell
    test_pd = pd.DataFrame(columns = title, data = CN3_result)
    test_pd.to_csv(os.path.join(file_dir, 'CN3Label.csv'),encoding='utf-8')
    test_pd = pd.DataFrame(columns = title, data = CN3)
    test_pd.to_csv(os.path.join(file_dir, 'CN3Score.csv'),encoding='utf-8')

[316, 62.2]
[64, 348.44444444444446]
[442, 8.5]
[443, 20.0]
[1, 0.9310344827586207]
[2, 3.4444444444444446]
[317, 9.661016949152541]
[3, 20.941176470588236]
[318, 20.366197183098592]
[127, 26.06153846153846]
[128, 6.0]
[319, 3.195445920303605]
[4, 51.30555555555556]
[190, 11.59016393442623]
[191, 447.0]
[320, 12.0]
[5, 1.173913043478261]
[6, 0.0]
[7, 23.625]
[321, 116.92307692307692]
[253, 861.7752808988764]
[254, 42.0]
[255, 1.0]
[444, 2.6721044045677]
[445, 74.0]
[256, 5.945652173913044]
[257, 9.6]
[446, 3.1739130434782608]
[447, 0.0]
[129, 5.288659793814433]
[258, 79.7939393939394]
[448, 8.2]
[259, 8.0]
[8, 36.14]
[130, 26.0]
[9, 53.25]
[260, 7.0]
[379, 3.3120567375886525]
[131, 21.806451612903224]
[132, 10.428571428571429]
[322, 13.707317073170731]
[133, 59.516666666666666]
[10, 5.333333333333333]
[11, 9.5]
[12, 28.0]
[13, 34.0]
[261, 38.75757575757576]
[14, 0.0]
[449, 1.984375]
[450, 3.6666666666666665]
[323, 40.0]
[262, 117.18181818181819]
[134, 40.10497237569061]
[135, 28.8]
[15

[240, 3.6666666666666665]
[42, 6.595744680851064]
[439, 22.645833333333332]
[440, 5.333333333333333]
[441, 26.0]
[505, 39.5]
[43, 36.5]
[44, 3.3333333333333335]
[300, 20.134615384615383]
[301, 320.0]
[302, 1.6]
[506, 6.663865546218488]
[507, 110.9]
[178, 24.76]
[179, 2.5]
[508, 16.0]
[45, 11.0]
[303, 35.833333333333336]
[241, 512.4606741573034]
[46, 9.636363636363637]
[304, 6.666666666666667]
[242, 1.8333333333333333]
[489, 15.221318879855465]
[180, 1.8125]
[243, 21.90909090909091]
[244, 4.0]
[490, 58.05882352941177]
[245, 0.0]
[491, 3.0]
[47, 563.25]
[48, 18.0]
[509, 3.34]
[305, 3.2195121951219514]
[306, 1.5454545454545454]
[510, 6.0285714285714285]
[511, 0.75]
[307, 53.57225433526011]
[308, 2.2]
[512, 16.0]
[97, 511.50114942528734]
[513, 0.0]
[492, 4.5514018691588785]
[246, 15.96029776674938]
[514, 1.5]
[247, 36.333333333333336]
[493, 20.4]
[49, 7.005364806866953]
[248, 17.5]
[50, 110.45833333333333]
[51, 0.0]
[52, 18.529411764705884]
[494, 0.6428571428571429]
[53, 0.0]
[309, 4963.63

[808, 32.0]
[809, 5.375]
[545, 752.8082191780821]
[615, 7.733668341708543]
[741, 40.44444444444444]
[742, 34.0]
[907, 11.0]
[810, 54.56521739130435]
[852, 378.15686274509807]
[546, 14.24401913875598]
[547, 246.5]
[660, 1699.2808988764045]
[548, 23.857142857142858]
[549, 58.0]
[743, 45.705882352941174]
[661, 8.711864406779661]
[662, 37.0]
[616, 2.0864197530864197]
[908, 83.71428571428571]
[663, 16.95]
[909, 19.2]
[811, 3.944277108433735]
[910, 0.7142857142857143]
[853, 5.25]
[812, 14.8]
[664, 42.785714285714285]
[813, 6.0]
[854, 11.588235294117647]
[814, 5.0]
[617, 511.6]
[815, 7.0]
[618, 38.5]
[744, 2.176470588235294]
[745, 26.25]
[550, 267.69444444444446]
[665, 29.0]
[551, 112.5]
[960, 3.4178082191780823]
[816, 54.773972602739725]
[911, 0.011235955056179775]
[817, 408.625]
[818, 42.904761904761905]
[666, 115.53333333333333]
[552, 14.232758620689655]
[819, 113.0]
[855, 14.348837209302326]
[856, 33.0]
[1009, 214.35714285714286]
[746, 351.8965517241379]
[553, 13.342857142857143]
[857, 16

[1331, 38.0]
[1393, 14.0]
[1057, 47.868263473053894]
[1394, 12.0]
[1332, 7.896551724137931]
[1395, 1.0]
[1333, 11.5]
[1396, 7.571428571428571]
[1226, 20.418803418803417]
[1153, 31.294117647058822]
[1397, 184.0]
[1334, 2.7983193277310923]
[1398, 2.0]
[1335, 0.0]
[1154, 92.46666666666667]
[1155, 12.333333333333334]
[1336, 6.857142857142857]
[1156, 17.933823529411764]
[1103, 7.011494252873563]
[1157, 1.0]
[1158, 3.8676470588235294]
[1278, 148.29]
[1279, 0.0]
[1104, 5.5]
[1105, 70.33333333333333]
[1337, 252.1875]
[1280, 16.19047619047619]
[1281, 0.4166666666666667]
[1106, 10.013698630136986]
[1107, 206.87755102040816]
[1108, 13.2]
[1399, 5.689380530973452]
[1338, 1.992084432717678]
[1400, 15.0]
[1339, 13.785714285714286]
[1401, 2.6511627906976742]
[1402, 2.0]
[1403, 99.13333333333334]
[1282, 40.2]
[1109, 33.3]
[1058, 0.35064935064935066]
[1283, 30.666666666666668]
[1059, 96.5]
[1284, 0.6666666666666666]
[1340, 29.073170731707318]
[1285, 20.25]
[1341, 200.0952380952381]
[1060, 14.9302325581

[1836, 119.33333333333333]
[1837, 5.0]
[1894, 33.310077519379846]
[1588, 3.09375]
[1838, 24.227272727272727]
[1773, 385.5128205128205]
[1533, 1.6585365853658536]
[1534, 32.5]
[1535, 5.375]
[1536, 7120.0]
[1537, 23.6]
[1649, 485.4738805970149]
[1708, 115.02338129496403]
[1709, 121.0]
[1710, 4.288461538461538]
[1895, 1.435897435897436]
[1589, 22.869565217391305]
[1896, 0.6666666666666666]
[1590, 497.0]
[1591, 3.3333333333333335]
[1592, 181.0]
[1897, 7.831932773109243]
[1898, 65.0]
[1839, 5.912280701754386]
[1899, 14.741935483870968]
[1538, 4.075]
[1650, 12.916666666666666]
[1651, 1.0]
[1840, 7.538461538461538]
[1841, 24.58169934640523]
[1539, 0.7719298245614035]
[1774, 43.50625]
[1711, 2.6660436137071652]
[1481, 1.9186046511627908]
[1900, 3.554913294797688]
[1482, 2.9655172413793105]
[1483, 31.0]
[1484, 288.0]
[1652, 3.6692913385826773]
[1775, 23.4375]
[1653, 22.0]
[1776, 29.444444444444443]
[1654, 1.3333333333333333]
[1655, 117.0]
[1593, 8.044742729306488]
[1594, 32.0]
[1595, 4.66666666

[1699, 4.484848484848484]
[1943, 18.01015228426396]
[1700, 495.05]
[1701, 1.0]
[1977, 3.4022988505747125]
[1978, 11.0]
[1944, 5.734597156398104]
[1812, 0.7634069400630915]
[1945, 58.714285714285715]
[1946, 28.476190476190474]
[1882, 22.11764705882353]
[1813, 17.5]
[1883, 129.66666666666666]
[1979, 0.8721804511278195]
[1814, 14.0]
[1980, 15.72]
[1884, 1.8620689655172413]
[1749, 50.20875420875421]
[1947, 5.1625]
[1815, 38.1764705882353]
[1948, 23.36]
[1885, 10.228571428571428]
[1816, 56.42718446601942]
[1817, 271.92]
[1750, 53.65953947368421]
[1751, 1.5247524752475248]
[1981, 23.35992578849722]
[1982, 58.75]
[1983, 28.333333333333332]
[1818, 8.37037037037037]
[1819, 2.0]
[1984, 0.2463768115942029]
[1752, 375.44444444444446]
[1985, 34.6]
[1986, 94.875]
[1753, 16.430232558139537]
[1754, 38.111111111111114]
[1949, 44.14629049111808]
[1820, 0.9796610169491525]
[1821, 49.5]
[1886, 6.491860465116279]
[1822, 356.0]
[1755, 343.75]
[1756, 3.0]
[1950, 42.61971830985915]
[1823, 27.261904761904763]
