In [1]:
"""
Data Preprocessing
"""

import os
import re
import pandas as pd

# read train data
# Directory that holds the data files. Set the SML_DATA_DIR environment variable
# to point the notebook at another location without editing this cell; when it
# is unset the original path is used.
file_dir = os.environ.get('SML_DATA_DIR', '/Users/dnhb/PycharmProjects/SML_Ass1')
file_name = "train.txt"

def parse(data:list)->list:
    """
    Parse each line of the data.

    data: list of raw text lines.
    Return a nested list with one inner list of ID strings per input line,
    such as [[id1, id2, id3,...],...].

    A trailing newline is removed only when it directly follows a digit, and a
    tab is treated as a separator only when it sits between two digits, so a
    line without digits around its tabs comes back as a single unsplit element.

    Annotation syntax: https://www.python.org/dev/peps/pep-3107/
    """
    # Raw strings keep "\d" a regex escape rather than an invalid string escape;
    # both patterns are compiled once instead of on every iteration.
    trailing_newline = re.compile(r"(?<=\d)\n")
    id_separator = re.compile(r"(?<=\d)\t(?=\d)")

    parsed_data = []
    for line in data:
        # remove \n at the end of each line
        line = trailing_newline.sub("", line)
        # split IDs by \t
        parsed_data.append(id_separator.split(line))
    return parsed_data

# read train data: the lines are parsed as soon as they are read
with open(os.path.join(file_dir, file_name)) as f:
    train_set = parse(f.readlines())

# read test data
test_name = "test-public.txt"
with open(os.path.join(file_dir, test_name)) as f:
    test_set = parse(f.readlines())
# drop the first parsed line (presumably the header row -- verify against the file)
del test_set[0]

# turn ID into int
for record in test_set:
    record[0] = int(record[0])

In [2]:
# Similarity

# map the first ID of every training line to the remaining IDs on that line
train_dic = {record[0]: record[1:] for record in train_set}

def getSigmaOut(node:str, graph:dict) -> set:
    """
    Return the set of items that `graph` lists for `node`.

    node: key to look up in `graph`.
    graph: mapping from a node to an iterable of the nodes it points to.
    Returns a new set; it is empty when `node` is not a key of `graph`.
    """
    # Only a missing node means "no out-neighbours". The former bare `except:`
    # also silenced unrelated errors (including KeyboardInterrupt).
    try:
        neighbours = graph[node]
    except KeyError:
        return set()
    return set(neighbours)

def jaccard1(a:str, b:str) -> float:
    """
    Jaccard similarity of the neighbourhoods of `a` and `b`.

    The neighbourhood of a node is whatever getSigma(node) returns.
    Returns |A & B| / |A | B| as a float, or 0.0 when both neighbourhoods are
    empty (the ratio is undefined there and used to raise ZeroDivisionError).
    """
    SigmaA = getSigma(a)
    SigmaB = getSigma(b)
    union_size = len(SigmaA.union(SigmaB))
    if union_size == 0:
        return 0.0
    return len(SigmaA.intersection(SigmaB)) / union_size

def jaccard2(a:str, b:str) -> float:
    """
    Mean of jaccard1(b, node) over every node in SigmaOut[a].

    Returns 0.0 when SigmaOut[a] is empty, so the result is always a float as
    the annotation promises (the empty case used to return the int 0).
    """
    SigmaOutA = SigmaOut[a]
    if not SigmaOutA:
        return 0.0
    score = sum(jaccard1(b, node) for node in SigmaOutA)
    return float(score / len(SigmaOutA))

In [3]:
from collections import defaultdict

# construct Sigma Out dic (node -> set of nodes listed for it in train_dic)
# and Sigma In dic (node -> set of keys whose train_dic entry lists it)
SigmaOut = defaultdict(set)
SigmaIn = defaultdict(set)

for source, followees in train_dic.items():
    SigmaOut[source] = getSigmaOut(source, train_dic)
    for followee in followees:
        # defaultdict(set) creates the empty set on first access
        SigmaIn[followee].add(source)

def getSigma(node:str) -> set:
    """
    Return a new set combining SigmaIn[node] and SigmaOut[node].
    """
    incoming = SigmaIn[node]
    outgoing = SigmaOut[node]
    return incoming.union(outgoing)

In [4]:
## Jaccard type 2
import multiprocessing as mp
import time
import copy as cp


# rebuild the map from the first ID of each training line to the rest of that line
train_dic = {entry[0]: entry[1:] for entry in train_set}
    
if __name__ == '__main__':

    def getPro(record:list) -> list:
        """
        Score one test record.

        Returns [record[0], jaccard2(record[1], record[-1])] and prints it.
        """
        node = [record[0], jaccard2(record[1], record[-1])]
        print(node)
        return node

    # NOTE(review): getPro lives in the notebook's __main__ namespace; with the
    # "spawn" start method the worker processes may not be able to import it --
    # confirm the start method before running this on macOS/Windows.

    # construct the # of pools corresponding to the cpu_count in ur PC
    pool = mp.Pool(mp.cpu_count())

    startTime = time.time()
    try:
        Jaccard2 = pool.map(getPro, test_set)
    finally:
        # release the workers even when a task raised
        pool.close()
        pool.join()
    endTime = time.time()
    print("Total time:" + str(endTime - startTime))

    # Rank by score, highest first. Each record is copied so that writing the
    # labels below does not overwrite the raw scores kept in Jaccard2.
    Jaccard2_result = [
        list(record)
        for record in sorted(Jaccard2, key=lambda x: x[-1], reverse=True)
    ]
    # The top half of the ranking is labelled 1, the rest 0. enumerate gives the
    # rank directly instead of a linear list.index() search per record.
    half = len(Jaccard2_result) / 2
    for index, item in enumerate(Jaccard2_result):
        item[-1] = 1 if index < half else 0
    Jaccard2_result = sorted(Jaccard2_result, key=lambda x: x[0], reverse=False)

    title = ["Id", "Predicted"]
    # labels, ordered by Id
    test_pd = pd.DataFrame(columns = title, data = Jaccard2_result)
    test_pd.to_csv('/Users/dnhb/PycharmProjects/SML_Ass1/Jaccard2Label.csv',encoding='utf-8')
    # raw scores, in the order pool.map returned them
    test_pd = pd.DataFrame(columns = title, data = Jaccard2)
    test_pd.to_csv('/Users/dnhb/PycharmProjects/SML_Ass1/Jaccard2Score.csv',encoding='utf-8')

[379, 0.02952093459578109]
[1, 0.004756335876775658]
[442, 0.000668210329735728]
[190, 0.008743888983211915]
[2, 0.0004547391940738874]
[127, 0.012449053139136003]
[3, 0.006910701545848949]
[128, 0.0]
[129, 0.005455040004973148]
[316, 0.0010560980236248873]
[317, 0.010086430778857151]
[380, 0.028563740024530503]
[381, 0.00794202481009174]
[318, 0.07838691777138282]
[130, 0.005015243610008192]
[319, 0.0638754111101203]
[4, 0.014270810150223082]
[5, 0.006324196267663808]
[382, 0.04853695262731571]
[6, 0.0]
[383, 0.054284439177297694]
[443, 0.002340868371171295]
[444, 0.01709329719390961]
[320, 0.00782955342692608]
[7, 0.00303141268625669]
[384, 0.03708215296325779]
[8, 0.0026215966171755155]
[385, 0.0018224966289198253]
[445, 0.0021636348305578584]
[446, 0.0055838014654252185]
[447, 0.0]
[448, 0.014361626595026097]
[131, 0.006412839631341237]
[132, 0.024707824076822708]
[191, 0.008316549176597094]
[449, 0.0020176766460270014]
[450, 0.00022171674376920373]
[451, 0.007683449188954051]
[9, 

[164, 0.0315356636221561]
[289, 0.028177395878554546]
[165, 0.00016703478676326723]
[420, 0.05936742706257394]
[507, 0.0016342757962796396]
[508, 0.0004578039999241158]
[509, 0.0012398754653681829]
[510, 0.005049596143163747]
[290, 0.016741481121915886]
[511, 1.1753580602462611e-05]
[512, 0.0011365027084153172]
[513, 0.0]
[514, 0.0]
[515, 0.018622017086392793]
[166, 0.027284744428693143]
[421, 0.010783041471484888]
[167, 1.4756660725761396e-05]
[168, 0.003815093465172204]
[422, 0.04476945259655713]
[291, 0.008854524535454766]
[485, 0.006050844187054548]
[169, 0.030253534853706417]
[486, 0.01792075800195873]
[516, 0.015171973734461355]
[517, 0.014779774786263212]
[518, 0.005179276456956485]
[292, 0.002595452999177406]
[519, 0.012037552306098336]
[520, 0.004519508002957516]
[293, 0.005305000942024012]
[423, 0.0341763361018592]
[424, 0.0015499521323403848]
[356, 0.003871232817177986]
[487, 0.021350514820570262]
[357, 0.0290924430556409]
[488, 0.0036836201166770438]
[425, 0.001961449266584

[757, 0.015372374831963265]
[758, 0.001401089914533661]
[759, 0.007791579204133194]
[645, 0.00462714657534268]
[734, 0.02886065107189823]
[735, 0.0]
[736, 0.0021911235459521777]
[646, 0.0501041897943802]
[647, 0.0012412636701001185]
[648, 0.013271952437606696]
[649, 0.0006671193356436067]
[650, 0.006390786736191453]
[737, 0.007963069515590355]
[738, 0.04195687895366854]
[739, 0.0011661742059616313]
[651, 0.016404208117929003]
[740, 0.013293448509685554]
[652, 0.0026456037472520066]
[760, 0.0025924680567104767]
[741, 0.03126017834106442]
[761, 0.007909353455555357]
[653, 0.02469455587007862]
[654, 0.0021800015784246385]
[762, 0.0973162302794513]
[763, 0.0001529110711034786]
[742, 0.0011690270089265908]
[176, 0.0096540736507089]
[177, 0.028041573818920213]
[178, 0.04428885290575299]
[179, 0.003067606029278503]
[180, 0.007923032788538932]
[764, 0.0030675448808289573]
[765, 0.0]
[655, 0.050401775222712226]
[656, 0.015251227373317409]
[743, 0.01293115328687244]
[766, 0.010803278363510279]
[

[1025, 0.026624903957353122]
[908, 0.008344349705820575]
[1026, 0.017544005615025254]
[909, 0.005685812108800577]
[690, 0.03900537471412711]
[959, 0.03701771003843738]
[691, 0.02446537027021276]
[960, 0.008785578126308443]
[910, 0.00016440197106746168]
[911, 0.005319148936170213]
[912, 0.01463566635154835]
[913, 0.058989782789964175]
[914, 0.0034180030537179136]
[915, 0.0009379012099712462]
[916, 0.0]
[917, 0.09990949113435914]
[918, 1.839413613032758e-06]
[692, 0.032641887683935944]
[1027, 0.020445049919219728]
[693, 0.1262965581401521]
[1198, 0.005679515724633352]
[1199, 0.013191790106519058]
[1028, 0.004173320648276985]
[1200, 0.0003187072575921128]
[919, 0.02421032363022123]
[1138, 0.0001443847788914847]
[847, 0.04589626412111232]
[920, 0.033381288915674635]
[848, 0.02211410292926782]
[1087, 0.0026143440871597857]
[921, 0.0031711436296683374]
[922, 5.647067942536676e-05]
[1029, 0.019558228737314203]
[961, 0.06818528481311917]
[849, 0.010195537893502516]
[962, 0.005468469044928343]


[1335, 0.0]
[1278, 0.14963419867320915]
[1279, 0.0]
[1336, 0.0027381211518098542]
[1280, 0.003666703162572749]
[1281, 0.001410324311548632]
[1282, 0.01497806832706986]
[1061, 0.06268379625051142]
[1117, 0.043274504366546215]
[1062, 0.0]
[1118, 0.0050028852204857255]
[1387, 0.0036513370264560497]
[1388, 0.0035983917153922565]
[1389, 0.004419476072650602]
[1390, 0.0]
[1391, 0.009899546203707662]
[1337, 0.04223693715742583]
[1338, 0.03846951964341802]
[991, 0.0003908931981816964]
[1339, 0.005389296688166187]
[992, 0.008996795776334101]
[1119, 2.630300423267164e-05]
[1063, 0.001659207063092407]
[1064, 0.04135413946198354]
[1065, 0.0]
[993, 0.015751198841625437]
[994, 0.005105046783858816]
[1066, 0.000759548152406016]
[1120, 0.014301681108853445]
[1340, 0.007730674919557057]
[995, 0.009953081861055545]
[996, 0.0]
[1067, 0.0027794631426660155]
[1392, 0.010980274418656464]
[1341, 0.014731730137222536]
[1393, 0.0013175145670296488]
[1394, 0.02245872978060346]
[1395, 7.141632862937782e-06]
[997

[1622, 0.008486240751971236]
[1623, 0.07208311894654511]
[1624, 0.01603649223209387]
[1706, 0.022393311329498455]
[1707, 0.0014203181573487268]
[1625, 0.0]
[1649, 0.03317028061535334]
[1440, 0.0101924977811201]
[1650, 0.009136689720694943]
[1651, 5.982451316605921e-08]
[1461, 0.01933313397469282]
[1626, 0.002753897257905991]
[1627, 0.017839487624239914]
[1441, 0.029158836803584838]
[1462, 0.00569480114753267]
[1652, 0.0006860181019018193]
[1628, 0.014446529275681822]
[1442, 0.00484524288500828]
[1443, 0.03353664522447648]
[1653, 0.00046848079555744036]
[1654, 0.01904761904761905]
[1536, 0.003474785674073338]
[1537, 0.03494196630497364]
[1538, 0.02299301602459273]
[1539, 0.0003184541861159502]
[1444, 0.039430243425409045]
[1655, 0.00897964973431122]
[1445, 0.03589085245366685]
[1656, 0.002801595165596931]
[1708, 0.010636751534131498]
[1446, 0.0055752481034314705]
[1657, 0.0003287905684253719]
[1658, 0.09946744821224034]
[1447, 0.0001049594972246906]
[1448, 0.002962872990375993]
[1449, 0

[1808, 0.0004874731405149975]
[1241, 0.009898459003715554]
[1809, 0.01983315235355314]
[1810, 0.01314699233483502]
[1750, 0.03332616427049079]
[1751, 0.003176321826903877]
[1811, 0.010695262251501518]
[1812, 0.006097560975609756]
[1813, 0.005249833617574858]
[1814, 0.022346420984677876]
[1815, 0.03259946605881615]
[1816, 0.018570823845977285]
[1242, 0.03594721203415846]
[1752, 0.028803509206484428]
[1913, 0.012124784698187915]
[1817, 0.059295633921837135]
[1753, 0.029582924197483828]
[1754, 0.03290235117456994]
[1818, 0.02075363853436559]
[1819, 4.119871789589908e-06]
[1858, 0.0015282656334822703]
[1820, 0.006965083697894856]
[1243, 0.009843701625986733]
[1958, 0.003269148365351003]
[1244, 0.0030252349609141216]
[1959, 0.016193899456151652]
[1960, 0.022798757742688223]
[1821, 0.009542298101051774]
[1961, 0.040078292155778794]
[1962, 0.048992880100657506]
[1245, 0.008469783354719426]
[1914, 0.030538834581548886]
[1246, 0.005466517996752854]
[1915, 0.007481643061941064]
[1247, 0.03472101