In [1]:
import json
import pickle

import numpy as np
import pandas as pd

In [2]:
# Load the pickled dataset into `reportDF`.
# NOTE(security): pickle.load can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle even when unpickling fails.
with open('final_dataset.pickle', 'rb') as f:
    reportDF = pickle.load(f)

In [3]:
# Write the loaded frame to a semicolon-separated CSV file.
reportDF.to_csv('final_dataset.csv', sep=";")

# Analyse data from BIB entries

In [3]:
# Show the column labels of the loaded frame.
reportDF.columns

Index(['Link', 'BibtexID', 'ResponsiblePerson', 'Title', 'StudyType',
       'Dataset', 'Domain of tests [other]', 'DataPyramid', 'DatasetFeatures',
       'Coupled/Decoupled', 'GroundTruthType', 'RecommendationType',
       'GroupSizes', 'Baselines', 'GroupConstruction', 'GroupDuration',
       'EvaluationDimensions', 'ReproducibleCode', 'ReproducibleData',
       'ReproducibleGroups', 'ReproducibleEvalProtocol', 'Notes',
       'ParsedDataset', 'DatasetHighLevel', 'ParsedDatasetHighLevel',
       'ParsedBaselines', 'EvaluationDimensionsHighLevel', 'ParsedDomain',
       'ParsedGroupConstruction', 'ParsedGroupConstructionHighLevel',
       'ParsedGroupSizes', 'bib_text', 'bib_ID', 'bib_title', 'bib_authors',
       'bib_year', 'bib_venue', 'bib_venue_long', 'bib_venue_type', 'bib_doi',
       'bib_url', 'bib_publisher'],
      dtype='object')

In [4]:
# How many entries fall on each `bib_year` value (most frequent first).
# decide on papers from 2016-2018
reportDF["bib_year"].value_counts()

2020    23
2021    21
2022    20
2019    17
2017    10
2023     9
2018     7
2016     5
Name: bib_year, dtype: int64

In [5]:
# How many entries fall on each `bib_venue_type` value.
reportDF["bib_venue_type"].value_counts()

article          73
inproceedings    33
inbook            6
Name: bib_venue_type, dtype: int64

In [6]:
# How many entries fall on each `bib_publisher` value.
reportDF["bib_publisher"].value_counts()

Elsevier BV                                                    33
IEEE                                                           26
ACM                                                            24
Springer                                                       19
CEUR-WS                                                         2
Wiley                                                           2
International World Wide Web Conferences Steering Committee     1
Walter de Gruyter GmbH                                          1
Research Square Platform LLC                                    1
Hindawi Limited                                                 1
Informa UK Limited                                              1
IOS Press                                                       1
Name: bib_publisher, dtype: int64

In [7]:
# Ten most frequent venues among the entries whose venue type is "article".
(
    reportDF
    .loc[lambda df: df["bib_venue_type"] == "article", "bib_venue"]
    .value_counts()
    .head(10)
)

Expert Systems with Applications                             10
Information Sciences                                          6
IEEE Transactions on Knowledge and Data Engineering           5
Knowledge-Based Systems                                       4
Decision Support Systems                                      4
Multimedia Tools and Applications                             4
IEEE Access                                                   2
ACM Transactions on Information Systems                       2
Applied Intelligence                                          2
IEEE Transactions on Neural Networks and Learning Systems     2
Name: bib_venue, dtype: int64

In [8]:
# Ten most frequent venues among the entries whose venue type is anything but "article".
(
    reportDF
    .loc[lambda df: df["bib_venue_type"] != "article", "bib_venue"]
    .value_counts()
    .head(10)
)

RecSys       6
SIGIR        5
UMAP         4
CIKM         3
ICDE         3
WSDM         2
SAC          1
FUZZ-IEEE    1
DASFAA       1
BigData      1
Name: bib_venue, dtype: int64

In [9]:
# Flatten the per-entry author lists into one flat list. Entries whose
# `bib_authors` value is not a list are skipped. A nested comprehension is used
# instead of calling `extend` inside a throwaway list comprehension.
authorsList = [
    author
    for authors in reportDF["bib_authors"]
    if isinstance(authors, list)
    for author in authors
]
allAuthors = pd.Series(authorsList)
# Displayed: (shape of all author mentions, shape of the distinct authors)
allAuthors.shape, allAuthors.unique().shape

((401,), (305,))

In [10]:
# The 50 most frequently occurring author names.
allAuthors.value_counts().head(50)

Yin, Hongzhi                 7
Lu, Jie                      4
Yera, Raciel                 4
Martínez, Luis               4
Pitoura, Evaggelia           4
Xu, Guandong                 3
Zheng, Kai                   3
Guo, Lei                     3
Yu, Li                       3
Leng, Youfang                3
Chowdary, C. Ravindranath    3
Peska, Ladislav              3
Zhang, Yujie                 3
Taleai, Mohammad             3
Wang, Qinyong                3
Guo, Zhiwei                  3
Bahari Sojahrood, Zahra      3
Castro, Jorge                3
Meng, Xiangwu                3
Stefanidis, Kostas           3
Huang, Zhenhua               3
Zhang, Guangquan             3
Du, Yulu                     3
Xu, Xin                      2
Wang, Peipei                 2
Chen, Yunwen                 2
Lv, Pengtao                  2
Yu, Keping                   2
Wang, Lei                    2
Li, Lin                      2
Sisodia, Dilip Singh         2
Pujahari, Abinash            2
Guo, Jun

# Construct Nodes and Edges for Force-directed Layout

In [11]:
def clean_list(listIn):
    """Return the items of ``listIn`` that are not uninformative placeholder labels.

    Edges built on these placeholder labels messed up the graphs quite a bit, so
    as a first solution they are simply removed. Alternatively, we could figure
    out some way to decrease their weight.

    Parameters
    ----------
    listIn : iterable, None or float
        Labels of one paper. ``None`` and float values (e.g. a NaN standing for
        a missing cell) are treated as "no labels".

    Returns
    -------
    list
        The remaining items, in their original order.
    """
    blacklist = ("other", "item-based", "-", "Not clear", "Unclear / not specified", "unknown / not applicable")
    # A missing value is not iterable; treat it as an empty label list instead of crashing.
    if listIn is None or isinstance(listIn, float):
        return []
    return [item for item in listIn if item not in blacklist]


# as an alternative, we can consider to use directed edges for some cases
def jaccard_similarity(list1, list2):
    """Jaccard similarity of two label lists after removing placeholder labels.

    Parameters
    ----------
    list1, list2 : iterable, None or float
        Label collections to compare; both are passed through ``clean_list``.

    Returns
    -------
    tuple
        ``(similarity, label)`` where ``similarity`` is
        ``len(intersection) / len(union)`` and ``label`` is the string form of
        the list of shared items. ``(0, "")`` is returned when both cleaned
        inputs are empty (the union is empty, so the ratio is undefined).
    """
    set1 = set(clean_list(list1))
    set2 = set(clean_list(list2))
    shared = set1 & set2
    union_size = len(set1 | set2)
    # An empty union is the only expected failure of the ratio; test for it
    # explicitly instead of hiding every possible error behind a bare except.
    if union_size == 0:
        return (0, "")
    # Set iteration order is arbitrary; sort so the exported label text is
    # the same on every run (key=str keeps mixed-type items sortable).
    strRep = str(sorted(shared, key=str))
    return (len(shared) / union_size, strRep)


In [12]:
def groupSizeSimilarity(list1, list2):
    """Directed similarity between two lists of group sizes.

    For each value reported in ``list1`` the best (smallest) relative distance
    ``abs(i - j) / max(i, j)`` to any value of ``list2`` is taken; the result is
    one minus the average of these best distances, i.e. to what extent the
    sizes used in ``list1`` are reflected in ``list2``. The measure is not
    symmetric.

    Parameters
    ----------
    list1, list2 : iterable of numbers, None or float
        Group sizes of the two papers. ``None`` and float values (e.g. a NaN
        standing for a missing cell) are treated as empty.

    Returns
    -------
    number
        ``1 - mean(best distances)``, or ``0`` when one of the lists is empty.
    """
    # A missing value is not iterable; treat it like an empty list.
    if list1 is None or isinstance(list1, float) or list2 is None or isinstance(list2, float):
        return 0

    dst = []
    for i in list1:
        dst_i = []
        for j in list2:
            if i == j:
                # Identical sizes are at distance 0; this also avoids the 0/0 -> NaN
                # that the ratio produces when both sizes are 0.
                d = 0.0
            else:
                d = abs(i - j) / np.max([i, j])
            dst_i.append(d)
        if len(dst_i) > 0:
            dst.append(np.min(dst_i))
    if len(dst) > 0:
        meanDistance = np.mean(dst)
        return 1 - meanDistance
    return 0  # i.e., one of the lists is empty

In [13]:
# Show the column labels again as a reference for the edge construction below.
reportDF.columns

Index(['Link', 'BibtexID', 'ResponsiblePerson', 'Title', 'StudyType',
       'Dataset', 'Domain of tests [other]', 'DataPyramid', 'DatasetFeatures',
       'Coupled/Decoupled', 'GroundTruthType', 'RecommendationType',
       'GroupSizes', 'Baselines', 'GroupConstruction', 'GroupDuration',
       'EvaluationDimensions', 'ReproducibleCode', 'ReproducibleData',
       'ReproducibleGroups', 'ReproducibleEvalProtocol', 'Notes',
       'ParsedDataset', 'DatasetHighLevel', 'ParsedDatasetHighLevel',
       'ParsedBaselines', 'EvaluationDimensionsHighLevel', 'ParsedDomain',
       'ParsedGroupConstruction', 'ParsedGroupConstructionHighLevel',
       'ParsedGroupSizes', 'bib_text', 'bib_ID', 'bib_title', 'bib_authors',
       'bib_year', 'bib_venue', 'bib_venue_long', 'bib_venue_type', 'bib_doi',
       'bib_url', 'bib_publisher'],
      dtype='object')

In [14]:
# Preview the first five rows of the frame.
reportDF.head(5)

Unnamed: 0,Link,BibtexID,ResponsiblePerson,Title,StudyType,Dataset,Domain of tests [other],DataPyramid,DatasetFeatures,Coupled/Decoupled,...,bib_ID,bib_title,bib_authors,bib_year,bib_venue,bib_venue_long,bib_venue_type,bib_doi,bib_url,bib_publisher
1,https://www.sciencedirect.com/science/article/...,,LP,A novel group recommender system based on memb...,offline,ml100k,movies,No_groups,,Coupled,...,Barzegar_Nozari_2020,A novel group recommender system based on memb...,"[Barzegar Nozari, Reza, Koohi, Hamidreza]",2020,Knowledge-Based Systems,,article,10.1016/j.knosys.2020.106296,http://dx.doi.org/10.1016/j.knosys.2020.106296,Elsevier BV
2,https://www.sciencedirect.com/science/article/...,,LP,Member contribution-based group recommender sy...,offline,"ml1m,ml100k,jester","movies,humor[tourism]",No_groups,,Mixture or hybrid,...,Wang_2016,Member contribution-based group recommender sy...,"[Wang, Wei, Zhang, Guangquan, Lu, Jie]",2016,Decision Support Systems,,article,10.1016/j.dss.2016.05.002,http://dx.doi.org/10.1016/j.dss.2016.05.002,Elsevier BV
3,https://www.sciencedirect.com/science/article/...,,LP,Content-based group recommender systems: A gen...,offline,"ml100k,hetrec",movies,No_groups,,Coupled,...,P_rez_Almaguer_2021,Content-based group recommender systems: A gen...,"[Pérez-Almaguer, Yilena, Yera, Raciel, Alzahra...",2021,Expert Systems with Applications,,article,10.1016/j.eswa.2021.115444,http://dx.doi.org/10.1016/j.eswa.2021.115444,Elsevier BV
4,https://www.sciencedirect.com/science/article/...,,LP,Personalized hybrid recommendation for group o...,offline,ml10m,movies,No_groups,,Coupled,...,Ka_k_2016,Personalized hybrid recommendation for group o...,"[Kaššák, Ondrej, Kompan, Michal, Bieliková, Má...",2016,Information Processing &amp; Management,,article,10.1016/j.ipm.2015.10.001,http://dx.doi.org/10.1016/j.ipm.2015.10.001,Elsevier BV
6,https://ieeexplore.ieee.org/abstract/document/...,,LP,Opinion Dynamics-Based Group Recommender Systems,offline,"ml100k,ml1m",movies,No_groups,,Coupled,...,Castro_2018,Opinion Dynamics-Based Group Recommender Systems,"[Castro, Jorge, Lu, Jie, Zhang, Guangquan, Don...",2018,"IEEE Transactions on Systems, Man, and Cyberne...",,article,10.1109/tsmc.2017.2695158,http://dx.doi.org/10.1109/tsmc.2017.2695158,IEEE


In [15]:
# Multiplier applied to the "value" field of every edge built in the cells below.
link_width_factor = 10

In [16]:
raw_data = reportDF
# Every edge is a dict with the keys: source, target, type, value (weight), label.
edgeListAuthors = []
edgeListVenue = []
edgeListYear = []

# Materialise the rows once: calling iterrows() again inside the outer loop
# rebuilt every row Series for every outer row.
paper_rows = list(raw_data.iterrows())

for i, r1 in paper_rows:
    for j, r2 in paper_rows:
        if not (i < j):
            # Each unordered pair is visited once and a paper is never paired with itself.
            continue

        # Shared authors: weight is the Jaccard similarity of the two author lists.
        val, simAuthors = jaccard_similarity(r1["bib_authors"], r2["bib_authors"])
        if val > 0:
            edgeListAuthors.append({"source": i, "target": j, "type": "same_authors", "value": val*link_width_factor, "label": simAuthors})

        # Same venue / same year: fixed full weight.
        if r1["bib_venue"] == r2["bib_venue"]:
            edgeListVenue.append({"source": i, "target": j, "type": "same_venue", "value": 1.0*link_width_factor, "label": r1["bib_venue"]})
        if r1["bib_year"] == r2["bib_year"]:
            edgeListYear.append({"source": i, "target": j, "type": "same_year", "value": 1.0*link_width_factor, "label": r1["bib_year"]})


In [17]:
# Add directed "year_olderThan" edges from each paper to every strictly newer
# paper. The weight 1 / (1 + gap in years) shrinks as the gap grows, and edges
# whose weight is not above 0.1 are dropped.

# This cell appends to a list created in an earlier cell, so re-running it used
# to add the same edges again. Purge this cell's own edges first to make it idempotent.
edgeListYear = [edge for edge in edgeListYear if edge["type"] != "year_olderThan"]

# (index label, raw year used in the label text, year as int) - each row is read only once.
paper_years = [(idx, row["bib_year"], int(row["bib_year"])) for idx, row in raw_data.iterrows()]

for i, raw_year1, year1 in paper_years:
    for j, raw_year2, year2 in paper_years:
        gap = year2 - year1
        if gap > 0:  # first is older
            val = 1 / (1 + gap)
            if val > 0.1:
                label = str(raw_year1) + " vs." + str(raw_year2)
                edgeListYear.append({"source": i, "target": j, "type": "year_olderThan", "value": val*link_width_factor, "label": label})

In [18]:
# One node per row of the frame, identified by the row's index label.
nodeList = [
    {
        "id": idx,
        "label": paper["bib_ID"],
        "title": paper["bib_title"],
        "year": paper["bib_year"],
        "venue": paper["bib_venue"],
    }
    for idx, paper in raw_data.iterrows()
]

In [19]:
# Write one {"nodes": ..., "links": ...} JSON file per edge list.
# `with` closes each file even if serialisation fails part-way through.
for out_path, links in (
    ("graphEdgesAuthors.json", edgeListAuthors),
    ("graphEdgesVenue.json", edgeListVenue),
    ("graphEdgesYear.json", edgeListYear),
):
    with open(out_path, "w") as out_file:
        json.dump({"nodes": nodeList, "links": links}, out_file, indent=3)

In [20]:
# Export each edge list as a semicolon-separated CSV file as well.
for csv_path, edges in (
    ("graphEdgesAuthors.csv", edgeListAuthors),
    ("graphEdgesVenue.csv", edgeListVenue),
    ("graphEdgesYear.csv", edgeListYear),
):
    pd.DataFrame(edges).to_csv(csv_path, sep=";")

In [None]:
# NOTE: there might be some inconsistencies caused by dropping duplicates (I'll mention it during the meeting)

In [21]:
# Every edge is a dict with the keys: source, target, type, value (weight), label.
edgeListBaselines = []
edgeListDatasets = []
edgeListGroupConstruction = []
edgeListEvalMetrics = []
edgeListOther = []

# Columns compared with jaccard_similarity: (column, edge list to fill, edge "type").
# NOTE(review): "same_baseliens" looks like a typo for "same_baselines". It is kept
# verbatim because the string is written to the exported files - confirm that
# nothing downstream keys on it before renaming.
overlap_edge_specs = (
    ("ParsedBaselines", edgeListBaselines, "same_baseliens"),
    ("ParsedDatasetHighLevel", edgeListDatasets, "same_datasets"),
    ("ParsedGroupConstructionHighLevel", edgeListGroupConstruction, "same_groupConstruction"),
    ("EvaluationDimensionsHighLevel", edgeListEvalMetrics, "similar_EvalMetrics"),
)

# Columns compared for plain equality: (column, edge "type"); all go to edgeListOther.
equality_edge_specs = (
    ("StudyType", "same_studyType"),
    ("DataPyramid", "same_dataPyramid"),
    ("Coupled/Decoupled", "same_evalStyle"),
    ("GroundTruthType", "same_groundTruth"),
    ("RecommendationType", "same_recsType"),
    ("GroupDuration", "same_groupDuration"),
)

# Materialise the rows once: calling iterrows() again inside the outer loop
# rebuilt every row Series for every outer row.
paper_rows = list(raw_data.iterrows())

for i, r1 in paper_rows:
    for j, r2 in paper_rows:
        if not (i < j):
            # Each unordered pair is visited once and a paper is never paired with itself.
            continue

        for column, edge_list, edge_type in overlap_edge_specs:
            val, shared = jaccard_similarity(r1[column], r2[column])
            if val > 0:
                edge_list.append({"source": i, "target": j, "type": edge_type, "value": val*link_width_factor, "label": shared})

        for column, edge_type in equality_edge_specs:
            if r1[column] == r2[column]:
                edgeListOther.append({"source": i, "target": j, "type": edge_type, "value": 1.0*link_width_factor, "label": r1[column]})
                
                


In [22]:
# Every edge is a dict with the keys: source, target, type, value (weight), label.
edgeListGroupSizes = []

# groupSizeSimilarity is directed, so both (i, j) and (j, i) are evaluated.
# Read the column once instead of rebuilding every row inside the inner loop.
group_sizes = list(raw_data["ParsedGroupSizes"].items())

for i, sizes1 in group_sizes:
    for j, sizes2 in group_sizes:
        if i == j:
            # Comparing a paper with itself only produced a self-loop edge.
            continue
        val = groupSizeSimilarity(sizes1, sizes2)
        if val > 0.2:
            label = str(sizes1) + "vs." + str(sizes2)
            edgeListGroupSizes.append({"source": i, "target": j, "type": "similar_groupSizes", "value": val*link_width_factor, "label": label})

In [23]:
# Write one {"nodes": ..., "links": ...} JSON file per edge list.
# `with` closes each file even if serialisation fails part-way through.
for out_path, links in (
    ("graphEdgesDatasets.json", edgeListDatasets),
    ("graphEdgesBaselines.json", edgeListBaselines),
    ("graphEdgesGroupConstruction.json", edgeListGroupConstruction),
    ("graphEdgesEvalMetrics.json", edgeListEvalMetrics),
    ("graphEdgesGroupSizes.json", edgeListGroupSizes),
    ("graphEdgesOther.json", edgeListOther),
):
    with open(out_path, "w") as out_file:
        json.dump({"nodes": nodeList, "links": links}, out_file, indent=3)

In [24]:
# Export each edge list as a semicolon-separated CSV file as well.
for csv_path, edges in (
    ("graphEdgesDatasets.csv", edgeListDatasets),
    ("graphEdgesBaselines.csv", edgeListBaselines),
    ("graphEdgesGroupConstruction.csv", edgeListGroupConstruction),
    ("graphEdgesEvalMetrics.csv", edgeListEvalMetrics),
    ("graphEdgesGroupSizes.csv", edgeListGroupSizes),
    ("graphEdgesOther.csv", edgeListOther),
):
    pd.DataFrame(edges).to_csv(csv_path, sep=";")