![bse_logo_textminingcourse](https://bse.eu/sites/default/files/bse_logo_small.png)

# Big Data Management - Assignment 2
## Graph Databases

### by Natalia Bertrán, Clarice Mottet

0. **[Part 0: Set Up and Create Sample Data](#part0)**
- **Objective**: Initialize programming environment.
  - Create sample data as defined in the homework assignment


## <a id='part0'>Part 0: Set Up and Create Sample Data</a>


- Initialize programming environment

In [1]:
#libraries
import pandas as pd
import numpy as np
import py2neo
from py2neo import Graph
import datetime
import time
import json
from faker import Faker
import pandas as pd
import numpy as np
import random
from neo4j import GraphDatabase

import sys
# Add the Inputs/ directory to the module search path so that modules stored
# there can be imported from this notebook.
sys.path.append('Inputs/')

# Fix the seed of the standard-library `random` generator so that the
# random.* draws made in the cells below are repeatable.
random.seed(42)

  - Create sample data as defined in the homework assignment

In [2]:
#global variables for creating fake data - increase if needed
# Each constant is the number of nodes generated for one node type.

# papers and the people / topics attached to them
N_PAPER = 250
N_AUTHOR = 50
N_KEYWORD = 100
N_UNIVERSITIES = 4

# conference side: conferences and their editions
N_CONFERENCE = 2
N_EDITION = 10

# journal side: journals and their volumes
N_JOURNAL = 4
N_VOLUME = 10

# number of consecutive years
N_YEAR = 5

In [40]:
#Create fake node data
#
# Builds `dict_nodes`: node type -> list of single-record dicts holding the
# properties of each node. Node types are inserted in a fixed order (paper,
# author, edition, conference, volume, journal, year, keyword, university).


def _unique_values(factory, count, max_attempts_per_value=100):
    """Return `count` distinct values produced by repeatedly calling `factory()`.

    Values are returned in generation order. This replaces the
    ``list(set(...))[:count]`` pattern, which (a) fails with IndexError when
    the oversampled batch happens to contain too many duplicates and (b)
    yields strings in an order that changes between interpreter runs.

    Args:
        factory: zero-argument callable returning a hashable value.
        count: number of distinct values required.
        max_attempts_per_value: upper bound on calls to `factory`, expressed
            per requested value, so an exhausted generator cannot loop forever.

    Raises:
        ValueError: if `count` distinct values cannot be produced within the
            allowed number of calls.
    """
    collected = {}  # dict keys keep insertion order and de-duplicate
    attempts_left = max(count, 1) * max_attempts_per_value
    while len(collected) < count:
        if attempts_left <= 0:
            raise ValueError(
                f"could only generate {len(collected)} unique values out of {count} requested"
            )
        attempts_left -= 1
        collected.setdefault(factory(), None)
    return list(collected)


dict_nodes = {}
fake = Faker(['en_US'])
# Seed Faker's shared generator as well, so that re-running this cell
# reproduces the same fake data (random.seed alone does not cover Faker).
Faker.seed(42)

# paper: a one-sentence title plus a five-sentence abstract, both unique
list_titles = _unique_values(lambda: fake.paragraph(nb_sentences=1), N_PAPER)
list_abstracts = _unique_values(lambda: fake.paragraph(nb_sentences=5), N_PAPER)
dict_nodes['paper'] = [
    {'title': title_, 'abstract': abstract_}
    for title_, abstract_ in zip(list_titles, list_abstracts)
]

# author: unique person names
dict_nodes['author'] = [
    {'name': name_} for name_ in _unique_values(fake.name, N_AUTHOR)
]

# edition: each conference edition is identified by a unique city
dict_nodes['edition'] = [
    {'city': city_} for city_ in _unique_values(fake.city, N_EDITION)
]

# conference: unique generated conference names
dict_nodes['conference'] = [
    {'conference': conference_}
    for conference_ in _unique_values(
        lambda: 'Conference of '+fake.sentence(nb_words=4), N_CONFERENCE
    )
]

# volume: consecutive volume numbers stored as strings ('0', '1', ...)
dict_nodes['volume'] = [{'volume': str(number_)} for number_ in range(N_VOLUME)]

# journal: unique generated journal names
dict_nodes['journal'] = [
    {'journal': journal_}
    for journal_ in _unique_values(
        lambda: 'Journal of '+fake.sentence(nb_words=4), N_JOURNAL
    )
]

# year: N_YEAR consecutive years starting at 2018
dict_nodes['year'] = [{'year': year_} for year_ in range(2018, 2018+N_YEAR)]

# keyword: the most frequent words (longer than 5 characters) across all
# paper titles, after removing periods and lower-casing
str_titles = ' '.join(list_titles)
str_titles = str_titles.replace('.','').lower()
df_list = pd.DataFrame({'word':str_titles.split()})
df_list['count'] = 1
df_list['count'] = df_list.groupby(by = ['word'])['count'].transform('sum')
df_list.drop_duplicates(subset = ['word'], inplace = True)
# stable sort so that words with equal counts come out in a deterministic order
df_list.sort_values(by = 'count', ascending = False, kind = 'mergesort', inplace = True)
df_list = df_list[df_list['word'].str.len() > 5]
keyword_list = list(df_list['word'])
if len(keyword_list) < N_KEYWORD:
    # fail with an actionable message instead of a bare IndexError
    raise ValueError(
        f"only {len(keyword_list)} candidate keywords found in the titles, "
        f"but N_KEYWORD = {N_KEYWORD}; lower N_KEYWORD or raise N_PAPER"
    )
dict_nodes['keyword'] = [{'keyword': word_} for word_ in keyword_list[:N_KEYWORD]]

# university: unique generated university names
dict_nodes['university'] = [
    {'university': university_}
    for university_ in _unique_values(
        lambda: 'University of '+fake.sentence(nb_words=2), N_UNIVERSITIES
    )
]

In [41]:
#Create fake edge connections
#
# `dict_edges` maps an edge type to a list of records; each record names the
# two endpoint nodes by their identifying property.

dict_edges = {}

# edition -> conference: every edition (identified by its city) is attached to
# one randomly chosen conference
dict_edges['e2c_appears_in'] = []
list_designate = [random.randint(0,N_CONFERENCE-1) for iter_ in range(N_EDITION)]
for edition_index_, conference_designation_ in enumerate(list_designate):
    dict_edges['e2c_appears_in'].append({'edition':dict_nodes['edition'][edition_index_]['city'], 'conference':dict_nodes['conference'][conference_designation_]['conference']})

# volume -> journal: every volume is attached to one randomly chosen journal.
# One journal index is drawn per *volume* (N_VOLUME draws); drawing only
# N_JOURNAL of them would leave the remaining volumes without a journal.
dict_edges['v2j_appears_in'] = []
list_designate = [random.randint(0,N_JOURNAL-1) for iter_ in range(N_VOLUME)]
for volume_index_, journal_designation_ in enumerate(list_designate):
    dict_edges['v2j_appears_in'].append({'volume':dict_nodes['volume'][volume_index_]['volume'], 'journal':dict_nodes['journal'][journal_designation_]['journal']})

# edition -> year and volume -> year: years are handed out round-robin, i.e.
# item i is attached to year number (i modulo N_YEAR).
dict_edges['e2y_occured_in'] = [
    {
        'edition': dict_nodes['edition'][position_]['city'],
        'year': dict_nodes['year'][position_ % N_YEAR]['year'],
    }
    for position_ in range(N_EDITION)
]

dict_edges['v2y_occured_in'] = [
    {
        'volume': dict_nodes['volume'][position_]['volume'],
        'year': dict_nodes['year'][position_ % N_YEAR]['year'],
    }
    for position_ in range(N_VOLUME)
]

# paper -> keyword: a paper has a keyword when that keyword occurs as a word
# of its title.
# The keywords are lower-cased words with periods removed, so the title is
# normalised the same way before matching. Comparing against the raw title
# would miss a keyword that is the capitalised first word of the title, and a
# substring test would match a keyword embedded inside a longer word.
dict_edges['p2k_has'] = []
for paper_node_ in dict_nodes['paper']:
    paper_title = paper_node_['title']
    title_words_ = set(paper_title.replace('.','').lower().split())
    for keyword_node_ in dict_nodes['keyword']:
        keyword = keyword_node_['keyword']
        if keyword in title_words_:
            dict_edges['p2k_has'].append({'title':paper_title, 'keyword':keyword})

# paper -> edition / paper -> volume: every paper is published in exactly one
# venue. A 0/1 draw per paper picks the venue kind (0 = conference edition,
# 1 = journal volume); the concrete edition/volume is chosen round-robin from
# the paper's position.
dict_edges['p2e_published_in'] = []
dict_edges['p2v_published_in'] = []
venue_draws_ = [random.randint(0,1) for _ in range(N_PAPER)]
for position_, draw_ in enumerate(venue_draws_):
    title_ = dict_nodes['paper'][position_]['title']
    if draw_ != 0:
        volume_ = dict_nodes['volume'][position_ % N_VOLUME]['volume']
        dict_edges['p2v_published_in'].append({'title':title_, 'volume':volume_})
        continue
    city_ = dict_nodes['edition'][position_ % N_EDITION]['city']
    dict_edges['p2e_published_in'].append({'title':title_, 'edition':city_})

# paper -> paper citations: each paper draws between 3 and 12 random paper
# indices; duplicate draws collapse and a paper never cites itself, so the
# final number of citations can be lower than the number of draws.
dict_edges['p2p_cites'] = []
draws_per_paper_ = [random.randint(3,12) for _ in range(N_PAPER)]
for citing_index_, n_draws_ in enumerate(draws_per_paper_):
    citing_title_ = dict_nodes['paper'][citing_index_]['title']
    cited_indices_ = {random.randint(0,N_PAPER-1) for _ in range(n_draws_)}
    dict_edges['p2p_cites'].extend(
        {'title':citing_title_, 'title_cited':dict_nodes['paper'][cited_index_]['title']}
        for cited_index_ in list(cited_indices_)
        if cited_index_ != citing_index_
    )

# author -> paper edges: authorship ('a2p_wrote') and reviews ('a2p_reviewed'
# holds the bare edge, 'a2p_reviewed_detail' the same edge plus review text and
# acceptance status).
dict_edges['a2p_wrote'] = []
dict_edges['a2p_reviewed'] = []
dict_edges['a2p_reviewed_detail'] = []

# Review outcome distribution: status 1 is drawn with weight 0.9 and status 0
# with weight 0.1 (presumably 1 = accept, 0 = reject -- confirm against the
# queries that consume 'acceptance_status').
acceptance_status_options = [1, 0]
probabilities = [0.9, 0.1]

for paper_index_ in range(N_PAPER):
    paper_title = dict_nodes['paper'][paper_index_]['title']

    # paper i is written by author i while i < N_AUTHOR; every later paper
    # gets a randomly chosen author
    if paper_index_ < N_AUTHOR:
        author_index_ = paper_index_
    else:
        author_index_ = random.randint(0, N_AUTHOR-1)

    #assign writer to paper
    dict_edges['a2p_wrote'].append({'name':dict_nodes['author'][author_index_]['name'], 'title':paper_title})

    #assign reviewers (cannot be writer)
    # Sample without replacement from every other author. Taking the first
    # entries of list(set(random ints)) would favour the lowest author indices
    # (CPython iterates small-int sets in ascending order) and could run out
    # of candidates when N_AUTHOR is small.
    eligible_reviewers_ = [index_ for index_ in range(N_AUTHOR) if index_ != author_index_]
    n_reviews = min(random.randint(1,10), len(eligible_reviewers_))
    for reviewer_index_ in random.sample(eligible_reviewers_, n_reviews):
        reviewer_name = dict_nodes['author'][reviewer_index_]['name']
        dict_edges['a2p_reviewed'].append({'name':reviewer_name, 'title':paper_title})

        acceptance_status = random.choices(acceptance_status_options, weights=probabilities, k=1)[0]
        dict_edges['a2p_reviewed_detail'].append({'name':reviewer_name, 'title':paper_title, 'review':fake.paragraph(nb_sentences=5), 'acceptance_status': acceptance_status})



In [10]:
# author -> university: each author is affiliated with one randomly drawn
# university.
affiliations_ = []
for position_ in range(N_AUTHOR):
    university_node_ = dict_nodes['university'][random.randint(0, N_UNIVERSITIES-1)]
    author_node_ = dict_nodes['author'][position_]
    affiliations_.append({'name':author_node_['name'], 'university':university_node_['university']})
dict_edges['a2u_affiliation'] = affiliations_

In [14]:
#Export the fake data
# Both dictionaries are written as JSON with a 2-space indent.

file = 'node.json'
nodes_out = f"Inputs/zjson_files/{file}"
file = 'edges.json'
edges_out = f"Inputs/zjson_files/{file}"

for target_path_, payload_ in ((nodes_out, dict_nodes), (edges_out, dict_edges)):
    with open(target_path_, 'w') as f:
        json.dump(payload_, f, indent=2)


In [4]:
#Import node and edge dictionary back in
# Reloads the two JSON files written by the export cell, so the cells below
# can run from the saved data without regenerating it.
# The open handle is bound to `f` (as in the export cell) rather than to
# `file`, which holds the file name; reusing `file` left a closed file handle
# in that variable after the cell ran.
file = 'node.json'
nodes_out = f"Inputs/zjson_files/{file}"
with open(nodes_out, 'r') as f:
    dict_nodes = json.load(f)

file = 'edges.json'
edges_out = f"Inputs/zjson_files/{file}"
with open(edges_out, 'r') as f:
    dict_edges = json.load(f)


In [16]:
#export out the node data as CSV individually
# Writes one file per node type to Inputs/nodes/<node_type>.csv, with one row
# per node and one column per node property.

for node_type, node_records_ in dict_nodes.items():
    file_out = f"Inputs/nodes/{node_type}.csv"
    # Build the frame straight from the list of record dicts. This yields the
    # same table as concatenating one single-row frame per record, without the
    # per-row overhead, and an empty list gives an empty frame instead of
    # making pd.concat raise "No objects to concatenate".
    df_all = pd.DataFrame(node_records_)
    df_all.to_csv(file_out, index = False)


In [18]:
#export out the edge data as CSV individually
# Writes one file per edge type to Inputs/edges/<edge_type>.csv, with one row
# per edge and one column per record field.

for edge_type, edge_records_ in dict_edges.items():
    file_out = f"Inputs/edges/{edge_type}.csv"
    # Build the frame straight from the list of record dicts. This yields the
    # same table as concatenating one single-row frame per record, without the
    # per-row overhead, and an edge type with no edges gives an empty frame
    # instead of making pd.concat raise "No objects to concatenate".
    df_all = pd.DataFrame(edge_records_)
    df_all.to_csv(file_out, index = False)