In [30]:
# Goal
# to try and pull call for papers from popular sources so that we can store that data
# into a database, and then query based on keywords of interest
# ideally this will then be automated and available

# Known sources:
# PaperCall: https://www.papercall.io/events
# Sessionize: https://www.google.com/search?q=%22call+for+speakers/papers%22+site:sessionize.com+%222021%22+-%22Call+for+Speakers+is+closed%22
# thanks to Michael - look for the site map! https://sessionize.com/sitemap/events.xml
# Sands media: https://callforpapers.sandsmedia.com/
# Eventil: https://eventil.com/
# tulu.la: https://tulu.la/events/

import json
import os
import sys
import warnings
from datetime import datetime

import requests
from bs4 import BeautifulSoup

# Install a global "ignore" filter: no warning of any category is shown from here on.
# NOTE(review): presumably added to hide the certificate warnings caused by the
# requests.get(..., verify=False) calls in the scraping cells; it also hides every
# other warning (e.g. deprecations) - consider narrowing the filter.
warnings.filterwarnings('ignore')

class EventURL:
    """Pairs an event page URL with the name of the source it came from."""

    def __init__(self, url, source):
        """Store ``url`` and ``source`` unchanged; no validation is performed."""
        self.url, self.source = url, source

class Event:
    """Plain record describing one call for papers.

    All constructor arguments are stored unchanged under attributes of the same
    name; no validation or conversion takes place.
    """

    def __init__(self, title, event_url, submit_url, cfp_closing, start_date,
                 end_date, description, location, tags, source):
        # Identity and links.
        self.title, self.event_url, self.submit_url = title, event_url, submit_url
        # CfP deadline and the event's start/end dates.
        self.cfp_closing, self.start_date, self.end_date = cfp_closing, start_date, end_date
        # Free-text details.
        self.description, self.location = description, location
        # Tags attached to the event and the name of the site it was scraped from.
        self.tags, self.source = tags, source

In [31]:
from neo4j import GraphDatabase
class Neo4jConnection:
    """Small convenience wrapper around a driver built with ``GraphDatabase.driver``.

    Driver-creation and query errors are printed instead of raised, so callers
    must be prepared for ``query`` to return ``None``.
    """

    def __init__(self, uri, user, pwd):
        """Remember the connection details and try to create the driver.

        If driver creation raises, the error is printed and the driver stays ``None``.
        """
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        self.__driver = None

        try:
            self.__driver = GraphDatabase.driver(uri, auth=(user, pwd))
        except Exception as err:
            print("Failed to create the driver:", err)

    def close(self):
        """Close the driver if one was created; otherwise do nothing."""
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run ``query`` in a fresh session and return its records as a list.

        Args:
            query: query text handed to ``session.run``.
            parameters: optional parameters handed to ``session.run``.
            db: optional database name; when given it is passed as ``database=``
                to ``driver.session``.

        Returns:
            ``list(session.run(...))``, or ``None`` when opening the session or
            running the query raised (the error is printed).

        Raises:
            AssertionError: if the driver was never created.
        """
        assert self.__driver is not None, "Driver not initialized!"

        session = None
        rows = None
        try:
            # Only forward `database` when the caller asked for a specific one.
            session_kwargs = {} if db is None else {"database": db}
            session = self.__driver.session(**session_kwargs)
            rows = list(session.run(query, parameters))
        except Exception as err:
            print("Query failed:", err)
        finally:
            # The session is closed whether or not the query succeeded.
            if session is not None:
                session.close()
        return rows

In [32]:

# Scrape PaperCall's paginated event listing and collect every open CfP in `event_list`.
#
# Pages are requested in order (page=1, 2, ...) until one contains no
# "row event-list-detail" blocks. Fields that cannot be extracted fall back to the
# string "unknown"; an event whose CfP closing date cannot be read is skipped.


def _papercall_var_text(container, css_class):
    """Return the text of the ``<var class=css_class>`` element inside ``container``.

    Raises AttributeError when ``container`` is None or the element is missing,
    which lets callers substitute their own fallback.
    """
    return container.find("var", class_=css_class).getText()


def _papercall_date(container, css_class):
    """Parse a ``<var>`` whose text matches "%B %d, %Y"; return "unknown" otherwise."""
    try:
        return datetime.strptime(_papercall_var_text(container, css_class), "%B %d, %Y").date()
    except (AttributeError, ValueError):
        return "unknown"


page_no = 0
event_list = []

while True:
    page_no += 1

    url = "https://www.papercall.io/events?page=%s" % page_no
    # NOTE(review): verify=False disables TLS certificate checks. It is kept to
    # preserve the notebook's current behaviour; drop it if it is not required.
    # The timeout stops a stalled connection from hanging the cell forever.
    htmlContent = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(htmlContent.text, 'html.parser')

    # keep iterating through the pages until we've finished
    events = soup.find_all("div", class_="row event-list-detail")
    if not events:
        break

    for event in events:

        # get CfP closing date - quasi-test to see if event has closed
        try:
            # "%Y-%m-%d" needs the first 10 characters of the attribute. Slicing only
            # 9 dropped the last digit of the day ("...-15" parsed as the 1st,
            # "...-05" failed and the event was skipped).
            cfp_closing = datetime.strptime(
                event.tbody.find("time")["datetime"][:10], "%Y-%m-%d").date()
        except (AttributeError, KeyError, TypeError, ValueError):
            continue

        # get event URL: text of the <h4> link, else the href of the <h3> link
        try:
            event_url = event.h4.find("a").getText()
        except AttributeError:
            try:
                event_url = event.h3.find("a")["href"]
            except (AttributeError, KeyError, TypeError):
                event_url = "unknown"

        # get event title, dates, description and location, submit url
        info = event.find("div", class_="col-md-2 event-list-buttons")

        try:
            submit_url = info.find("a", class_="btn btn--green-l")["href"]
        except (AttributeError, KeyError, TypeError):
            submit_url = "unknown"

        start_date = _papercall_date(info, "atc_date_start")
        end_date = _papercall_date(info, "atc_date_end")

        try:
            title = _papercall_var_text(info, "atc_title")
        except AttributeError:
            try:
                title = event.h3.find("a").getText()
            except AttributeError:
                title = "unknown"

        try:
            description = _papercall_var_text(info, "atc_description")
        except AttributeError:
            description = "unknown"

        try:
            location = _papercall_var_text(info, "atc_location")
        except AttributeError:
            location = "unknown"

        # get the event tags if they exist (comma-separated text of the 4th <h4>)
        try:
            raw_tags = (event.find("div", class_="col-md-11 col-sm-12")
                        .find_all("h4")[3].getText().replace("\n", "").split(","))
            tags = [tag for tag in raw_tags if tag]
        except (AttributeError, IndexError):
            tags = []

        event_list.append(Event(title, event_url, submit_url, cfp_closing, start_date, end_date,
                                description, location, tags, "PaperCall"))

     




In [33]:
# Sessionize using sitemap: collect the event page URLs into `sess_list`.
url = "https://sessionize.com/sitemap/events.xml"
sess_list = []

# NOTE(review): verify=False disables TLS certificate checks. It is kept to preserve
# the notebook's current behaviour; drop it if it is not required.
# The timeout stops a stalled connection from hanging the cell forever.
htmlContent = requests.get(url, verify=False, timeout=30)
data = htmlContent.text

soup = BeautifulSoup(data, 'html.parser')
events = soup.find_all("url")

for event in events:
    priority = event.find("priority")
    loc = event.find("loc")
    # A <url> entry without <priority> or <loc> is skipped instead of crashing the cell.
    if priority is None or loc is None:
        continue
    # Only entries with priority "0.8" are kept.
    # NOTE(review): presumably that value marks event landing pages in this sitemap - confirm.
    if priority.getText().strip() == "0.8":
        sess_list.append(loc.getText().strip())


In [35]:
# now get Sessionize data: visit every URL in `sess_list` and append each open CfP
# to `event_list`. Pages that cannot be fetched, that show the "alert alert-danger"
# banner, or that lack a title or location are skipped.


def _sessionize_date(block):
    """Parse the ``<h2>`` text of ``block`` using the "%d %b %Y" format.

    Raises AttributeError when the ``<h2>`` is missing and ValueError when its
    text does not match the format.
    """
    return datetime.strptime(block.find("h2").getText(), "%d %b %Y").date()


for url in sess_list:
    try:
        # NOTE(review): verify=False disables TLS certificate checks; kept as-is.
        # The timeout stops a stalled connection from hanging the whole loop.
        htmlContent = requests.get(url, verify=False, timeout=30)
        soup = BeautifulSoup(htmlContent.text, 'html.parser')
    except Exception as exc:
        # Best effort: report the page and carry on with the next one.
        print(url, exc)
        continue

    # has this event already closed?
    if soup.find("div", class_="alert alert-danger") is not None:
        continue

    # title, event_url, start/end date, location and description all live in the left column
    content = soup.find("div", class_="col-md-6 animated fadeInLeft")
    heading = content.find("h4") if content is not None else None
    if heading is None:
        # maybe 404?
        continue
    title = heading.getText()

    dates = content.find_all("div", class_="col-sm-6 m-b-md")
    try:
        start_date = _sessionize_date(dates[0])
    except (AttributeError, IndexError, ValueError):
        # Previously this branch re-ran the failing parse and aborted the whole loop.
        print("ouch3", url)
        start_date = "unknown"

    try:
        end_date = _sessionize_date(dates[1])
    except (AttributeError, IndexError, ValueError):
        # No separate end date on the page: treat the event as ending when it starts.
        end_date = start_date

    try:
        event_url = content.find("a").getText()
    except AttributeError:
        event_url = "unknown"

    try:
        location = content.find("div", class_="col-sm-12 m-b-md").find("h2").getText()
    except AttributeError:
        continue

    # Description is the concatenated text of every "col-sm-12" block in the left column.
    description = "".join(item.text for item in content.find_all("div", class_="col-sm-12"))

    # Reset per event so a failed lookup can never reuse the previous event's deadline.
    cfp_closing = "unknown"
    try:
        cfp_panel = soup.find("div", class_="col-md-6 animated fadeInRight")
        cfp_closing = _sessionize_date(cfp_panel.find_all("div", class_="col-sm-6 m-b-sm")[1])
    except (AttributeError, IndexError, ValueError):
        print("ouch 6", url)

    event_list.append(Event(title, event_url, url, cfp_closing, start_date, end_date,
                            description, location, [""], "Sessionize"))
    
       

In [39]:
# Connect up to the Neo4j instance.
# Connection details can be overridden with the NEO4J_URI / NEO4J_USER /
# NEO4J_PASSWORD environment variables; the defaults are the values that were
# previously hard-coded here.
conn = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", 'bolt://54.152.43.56:7687'),
    user=os.environ.get("NEO4J_USER", 'neo4j'),
    pwd=os.environ.get("NEO4J_PASSWORD", ''),
)

# database clearout - removes EVERY node and relationship in the database
conn.query("""MATCH (n) DETACH DELETE n""")

# set the indexes
# Each statement is sent on its own: a single run() call executes one statement, so
# the previous ";"-separated batch could not succeed as a whole.
for index_statement in (
    "CREATE INDEX ON :Location(value)",
    "CREATE INDEX ON :Tag(value)",
    "CREATE INDEX ON :Synonym(value)",
):
    try:
        conn.query(index_statement)
    except Exception as exc:
        # indexes are probably already set
        print("index statement failed:", index_statement, exc)


# Load the StackOverflow tags and synonyms
# For now, we'll switch out hyphens for spaces
try:
    print("loading SO tags")
    conn.query("""
        LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/lju-lazarevic/misc/main/sor.csv' AS row
        WITH row.Tag AS tag, split(row.synonyms, ';') AS syms
        MERGE (t:Tag {value:replace(tag, '-',' ')})
            ON CREATE SET t.source = 'StackOverflow'
        WITH t, syms
        FOREACH (n IN syms | 
            MERGE (s:Synonym {value:replace(n,'-',' ')})
            CREATE (t)-[:HAS_SYNONYM]->(s))""")
except Exception as exc:
    print("loading SO tags failed:", exc)

  


loading SO tags
loading countries


In [40]:
# get stuff into neo: one Event node per scraped event, linked to its Location and
# to one Tag node per declared tag.


def _clean_tags(raw_tags):
    """Strip each tag, drop blank ones and case-insensitive duplicates.

    The first spelling seen is kept and order is preserved. This avoids merging a
    Tag with an empty value and creating parallel HAS_TAG relationships for tags
    that only differ in case or surrounding whitespace.
    """
    unique = {}
    for raw in raw_tags or []:
        tag = raw.strip()
        if tag:
            unique.setdefault(tag.lower(), tag)
    return list(unique.values())


# The statement is identical for every event, so it is built once outside the loop.
query = """CREATE (e:Event {title: $title, eventURL: $eventURL, submitURL: $submitURL, startDate: $startDate,
                    endDate: $endDate, description: $description, source: $source, cfpClosing: $cfpClosing})
           MERGE (l:Location {value:$lvalue})
           WITH e, l
           CREATE (e)-[:IN_LOCATION]->(l)
           WITH e
           UNWIND $tags as tag
           MERGE (t:Tag {value:tolower(trim(tag))})
           CREATE (e)-[:HAS_TAG {count:1}]->(t)"""

for event in event_list:
    try:
        # start creating!
        params = {'title': event.title,
                  'eventURL': event.event_url,
                  'submitURL': event.submit_url,
                  'startDate': event.start_date,
                  'endDate': event.end_date,
                  'description': event.description,
                  'source': event.source,
                  'cfpClosing': event.cfp_closing,
                  'lvalue': event.location.strip(),
                  'tags': _clean_tags(event.tags)}

        conn.query(query, parameters=params)

    except Exception as exc:
        # Report which event failed and why, then continue with the next one.
        print(event.submit_url)
        print(exc)


In [42]:
# we can do all of this stuff in python, but just showing what's possible in the db too
# do some similarity mapping to tags
#
# The Cypher below is written in raw strings so backslashes reach Neo4j unchanged.
# Cypher string literals use "\" as their own escape character ("\b" is a backspace),
# so a regex word boundary has to be spelled "\\b" in the query text, and the
# replacement that escapes dots has to be spelled '\\.'.
# NOTE(review): only "." is escaped in tag values; other regex metacharacters in a
# tag (e.g. "+") are still passed to the regex as-is.


def _run_tag_query(step, cypher):
    """Run one tagging/clean-up query and report, rather than raise, any failure."""
    try:
        conn.query(cypher)
    except Exception as exc:
        print(step, "failed:", exc)


# start linking tags to events. We'll keep a count in the r.count property
# for how often the tag occurs (e.g. to catch 'if')
_run_tag_query("predict tags from tag values", r"""
    MATCH (e:Event), (t:Tag)
    WITH e, t, tolower(" "+e.title+" " + e.description) AS text,
        apoc.text.format("(\\b%s\\b|\\B %s\\b)", [replace(t.value, '.','\\.'), replace(t.value, '.','\\.')]) AS rp
    WITH e, t, size(apoc.text.regexGroups(text, rp)) as c
        WHERE c>0 AND NOT (e)-[:HAS_TAG]->(t)
    MERGE (e)-[r:PREDICTED_TAG]->(t)
        ON CREATE SET r.count = c, r.source = 'tag'
        ON MATCH SET r.count = r.count+c""")

# now use the synonyms to try and find any more. Matches are added to the same
# r.count property on the relationship to the synonym's tag.
_run_tag_query("predict tags from synonyms", r"""
    MATCH (e:Event), (t:Tag)--(s:Synonym)
    WITH e, t, tolower(" "+e.title+" " + e.description) AS text,
        apoc.text.format("(\\b%s\\b|\\B %s\\b)", [replace(s.value, '.','\\.'), replace(s.value, '.','\\.')]) AS rp
    WITH e, t, size(apoc.text.regexGroups(text, rp)) as c
        WHERE c>0 AND NOT (e)-[:HAS_TAG]->(t)
    MERGE (e)-[r:PREDICTED_TAG]->(t)
        ON CREATE SET r.count = c, r.source = 'synonymn'
        ON MATCH SET r.count = r.count+c""")

#check to see whether there are similar tags from SO and from PC and reduce

#do some basic stats to clean up predicted tags

#Do we have a crazy ratio between a count of a predicted tag and how often it occurs?
# The aggregation is grouped by tag only. With `r` as an extra grouping key every
# group held a single relationship, so the "ratio" was just r.count.
_run_tag_query("drop predicted tags with a high count/event ratio", """
    MATCH (e:Event)-[r:PREDICTED_TAG]->(t:Tag)
    WITH t, collect(r) AS rels, tofloat(sum(r.count))/tofloat(count(e)) AS ratio WHERE ratio >10
    UNWIND rels AS rel
    DELETE rel""")

#Does a predicted tag appear in over 20% of the talks?
_run_tag_query("drop predicted tags present on over 20% of events", """
    MATCH (e:Event)
    WITH count(e) AS events
    MATCH (e:Event)-[r:PREDICTED_TAG]->(t:Tag)
    WITH t, count(e) AS occ, events
    WITH t, occ, tofloat(occ)/tofloat(events) AS ratio WHERE ratio >0.2
    MATCH (e:Event)-[r:PREDICTED_TAG]->(t:Tag)
    DELETE r""")

In [None]:
#eyeballing the data, do some simple clean-up of tags:
# Grouped per tag (not per relationship) so the ratio really is
# total occurrences / number of events; all relationships of an offending tag are removed.
match (e:Event)-[r]->(t:Tag)
WITH t, collect(r) AS rels, tofloat(sum(r.count))/tofloat(count(e)) as ratio WHERE ratio >10
UNWIND rels AS rel
DELETE rel

#where the share of events carrying a tag is greater than 20%, let's cut (if predicted!)
MATCH (e:Event)
WITH count(e) AS events
MATCH (e:Event)-[r:PREDICTED_TAG]->(t:Tag)
WITH t, count(e) AS occ, events
WITH t, occ, tofloat(occ)/tofloat(events) AS ratio WHERE ratio >0.2
MATCH (e:Event)-[r:PREDICTED_TAG]->(t:Tag)
DELETE r

In [None]:
#Queries for the data
# NOTE(review): this cell holds raw Cypher rather than Python and has no execution
# count; presumably the statements are meant to be run one at a time in a Neo4j client.

# Create a graph named "similar" from two Cypher queries: the first returns every
# Event as a node, the second returns a source/target pair for every two events that
# both have a relationship to the same Tag.
CALL gds.graph.create.cypher("similar", 
"MATCH (e:Event) RETURN id(e) AS id",
"MATCH (e1:Event)-->(t:Tag)<--(e2:Event) RETURN id(e1) AS source, id(e2) AS target")

# Preview only: stream the similarity results with both event titles, highest score first.
call gds.nodeSimilarity.stream("similar")
YIELD node1, node2, similarity
RETURN gds.util.asNode(node1).title, gds.util.asNode(node2).title, similarity order by similarity desc

# Write a SIMILAR_TO relationship for every streamed pair scoring at least 0.8.
# NOTE(review): there is no ordering filter here, so a pair streamed in both
# directions gets a relationship in both directions - confirm that is wanted.
call gds.nodeSimilarity.stream("similar")
YIELD node1, node2, similarity
WITH gds.util.asNode(node1) as n1, gds.util.asNode(node2) as n2, similarity WHERE similarity >= 0.8
CREATE (n1)-[:SIMILAR_TO]->(n2)

# Variant of the previous statement that only keeps pairs with id(n1) > id(n2).
# NOTE(review): CREATE always adds a new relationship, so running this after the
# previous statement duplicates SIMILAR_TO for those pairs - presumably only one of
# the two variants is meant to be run.
call gds.nodeSimilarity.stream("similar")
YIELD node1, node2, similarity
WITH gds.util.asNode(node1) as n1, gds.util.asNode(node2) as n2, similarity WHERE similarity >= 0.8 AND id(n1)>id(n2)
CREATE (n1)-[:SIMILAR_TO]->(n2)

#similar events
# Start from events that have an outgoing SIMILAR_TO and no incoming relationship of
# any type, then list the titles of every event reachable through SIMILAR_TO chains.
match (e:Event)-[:SIMILAR_TO]->(e2)
WHERE NOT  ()-->(e)
WITH e
MATCH (e)-[:SIMILAR_TO*]->(e1)
return distinct  e.title, collect(distinct e1.title)