In [1]:
from graphdatascience import GraphDataScience
import pandas as pd
import configparser

### Neo4j Settings
The NEO4J_PROPERTIES_FILE is an ini configuration file containing the Neo4j connection properties that this notebook needs in order to connect to your Neo4j instance and load data. The ini file should be formatted as follows:

```
[NEO4J]
PASSWORD=<password>
USERNAME=<username, is 'neo4j' by default>
HOST=<host uri>
```

In [2]:
# Location of the ini file that provides a [NEO4J] section with the HOST,
# USERNAME and PASSWORD entries read below. Edit this to point at your own file.
NEO4J_PROPERTIES_FILE = '/Users/zachblumenfeld/devtools/aura-mind-2.ini'

config = configparser.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files it
# actually parsed; check it so a wrong path fails with a clear message instead
# of a confusing KeyError on the lookups below.
if not config.read(NEO4J_PROPERTIES_FILE):
    raise FileNotFoundError(
        f'Could not read Neo4j properties file: {NEO4J_PROPERTIES_FILE}'
    )

HOST = config['NEO4J']['HOST']
USERNAME = config['NEO4J']['USERNAME']
PASSWORD = config['NEO4J']['PASSWORD']

# Client used by every later cell to talk to the database.
# NOTE(review): aura_ds=True presumably targets an AuraDS instance - confirm
# against the graphdatascience client documentation for your deployment.
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=True)

### Clean Up from Last Time

In [3]:
# Clear the segmentEmbeddings property written to News nodes by an earlier run
remove_embeddings_query = 'MATCH (n:News) REMOVE n.segmentEmbeddings'
_ = gds.run_cypher(remove_embeddings_query)

In [4]:
# Drop every projected graph currently reported by gds.graph.list(), so the
# rest of the notebook starts without leftover projections.
graph_catalog = gds.graph.list()
g_names = graph_catalog['graphName'].tolist()

for g_name in g_names:
    # drop() is given the graph object, so resolve the name to one first
    g = gds.graph.get(g_name)
    gds.graph.drop(g)

### Stage Graph

In [5]:
# Create user viewed categories and sub-categories. This will be useful for
# deciding how to label the K-Means clusters.

# For each user with at least one CLICKED relationship, collect the distinct
# categories of all News they CLICKED or HISTORICALLY_CLICKED and store them
# on the user as viewCategories (batched in transactions of 10000 rows).
view_categories_query = '''
MATCH  (u:User)-[:CLICKED]->(:News)
WITH DISTINCT u
CALL {
    WITH u
    MATCH(u)-[:CLICKED|HISTORICALLY_CLICKED]->(n:News)
    WITH u, collect(DISTINCT n.category) AS viewCategories
    SET u.viewCategories = viewCategories
} IN TRANSACTIONS OF 10000 rows
'''

# Same traversal, but collects distinct '<category>-<subcategory>' strings and
# stores them on the user as viewSubcategories.
view_subcategories_query = '''
MATCH  (u:User)-[:CLICKED]->(:News)
WITH DISTINCT u
CALL {
    WITH u
    MATCH(u)-[:CLICKED|HISTORICALLY_CLICKED]->(n:News)
    WITH  u, n.category + '-' + n.subcategory AS viewSubcategory
    WITH u, collect(DISTINCT viewSubcategory) AS viewSubcategories
    SET u.viewSubcategories = viewSubcategories
} IN TRANSACTIONS OF 10000 rows
'''

# Run the two staging queries in order: categories first, then sub-categories.
for staging_query in (view_categories_query, view_subcategories_query):
    _ = gds.run_cypher(staging_query)