# Graph Data Science Meets Python

In [1]:
import configparser
import os

import pandas as pd
pd.set_option('display.max_rows', 12)

# Recommendation System - News Content
## Using KNN to Improve Recommendation Efficiency

In [2]:
# Location of the INI file holding the Neo4j connection settings. Set the
# NEO4J_CONFIG environment variable to point at your own file; the default is
# the path this notebook was originally authored with.
CONFIG_PATH = os.environ.get('NEO4J_CONFIG', '/Users/zachblumenfeld/devtools/aura-mind-2.ini')

config = configparser.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files it
# actually parsed, so an empty result means nothing was loaded. Fail here with
# a clear message instead of a KeyError on the 'NEO4J' lookup below.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f'Neo4j config file not found or unreadable: {CONFIG_PATH}')

# Connection settings come from the [NEO4J] section of the INI file.
HOST = config['NEO4J']['HOST']
USERNAME = config['NEO4J']['USERNAME']
PASSWORD = config['NEO4J']['PASSWORD']

## Connect to Graph Data Science (GDS)

In [3]:
from graphdatascience import GraphDataScience

# Create the GDS client from the HOST / USERNAME / PASSWORD values loaded from
# the config file; adjust those settings to match your own Neo4j deployment.
gds = GraphDataScience(
    HOST,
    auth=(USERNAME, PASSWORD),
    aura_ds=True,
)

## News Recommendations Without GDS

- Returns many potential recommendations - can lack focus and be difficult to prioritize
- Large queries - can be slow for production use cases
- Result set grows with the size of the data - can be difficult to scale

In [4]:
# userId of the example user; passed as the `$userId` parameter to the Cypher
# queries in this notebook.
USER_ID = "U218584"

In [5]:
%%time
# Baseline without GDS: starting from the user, follow CLICKED to the RecentNews
# articles they clicked, back along CLICKED to the users who clicked those
# articles, and forward along CLICKED to the RecentNews articles those users
# clicked. Returns one row with the distinct counts at each hop.
# (Keep comments below the %%time line - a cell magic must be the first line.)
gds.run_cypher('''
    MATCH (u1:User {userId: $userId})
           -[r1:CLICKED]->(n0:RecentNews)
           <-[r2:CLICKED]-(u2:User)
           -[r3:CLICKED]->(n:RecentNews)
    RETURN u1.userId AS userId,
           count(DISTINCT n0) AS NewsArticlesClicked,
           count(DISTINCT u2) AS UsersAccountsTraversed,
           count(DISTINCT n) AS RecommendationCount
    ''', params={'userId': USER_ID})

CPU times: user 24.6 ms, sys: 4.88 ms, total: 29.5 ms
Wall time: 2.38 s


Unnamed: 0,userId,NewsArticlesClicked,UsersAccountsTraversed,RecommendationCount
0,U218584,6,63019,10998


In [6]:
%%time
# Same three-hop CLICKED traversal as the counting query, but returning the
# distinct RecentNews articles reached at the end of the path (id, title,
# category, subcategory) rather than aggregate counts. No ranking is applied.
# (Keep comments below the %%time line - a cell magic must be the first line.)
gds.run_cypher('''
    MATCH (u1:User {userId: $userId})
           -[r1:CLICKED]->(n0:RecentNews)
           <-[r2:CLICKED]-(u2:User)
           -[r3:CLICKED]->(n:RecentNews)
    WITH DISTINCT n
    RETURN n.newsId as newsId,
        n.title AS title,
        n.category AS category,
        n.subcategory As subcategory
    ''', params={'userId': USER_ID})

CPU times: user 420 ms, sys: 16.2 ms, total: 437 ms
Wall time: 2.41 s


Unnamed: 0,newsId,title,category,subcategory
0,N29160,Opinion: Colin Kaepernick is about to get what...,sports,football_nfl
1,N26703,"Some believe Mason Rudolph, hit in head with h...",sports,football_nfl
2,N40742,THEN AND NOW: What all your favorite '90s star...,entertainment,celebrity
3,N112662,Arian Foster calls out NFL fans for being hypo...,sports,football_nfl
4,N94572,'It's not over': Sarah Palin says she is fight...,news,newsus
...,...,...,...,...
10993,N79709,Why is citrus one of the 5 Cs of Arizona? What...,tv,tvnews
10994,N21436,"Phoenix restaurant Pa'La: Refined, affordable ...",foodanddrink,newstrends
10995,N102470,Battered Largo fights way into second round,sports,football_ncaa
10996,N29438,The TV network Freeform is filming LGBTQ+ rom-...,movies,movienews


## Scale Similarity Inferences with GDS Node Embeddings and K-Nearest Neighbor (KNN) Similarity
Memory-based recommenders are notoriously difficult to scale with traditional data science methods.
With GDS, however, we can scale memory-based recommenders to big data using robust embeddings and KNN similarity algorithms to automatically identify highly significant items.

In [7]:
# Step 1 - project User and News nodes together with both click relationship
# types. Each type is projected as UNDIRECTED and carries a `weight` taken from
# the `confidence` property, with a per-type default when it is absent.
g0, _ = gds.graph.project(
    'embedding-projection',
    ['User', 'News'],
    {
        'CLICKED': {
            'orientation': 'UNDIRECTED',
            'properties': {
                'weight': {'property': 'confidence', 'defaultValue': 1.0},
            },
        },
        'HISTORICALLY_CLICKED': {
            'orientation': 'UNDIRECTED',
            'properties': {
                'weight': {'property': 'confidence', 'defaultValue': 0.1},
            },
        },
    },
)

# Step 2 - compute weighted FastRP embeddings inside the projection (fixed seed),
# then write the `embedding` property back for the News label.
gds.fastRP.mutate(
    g0,
    mutateProperty='embedding',
    embeddingDimension=256,
    randomSeed=7474,
    relationshipWeightProperty='weight',
)
gds.graph.writeNodeProperties(g0, ['embedding'], ['News'])

# Step 3 - second projection: RecentNews nodes with their `embedding` property
# and all relationship types ('*').
g1, _ = gds.graph.project(
    'cf-projection',
    {'RecentNews': {'properties': ['embedding']}},
    '*',
)

# Step 4 - run KNN over the embeddings and write the results to the database as
# USERS_ALSO_LIKED relationships with a `score` property.
knn_stats = gds.knn.write(
    g1,
    nodeProperties=['embedding'],
    writeRelationshipType='USERS_ALSO_LIKED',
    writeProperty='score',
    sampleRate=1.0,
    maxIterations=1000,
)

## News Recommendations Post GDS KNN Application

- Returns refined set of recommendations - personalized and relevant to user
- Smaller queries - fast for enterprise use cases
- Results size remains constant & focused as data size increases - built for scale

In [8]:
%%time
# Post-KNN equivalent of the baseline counting query: from the user's clicked
# RecentNews articles, follow the USERS_ALSO_LIKED relationships directly to
# News nodes. No other users are traversed, so UsersAccountsTraversed is the
# literal 0 to keep the column layout comparable with the baseline.
# (Keep comments below the %%time line - a cell magic must be the first line.)
gds.run_cypher( '''
    MATCH(u:User {userId: $userId})
        -[:CLICKED]->(n0:RecentNews)
        -[s:USERS_ALSO_LIKED]->(n:News)
    RETURN u.userId AS userId,
           count(DISTINCT n0) AS NewsArticlesClicked,
           0 AS UsersAccountsTraversed,
           count(DISTINCT n) AS RefinedRecommendationCount
    ''', params={'userId': USER_ID})

CPU times: user 2.21 ms, sys: 1.11 ms, total: 3.32 ms
Wall time: 278 ms


Unnamed: 0,userId,NewsArticlesClicked,UsersAccountsTraversed,RefinedRecommendationCount
0,U218584,6,0,60


In [9]:
%%time
# Ranked recommendations: for each News node reached via USERS_ALSO_LIKED from
# the user's clicked RecentNews articles, sum the relationship `score` values
# and return the articles ordered by that total, highest first.
# (Keep comments below the %%time line - a cell magic must be the first line.)
gds.run_cypher( '''
    MATCH(u:User {userId: $userId})
        -[:CLICKED]->(n0:RecentNews)
        -[s:USERS_ALSO_LIKED]->(n:News)
    WITH DISTINCT n, sum(s.score) AS totalScore
    RETURN n.newsId as newsId,
        n.title AS title,
        n.category AS category,
        n.subcategory As subcategory,
        totalScore ORDER BY totalScore DESC
    ''', params={'userId': USER_ID})

CPU times: user 4.97 ms, sys: 1.18 ms, total: 6.15 ms
Wall time: 298 ms


Unnamed: 0,newsId,title,category,subcategory,totalScore
0,N81058,New Mexico game vs. Air Force rescheduled afte...,sports,football_ncaa,0.889312
1,N26968,'Beautiful boys': Victims in Mexico ambush rem...,news,newsworld,0.873203
2,N107322,Reality television star Kevin O'Leary and his ...,tv,tv-celebrity,0.872507
3,N126027,Woman accused of embezzling from Camp Fire vic...,news,newscrime,0.869965
4,N86231,New images capture missing college student on ...,news,newsus,0.867730
...,...,...,...,...,...
55,N41143,10 of the best fast-food burger chains across ...,foodanddrink,restaurantsandnews,0.731043
56,N50013,These chains offer the best value in fast food...,foodanddrink,restaurantsandnews,0.730796
57,N46113,"Watch This Gross 300,000-Mile Chevy Work Truck...",autos,autosenthusiasts,0.729109
58,N83623,Hidden Camera Captures Cat And Baby Having The...,lifestyle,lifestylebuzz,0.729007


In [10]:
# Cleanup: drop the two graph projections created earlier in this notebook
# ('embedding-projection' as g0 and 'cf-projection' as g1).
g0.drop()
g1.drop()

In [11]:
# Cleanup: delete every USERS_ALSO_LIKED relationship that starts at a
# RecentNews node. The result is bound to `_` so the cell displays nothing.
# NOTE(review): the `embedding` node property written earlier is not removed
# by this statement - confirm whether that is intentional.
_ = gds.run_cypher('MATCH (:RecentNews)-[r:USERS_ALSO_LIKED]->() DELETE r')