# Graph Data Science Meets Python

In [1]:
import configparser
import os

import pandas as pd
# Notebook-wide pandas display settings: cap the number of rendered rows,
# allow long text cells (article titles, embeddings), and set display.width.
for option_name, option_value in {
    'display.max_rows': 12,
    'display.max_colwidth': 500,
    'display.width': 0,
}.items():
    pd.set_option(option_name, option_value)

# Recommendation System - News Content
## Using KNN to Improve Recommendation Efficiency

In [2]:
# Location of the INI file holding the Neo4j connection settings. Set the
# NEO4J_CONFIG_PATH environment variable to point at your own file; the
# original local path is kept only as a backward-compatible default.
NEO4J_CONFIG_PATH = os.environ.get(
    'NEO4J_CONFIG_PATH', '/Users/zachblumenfeld/devtools/aura-mind-2.ini')

config = configparser.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files it
# actually parsed, so an empty result means the config file was not found.
if not config.read(NEO4J_CONFIG_PATH):
    raise FileNotFoundError(
        f"Neo4j config file not found: {NEO4J_CONFIG_PATH!r}. "
        "Set NEO4J_CONFIG_PATH to an INI file with a [NEO4J] section "
        "containing HOST, USERNAME and PASSWORD.")

# Connection settings consumed by the GraphDataScience client below.
HOST = config['NEO4J']['HOST']
USERNAME = config['NEO4J']['USERNAME']
PASSWORD = config['NEO4J']['PASSWORD']

## Connect to Graph Data Science (GDS)

In [3]:
from graphdatascience import GraphDataScience

# Create the GDS client from the connection settings loaded above; substitute
# the URI and credentials that match your own Neo4j setup.
gds = GraphDataScience(
    HOST,
    auth=(USERNAME, PASSWORD),
    aura_ds=True,
)

## Summary Statistics

In [4]:
# Node totals per label, taken from apoc.meta.stats() and restricted to the
# User and News labels.
node_count_query = '''
    CALL apoc.meta.stats()
    YIELD labels AS nodeCounts
    UNWIND keys(nodeCounts) AS label
    WITH label, nodeCounts[label] AS nodeCount
    WHERE label IN ['User','News']
    RETURN label, nodeCount
'''
gds.run_cypher(node_count_query)

Unnamed: 0,label,nodeCount
0,User,750434
1,News,104151


In [5]:
# Relationship totals per type, taken from apoc.meta.stats() and restricted to
# the CLICKED and HISTORICALLY_CLICKED types.
relationship_count_query = '''
    CALL apoc.meta.stats()
    YIELD relTypesCount as relationshipCounts
    UNWIND keys(relationshipCounts) AS type
    WITH type, relationshipCounts[type] AS relationshipCount
    WHERE type IN ['CLICKED','HISTORICALLY_CLICKED']
    RETURN type, relationshipCount
'''
gds.run_cypher(relationship_count_query)

Unnamed: 0,type,relationshipCount
0,CLICKED,3958501
1,HISTORICALLY_CLICKED,13580648


## News Recommendations Without GDS

- Returns many potential recommendations - can lack focus and be difficult to prioritize
- Large queries - can be slow for production use cases
- Result set grows with the size of the data - can be difficult to scale

In [6]:
# Example user whose clicks drive every recommendation query below
USER_ID = 'U218584'

In [7]:
%%time
gds.run_cypher('''
    MATCH (u1:User {userId: $userId})
           -[r1:CLICKED]->(n0:RecentNews)
           <-[r2:CLICKED]-(u2:User)
           -[r3:CLICKED]->(n:RecentNews)
    RETURN u1.userId AS userId,
           count(DISTINCT n0) AS NewsArticlesClicked,
           count(DISTINCT u2) AS UsersAccountsTraversed,
           count(DISTINCT n) AS RecommendationCount
    ''', params={'userId': USER_ID})

CPU times: user 2.84 ms, sys: 1.33 ms, total: 4.17 ms
Wall time: 1.53 s


Unnamed: 0,userId,NewsArticlesClicked,UsersAccountsTraversed,RecommendationCount
0,U218584,6,63019,10998


In [8]:
%%time
gds.run_cypher('''
    MATCH (u1:User {userId: $userId})
           -[r1:CLICKED]->(n0:RecentNews)
           <-[r2:CLICKED]-(u2:User)
           -[r3:CLICKED]->(n:RecentNews)
    WITH DISTINCT n
    RETURN n.newsId as newsId,
        n.title AS title,
        n.category AS category,
        n.subcategory As subcategory
    ''', params={'userId': USER_ID})

CPU times: user 581 ms, sys: 21.5 ms, total: 602 ms
Wall time: 4.83 s


Unnamed: 0,newsId,title,category,subcategory
0,N29160,Opinion: Colin Kaepernick is about to get what he deserves: a chance,sports,football_nfl
1,N26703,"Some believe Mason Rudolph, hit in head with his own helmet, isn't getting enough blame",sports,football_nfl
2,N40742,THEN AND NOW: What all your favorite '90s stars are doing today,entertainment,celebrity
3,N112662,Arian Foster calls out NFL fans for being hypocrites over Myles Garrett attack,sports,football_nfl
4,N94572,'It's not over': Sarah Palin says she is fighting to repair her marriage,news,newsus
...,...,...,...,...
10993,N79709,Why is citrus one of the 5 Cs of Arizona? What led to its decline? Valley 101 digs in,tv,tvnews
10994,N21436,"Phoenix restaurant Pa'La: Refined, affordable tapas, bread, seafood in Phoenix",foodanddrink,newstrends
10995,N102470,Battered Largo fights way into second round,sports,football_ncaa
10996,N29438,The TV network Freeform is filming LGBTQ+ rom-com in Chicago,movies,movienews


## Scale Similarity Inferences with GDS Node Embeddings and K-Nearest Neighbor (KNN) Similarity
Memory-based recommenders are notoriously difficult to scale with traditional data science methods.
With GDS, however, we can scale memory-based recommenders to big data using robust embeddings and KNN similarity algorithms to automatically identify highly significant items.

In [9]:
# Step 1: project User and News nodes with both click relationship types as
# undirected relationships. Each carries a 'weight' read from the 'confidence'
# property, defaulting to 1.0 for CLICKED and 0.1 for HISTORICALLY_CLICKED.
g0, _ = gds.graph.project(
    'embedding-projection',
    ['User', 'News'],
    {
        'CLICKED': {
            'orientation': 'UNDIRECTED',
            'properties': {
                'weight': {'property': 'confidence', 'defaultValue': 1.0},
            },
        },
        'HISTORICALLY_CLICKED': {
            'orientation': 'UNDIRECTED',
            'properties': {
                'weight': {'property': 'confidence', 'defaultValue': 0.1},
            },
        },
    },
)

# Step 2: compute weighted 256-dimensional FastRP embeddings (fixed seed) into
# the 'embedding' property of the projection, then write that property for the
# News label.
gds.fastRP.mutate(
    g0,
    mutateProperty='embedding',
    embeddingDimension=256,
    randomSeed=7474,
    relationshipWeightProperty='weight',
)
gds.graph.writeNodeProperties(g0, ["embedding"], ["News"])

# Step 3: project RecentNews nodes together with their 'embedding' property.
g1, _ = gds.graph.project(
    'cf-projection',
    {'RecentNews': {'properties': ['embedding']}},
    '*',
)

# Step 4: run KNN on the embeddings and write the result as USERS_ALSO_LIKED
# relationships carrying a 'score' property.
knn_stats = gds.knn.write(
    g1,
    nodeProperties=['embedding'],
    writeRelationshipType='USERS_ALSO_LIKED',
    writeProperty='score',
    sampleRate=1.0,
    maxIterations=1000,
)

If you are curious, this is what the embeddings look like: they are just vectors of floating-point numbers. In this case they are 256 numbers long, as specified by the embeddingDimension parameter above.

In [10]:
# Peek at the newsId and embedding of three RecentNews nodes
embedding_sample_query = 'MATCH(n:RecentNews) RETURN n.newsId, n.embedding LIMIT 3'
gds.run_cypher(embedding_sample_query)

Unnamed: 0,n.newsId,n.embedding
0,N88753,"[0.10630623251199722, 0.1723296195268631, -0.1171380877494812, -0.027835553511977196, -0.06641528755426407, 0.05011148005723953, 0.1398780792951584, -0.20932942628860474, 0.03788360580801964, 0.11861199140548706, -0.054010067135095596, -0.15462571382522583, 0.016326608136296272, 0.1190885603427887, -0.03897024691104889, 0.05449594557285309, -0.0648602545261383, -0.06124555319547653, 0.09067648649215698, 0.02814493328332901, -0.14549581706523895, 0.2743493318557739, 0.17065733671188354, 0.000..."
1,N99744,"[-0.08940580487251282, -0.0877138078212738, 0.011865537613630295, -0.10249189287424088, -0.16452106833457947, 0.06788899749517441, 0.07351496070623398, -0.05297347903251648, 0.06831544637680054, -0.059567954391241074, 0.01696055755019188, -0.023166803643107414, 0.01591249741613865, 0.04748855531215668, 0.06959734112024307, 0.18919958174228668, -0.018619976937770844, -0.1406448930501938, 0.06486805528402328, 0.029595350846648216, 0.07380983233451843, 0.15983542799949646, 0.054380059242248535,..."
2,N17957,"[0.13875526189804077, -0.23850543797016144, 0.0017148107290267944, -0.07496193051338196, -0.18084552884101868, -0.014842547476291656, 0.08821369707584381, -0.13480809330940247, 0.09693685173988342, 0.058435939252376556, -0.09078291058540344, -0.1293138563632965, 0.0802997499704361, 0.047641463577747345, 0.025296349078416824, 0.1664532721042633, 0.11257122457027435, -0.06640288233757019, -0.026818357408046722, 0.1013699546456337, -0.079646036028862, 0.13696950674057007, 0.06526794284582138, -..."


## News Recommendations Post GDS KNN Application

- Returns refined set of recommendations - personalized and relevant to user
- Smaller queries - Fast for enterprise use cases
- Results size remains constant & focused as data size increases - built for scale

In [11]:
%%time
gds.run_cypher( '''
    MATCH(u:User {userId: $userId})
        -[:CLICKED]->(n0:RecentNews)
        -[s:USERS_ALSO_LIKED]->(n:News)
    RETURN u.userId AS userId,
           count(DISTINCT n0) AS NewsArticlesClicked,
           0 AS UsersAccountsTraversed,
           count(DISTINCT n) AS RefinedRecommendationCount
    ''', params={'userId': USER_ID})

CPU times: user 3.83 ms, sys: 1.24 ms, total: 5.07 ms
Wall time: 339 ms


Unnamed: 0,userId,NewsArticlesClicked,UsersAccountsTraversed,RefinedRecommendationCount
0,U218584,6,0,59


In [12]:
%%time
gds.run_cypher( '''
    MATCH(u:User {userId: $userId})
        -[:CLICKED]->(n0:RecentNews)
        -[s:USERS_ALSO_LIKED]->(n:News)
    WITH DISTINCT n, sum(s.score) AS totalScore
    RETURN n.newsId as newsId,
        n.title AS title,
        n.category AS category,
        n.subcategory As subcategory,
        totalScore ORDER BY totalScore DESC
    ''', params={'userId': USER_ID})

CPU times: user 6.28 ms, sys: 1.44 ms, total: 7.72 ms
Wall time: 384 ms


Unnamed: 0,newsId,title,category,subcategory,totalScore
0,N35012,"Here Are the 2020 MotorTrend Car, Truck, and SUV of the Year Contenders and Finalists",autos,autosresearch,1.501173
1,N81058,New Mexico game vs. Air Force rescheduled after lineman Nahje Flowers' death,sports,football_ncaa,0.889312
2,N26968,'Beautiful boys': Victims in Mexico ambush remembered at funerals,news,newsworld,0.873203
3,N107322,Reality television star Kevin O'Leary and his wife were sued Wednesday for wrongful deaths in a boat crash in Canada's backwoods.,tv,tv-celebrity,0.872507
4,N126027,Woman accused of embezzling from Camp Fire victim surrenders,news,newscrime,0.869965
...,...,...,...,...,...
54,N78315,13 Ways to Prep Your Yard Before Snow Season,lifestyle,lifestylehomeandgarden,0.735275
55,N35689,60 Clever Uses for Salt That Don't Involve Cooking,lifestyle,lifestylehomeandgarden,0.731840
56,N48926,"For the first time ever, a US cheese is named best in the world",foodanddrink,foodnews,0.731065
57,N53785,Papa John's introduces first new pizza crust flavor in 35 years,foodanddrink,foodnews,0.730864


## Recommendations Based on Latest Viewed Content

The above recommendations traverse all user clicks. However, depending on use case, you may need to base recommendations on just the latest items the user interacted with. With the embeddings + KNN approach, you can use Cypher to customize the range of user activity.

Below is an example of using just the last clicked item for recommendation. This produces more focused results.

In [13]:
# Most recent click: find the user's maximum impressionTime over CLICKED
# relationships to RecentNews, then return the article(s) clicked at that time.
last_clicked_query = '''
    MATCH (u:User {userId:$userId})-[r:CLICKED]->(:RecentNews) 
    WITH u, max(r.impressionTime) AS maxImpressionTime
    MATCH (u)-[r:CLICKED]->(n:RecentNews) 
    WHERE r.impressionTime = maxImpressionTime
    RETURN n.newsId as newsId, 
        n.title AS title, 
        n.category AS category,
        n.subcategory As subcategory,
        r.impressionTime AS impressionTime
    '''
gds.run_cypher(last_clicked_query, params={'userId': USER_ID})

Unnamed: 0,newsId,title,category,subcategory,impressionTime
0,N110709,2020 Ford Mustang Shelby GT350 vs. GT500: Which Is the Better Sports Car?,autos,autosenthusiasts,2019-11-15T05:48:40.000000000+00:00


In [14]:
# Recommendations seeded only by the most recent click: locate the article(s)
# clicked at the user's latest impressionTime, follow USERS_ALSO_LIKED from
# them, and rank the neighbours by summed score, highest first.
last_clicked_recommendation_query = '''
    MATCH (u:User {userId:$userId})-[r:CLICKED]->(:RecentNews) 
    WITH u, max(r.impressionTime) AS maxImpressionTime
    MATCH (u)-[r:CLICKED]->(n:RecentNews) 
    WHERE r.impressionTime = maxImpressionTime
    WITH n
    MATCH(n)-[s:USERS_ALSO_LIKED]->(similarNews:News)
    RETURN DISTINCT similarNews.newsId as newsId,
        similarNews.title AS title,
        similarNews.abstract AS abstract,
        similarNews.category AS category,
        similarNews.subcategory As subcategory,
        sum(s.score) AS totalScore
        ORDER BY totalScore DESC
    '''
gds.run_cypher(last_clicked_recommendation_query, params={'userId': USER_ID})

Unnamed: 0,newsId,title,abstract,category,subcategory,totalScore
0,N129143,2020 Shelby GT500 Aero Performance & Cooling,Supercomputers and 3-D Printing are secrets to the all-new Mustang Shelby GT500's high performance.,autos,autosenthusiasts,0.835678
1,N65812,Watch! 2020 Ford Mustang Shelby GT500 Run 10.61 in ¼ Mile,"How does a 10.61 at 133 mph quarter-mile sound in the 2020 Shelby GT500? Even at the hands of a novice, the new Shelby GT500 is a solid 10-second car.",autos,autossports,0.828509
2,N59026,2020 Performance Car of the Year,"Eleven worthy competitors, one surprising winner.",autos,autosresearch,0.824935
3,N93439,2020 Ford Mustang Shelby GT500 Review: This Changes Everything,"42.875% better than the already brilliant GT350, it's a world-class supercar-killer",autos,autossports,0.824524
4,N119103,Ford Reveals Acceleration Times for 760-HP Mustang Shelby GT500,The 2020 Shelby GT500 is the quickest production Mustang ever.,autos,autossports,0.823414
5,N33779,The McLaren Senna is Faster 0-100-0 Than These Fast Cars Are 0-100,Putting the mighty McLaren's stupendous performance into context,autos,autosenthusiasts,0.816637
6,N12146,Is Anyone Actually Buying The New Toyota Supra?,"In case you may not be aware, Toyota brought back the Supra.",autos,autosenthusiasts,0.814645
7,N9254,Toyota Supra vs. Ford Mustang Shelby GT350: Kicking Tail!,Toyota's new sports coupe goes Head 2 Head against Ford's updated pony.,autos,autossports,0.807088
8,N90096,This Honda F1 V-10 Is the Perfect Engine for Your Dream Swap,"All yours for the low, low price of $12,200.",autos,autosenthusiasts,0.806398
9,N37923,Porsche's 718 Cayman Fits More Cargo Than the 2020 Chevrolet Corvette,"The new mid-engined Corvette holds one less carry-on suitcase, but it has space for two sets of golf clubs.",autos,autossports,0.80619


## Cleanup

In [15]:
# Drop both graph projections created earlier: g0 is 'embedding-projection'
# and g1 is 'cf-projection'.
g0.drop()
g1.drop()

In [16]:
# Delete the USERS_ALSO_LIKED relationships that start at RecentNews nodes
# (the type written by the KNN step); assigning to _ hides the cell output.
delete_knn_relationships_query = 'MATCH (:RecentNews)-[r:USERS_ALSO_LIKED]->() DELETE r'
_ = gds.run_cypher(delete_knn_relationships_query)