In [None]:
import os
import json
import numpy as np
import pandas as pd

from neo4j import GraphDatabase

In [None]:
# Connection settings for the Neo4j instance used by every cell below.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the defaults are the
# values this notebook originally shipped with.
DB_URL = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
DB_ULR = DB_URL  # historical (misspelled) name, kept because later cells use it
DB_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "<MyVerySecretPassword>")
DB_NAME = os.environ.get("NEO4J_DATABASE", "android")
DB_ENCRYPTED = False


In [None]:
# Single driver instance shared by all sessions opened in the cells below.
credentials = (DB_USER, DB_PASS)
driver = GraphDatabase.driver(
    DB_ULR,
    auth=credentials,
    encrypted=DB_ENCRYPTED,
)

In [None]:
# Schema: node-key constraints for App.id and Package.name, plus an index on
# App.wcc. Every statement uses IF NOT EXISTS and runs in its own write
# transaction, in the order listed.
schema_statements = (
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:App) REQUIRE (n.id) IS NODE KEY",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Package) REQUIRE (n.name) IS NODE KEY",
    "CREATE INDEX IF NOT EXISTS FOR (p:App) ON (p.wcc)",
)

with driver.session(database=DB_NAME) as session:
    for statement in schema_statements:
        # The lambda is invoked immediately by execute_write, so it sees the
        # statement of the current iteration.
        result = session.execute_write(lambda tx: tx.run(statement).consume())

In [None]:
# Location of the Android permission dataset. Override with the
# ANDROID_PERMISSION_CSV environment variable to run the notebook on another
# machine; the default is the path the notebook was originally written against.
csv_path = os.environ.get(
    "ANDROID_PERMISSION_CSV",
    "/Users/haklof/import/Android_Permission.csv",
)

# Rows with any missing value are dropped up front; the original row labels
# are kept (no index reset) and later serve as the app ids.
df = pd.read_csv(csv_path).dropna()
df.head()

In [None]:
# How many rows share each app name.
df["App"].value_counts()

In [None]:
def _parse_related_packages(raw):
    """Split the raw 'Related apps' value into a list of package names.

    The first and last characters are dropped (presumably enclosing
    brackets/braces -- verify against the CSV), the remainder is split on
    commas and every entry is stripped of surrounding whitespace. Empty
    entries are discarded so that an empty list does not produce a package
    whose name is the empty string.
    """
    candidates = (part.strip() for part in raw[1:-1].split(','))
    return [name for name in candidates if name]


# Records handed to the Cypher import cells below:
#   apps         -> one property dict per CSV row
#   packages     -> every distinct package name (installed or related)
#   app_installs -> (appId, package) pairs for the INSTALLS relationships
#   app_links    -> (appId, relatedPackages) for the LINKS relationships
apps = []
packages = set()
app_installs = []
app_links = []
for idx, row in df.iterrows():
    package = row['Package']
    related_packages = _parse_related_packages(row['Related apps'])

    apps.append({
        'appId': idx,
        'name': row['App'],
        'description': row['Description'],
        'rating': row['Rating'],
        'numberOfRatings': row['Number of ratings'],
        'price': row['Price'],
        'dangerous': row['Dangerous permissions count'],
        'safe': row['Safe permissions count'],
        'rowNum': idx,
        'class': row['Class']
    })

    # `!= None` is always true for pandas' NaN marker, so use pd.notna and
    # apply the same guard to the set of package names.
    if pd.notna(package):
        app_installs.append({'appId': idx, 'package': package})
        packages.add(package)

    app_links.append({'appId': idx, "relatedPackages": related_packages})
    packages.update(related_packages)

# Show one sample record; skipped when the frame had no rows.
if app_links:
    print(app_links[0])

In [None]:
# Create Package nodes
def _merge_package_nodes(tx):
    """Transaction function: MERGE one :Package node per collected package name."""
    cursor = tx.run(
        """
            UNWIND $packages as package
            MERGE (a:Package{name:package})
            """,
        packages=list(packages),
    )
    return cursor.data()


with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_merge_package_nodes)

# The query has no RETURN clause, so no records are expected in the frame.
resultFrame = pd.DataFrame(result)
display(resultFrame)

In [None]:
# Create App nodes
def _merge_app_nodes(tx):
    """Transaction function: MERGE every App by id and (re)set its properties.

    Numeric columns are cast inside Cypher with toFloat/toInteger.
    """
    cursor = tx.run(
        """
            UNWIND $apps as app
            MERGE (a:App{id:app.appId})
            SET a.name = app.name,
                a.description = app.description,
                a.rating = toFloat(app.rating),
                a.numberOfRatings = toInteger(app.numberOfRatings),
                a.price = toFloat(app.price),
                a.dangerous = toInteger(app.dangerous),
                a.safe = toInteger(app.safe),
                a.rowNum = app.rowNum,
                a.class = toInteger(app.class)
            """,
        apps=apps,
    )
    return cursor.data()


with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_merge_app_nodes)

resultFrame = pd.DataFrame(result)
display(resultFrame)

In [None]:
# Create App - INSTALLS -> Package rels
def _merge_installs_rels(tx):
    """Transaction function: link each App to the Package it installs.

    Both endpoints are looked up with MATCH, so pairs whose App or Package
    node does not exist are skipped rather than created.
    """
    cursor = tx.run(
        """
            UNWIND $app_installs as rel
            MATCH (a:App{id: rel.appId}), (p:Package{name: rel.package})
            MERGE (a)-[:INSTALLS]->(p)
            """,
        app_installs=app_installs,
    )
    return cursor.data()


with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_merge_installs_rels)

resultFrame = pd.DataFrame(result)
display(resultFrame)

In [None]:
# Create App - LINKS -> Package rels
def _merge_links_rels(tx):
    """Transaction function: link each App to every Package named in its related list."""
    cursor = tx.run(
        """
            UNWIND $app_links as rel
            MATCH (a:App{id: rel.appId}), (p:Package)
            WHERE p.name in rel.relatedPackages
            MERGE (a)-[:LINKS]->(p)
            """,
        app_links=app_links,
    )
    return cursor.data()


with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_merge_links_rels)

resultFrame = pd.DataFrame(result)
display(resultFrame)

In [None]:
# Add label for Malware
def _label_malware(tx):
    """Transaction function: add the :Malware label to every App whose class is 1."""
    cursor = tx.run(
        """
            MATCH (a:App) where a.class = 1
            set a:Malware
            """
    )
    return cursor.data()


with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_label_malware)

resultFrame = pd.DataFrame(result)
display(resultFrame)

In [None]:
# Calculate dangerousness of app
def _set_dangerousness(tx):
    """Transaction function: store dangerous / (safe + dangerous) on every App.

    The small 0.00001 term in the denominator keeps it non-zero when both
    counts are 0.
    """
    cursor = tx.run(
        """
            MATCH (a:App)
            set a.dangerousness = a.dangerous/(a.safe + a.dangerous + 0.00001)
            """
    )
    return cursor.data()


with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_set_dangerousness)

resultFrame = pd.DataFrame(result)
display(resultFrame)

In [None]:
# Is an app safer if other apps link what it installs?
# NOTE(review): the query counts incoming INSTALLS relationships of the
# installed package (the app's own INSTALLS included), not LINKS -- confirm
# which one the question above is meant to measure.
def _set_number_of_dependants(tx):
    """Transaction function: store the INSTALLS in-degree of the installed package on the App."""
    cursor = tx.run(
        """
            MATCH (a:App)-[:INSTALLS]->(p:Package)
            set a.numberOfDependants = count { (p)<-[:INSTALLS]-() }
            """
    )
    return cursor.data()


with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_set_number_of_dependants)

resultFrame = pd.DataFrame(result)
display(resultFrame)

In [None]:
# Create graph projection
def _drop_stale_projection(tx):
    """Transaction function: drop a projection left over from an earlier run.

    The second argument is `false` (presumably failIfMissing -- confirm in
    the GDS docs) so a missing projection should not raise.
    """
    return tx.run("CALL gds.graph.drop('app-projection', false)").consume()


def _project_app_graph(tx):
    """Transaction function: project App nodes and App-to-App relationships.

    Nodes carry the App properties (several defaulted with coalesce). Two
    apps are connected when they LINK to or INSTALL the same Package; the
    relationship property `risk` is the mean of both apps' dangerousness.

    NOTE(review): the relationship query uses `<-[:LINKS|INSTALLS]->` with
    arrowheads on both ends; a plain incoming `<-[...]-` was probably
    intended -- confirm before changing the query.
    """
    cursor = tx.run(
        """
            CALL gds.graph.project.cypher('app-projection',
            'MATCH (a:App) return id(a) as id, 
                ["App"] as labels,
                a.class as class, 
                coalesce(a.numberOfRatings, 0) as numberOfRatings, 
                coalesce(a.rating,0.0) as rating, 
                coalesce(a.price,0.0) as price,
                coalesce(a.dangerous,0) as dangerous,
                coalesce(a.safe,0) as safe,
                a.dangerousness as dangerousness,
                a.numberOfDependants as numberOfDependants',
            'MATCH (a1:App)-[:LINKS|INSTALLS]->(p:Package)<-[:LINKS|INSTALLS]->(a2:App) 
            RETURN id(a1) as source, id(a2) as target, (a1.dangerousness+a2.dangerousness)/2 as risk
            ')
            """
    )
    return cursor.data()


# Drop and project run in two separate sessions, as two separate transactions.
with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_drop_stale_projection)

with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_project_app_graph)

resultFrame = pd.DataFrame(result)
display(resultFrame)

In [None]:
# Pagerank
# Runs PageRank in mutate mode: the score is added to the 'app-projection'
# graph as the `pageRank` property; a later cell writes it back to the
# database. Uses `execute_write` like the rest of the notebook instead of the
# deprecated `write_transaction`.
with driver.session(database = DB_NAME) as session:
    result = session.execute_write( lambda tx: 
        tx.run(
            """
            call gds.pageRank.mutate('app-projection',{
                    maxIterations: 100,
                    mutateProperty: 'pageRank'
            })    
            """
        ).data()
    )
    # One row of procedure statistics is expected; show it as a table.
    resultFrame = pd.DataFrame(result)
    display(resultFrame)

In [None]:
# WCC
# Runs weakly connected components in mutate mode: the component id is added
# to the 'app-projection' graph as the `wcc` property; a later cell writes it
# back to the database. Uses `execute_write` like the rest of the notebook
# instead of the deprecated `write_transaction`.
with driver.session(database = DB_NAME) as session:
    result = session.execute_write( lambda tx: 
        tx.run(
            """
            call gds.wcc.mutate('app-projection',{
                    mutateProperty: 'wcc'
            })    
            """
        ).data()
    )
    # One row of procedure statistics is expected; show it as a table.
    resultFrame = pd.DataFrame(result)
    display(resultFrame)

In [None]:
# Create embedding
# Runs FastRP in mutate mode: an 8-dimensional embedding, weighted by the
# `risk` relationship property, is added to the 'app-projection' graph as the
# `embedding` property. Uses `execute_write` like the rest of the notebook
# instead of the deprecated `write_transaction`.
# NOTE(review): no randomSeed is passed; consider adding one if the
# embeddings need to be reproducible between runs.
with driver.session(database = DB_NAME) as session:
    result = session.execute_write( lambda tx: 
        tx.run(
            """
            call gds.fastRP.mutate('app-projection',{
                embeddingDimension: 8, 
                relationshipWeightProperty: 'risk',
                iterationWeights: [0.0, 1.0, 0.8, 0.7],
                mutateProperty: 'embedding'
                })
                
            """
        ).data()
    )
    # One row of procedure statistics is expected; show it as a table.
    resultFrame = pd.DataFrame(result)
    display(resultFrame)

In [None]:
## TODO: Upgrade to use the ML pipeline (this cell is currently commented out)
# with driver.session(database = DB_NAME) as session:
#     try: 
#         result = session.execute_write( lambda tx: 
#             tx.run("call gds.beta.model.drop('risk-model')").consume()
#         )
#     except:
#         print("Failed to remove model")
# with driver.session(database = DB_NAME) as session:
#     result = session.execute_write( lambda tx: 
#         # Todo: 'numberOfRatings' has NaN value?
#         tx.run(
#             """
#             call gds.ml.nodeClassification.train('app-projection',{
#                 nodeLabels: ['App'],
#                 modelName: 'risk-model',
#                 featureProperties: ['embedding', 'rating', 'price', 'dangerous', 'safe', 'dangerousness', 'numberOfDependants', 'pageRank', 'wcc'], 
#                 targetProperty: 'class', 
#                 metrics: ['F1_WEIGHTED', 'ACCURACY'], 
#                 holdoutFraction: 0.7, 
#                 validationFolds: 5, 
#                 randomSeed: 3,
#                 params: [
#                     {penalty: 0, minEpochs: 10, maxEpochs: 10000, tolerance: 0.00001, patience: 5},
#                     {penalty: 0.5, minEpochs: 10, maxEpochs: 10000, tolerance: 0.00001, patience: 5},
#                     {penalty: 1.0, minEpochs: 10, maxEpochs: 10000, tolerance: 0.00001, patience: 5}
#                     ]
#             }) yield modelInfo
#             return modelInfo
#             """
#         ).data()
#     )
#     print(json.dumps(result, indent=2))

In [None]:
# Store features in graph
def _write_projection_properties(tx):
    """Transaction function: write `pageRank` and `wcc` from the projection back to the database."""
    cursor = tx.run(
        """
            call gds.graph.nodeProperties.write('app-projection',['pageRank','wcc'])
            """
    )
    return cursor.data()


with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_write_projection_properties)

resultFrame = pd.DataFrame(result)
display(resultFrame)

In [None]:
# Drop projection
def _release_app_projection(tx):
    """Transaction function: drop the 'app-projection' graph now that its results are stored."""
    return tx.run("CALL gds.graph.drop('app-projection', false)").consume()


with driver.session(database=DB_NAME) as session:
    result = session.execute_write(_release_app_projection)