In [2]:
import numpy as np
import numpy
from numpy.linalg import norm

import pandas as pd

In [3]:
pd.set_option('display.max_colwidth', None)

In [4]:
import neo4j
from neo4j import GraphDatabase
from graphdatascience import GraphDataScience
import os

In [5]:
file_path = 'CAPEC.csv'

In [6]:
df = pd.read_csv(file_path)

In [7]:
print(df.head())

    ID                                                      Name Abstraction  \
0    1  Accessing Functionality Not Properly Constrained by ACLs    Standard   
1   10                 Buffer Overflow via Environment Variables    Detailed   
2  100                                          Overflow Buffers    Standard   
3  101                       Server Side Include (SSI) Injection    Detailed   
4  102                                       Session Sidejacking    Detailed   

  Status  \
0  Draft   
1  Draft   
2  Draft   
3  Draft   
4  Draft   

                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [8]:
host = 'bolt://localhost:7687'
user = 'neo4j'
password = '1234abcd'

In [9]:
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")

NEO4J_AUTH = ( "neo4j",  "1234abcd")

In [10]:
driver = GraphDatabase.driver(host, auth = (user, password))

In [11]:
gds = GraphDataScience(NEO4J_URI, auth = NEO4J_AUTH)

In [12]:
cyberAttackNames = df['Name'].unique().tolist()

In [13]:
print(cyberAttackNames[:5])

['Accessing Functionality Not Properly Constrained by ACLs', 'Buffer Overflow via Environment Variables', 'Overflow Buffers', 'Server Side Include (SSI) Injection', 'Session Sidejacking']


In [14]:
attackData = df.to_dict(orient='records')

In [16]:
gds.run_cypher("""
CALL apoc.periodic.iterate(
  'MATCH (n) RETURN n',
  'DETACH DELETE n',
  {batchSize:100, parallel:false}
)
""")

BufferError: Existing exports of data: object cannot be re-sized

CREATE "CyberAttackPattern" node

In [17]:
gds.run_cypher("""
UNWIND $data AS attackData
CREATE (:CyberAttackPattern {
  name: attackData.Name,
  id: attackData.ID,
  description: attackData.Description
})
""", {'data': attackData})

Create "HAS_INDICATOR" relationships

In [18]:
gds.run_cypher("""
UNWIND $data AS attackData
MERGE (attack:CyberAttackPattern {name: attackData.Name})
WITH attack, COALESCE(attackData.Indicators, 'None') AS indicator
WHERE indicator <> '' AND NOT indicator IS NULL AND NOT indicator =~ '.*NaN.*'
MERGE (i:Indicator {type: indicator})
MERGE (attack)-[:HAS_INDICATOR]->(i)
""", {'data': attackData})

Create "HAS_PREREQUISITE" relationships

In [19]:
gds.run_cypher("""
UNWIND $data AS attackData
MERGE (attack:CyberAttackPattern {name: attackData.Name})
WITH attack, COALESCE(attackData.Prerequisites, 'None') AS prerequisites
WHERE prerequisites <> '' AND NOT prerequisites IS NULL AND NOT prerequisites =~ '.*NaN.*'
MERGE (p:Prerequisite {type: prerequisites})
MERGE (attack)-[:HAS_PREREQUISITE]->(p)
""", {'data': attackData})

Get and store the amount of times an attack pattern is a child by another

In [20]:
child = df[['ID', 'Related Attack Patterns']].copy()
child['ChildOf ID'] = child['Related Attack Patterns'].str.extract(r'ChildOf:CAPEC ID:(\d+)').astype(float)
child.drop(columns=['Related Attack Patterns'], inplace=True)
child['ChildOf ID'].fillna(0, inplace=True)
child['count'] = 0
for index, row in child.iterrows():
    child_id = int(row['ChildOf ID'])
    if child_id != 0:
        child.loc[child['ID'] == child_id, 'count'] += 1.0
child.head()

Unnamed: 0,ID,ChildOf ID,count
0,1,122.0,2
1,10,100.0,0
2,100,123.0,12
3,101,253.0,0
4,102,593.0,0


Create relationships that connect to other Cyber Attack Patterns

In [21]:
 for row in attackData:
    attack_id = row['ID']
    attack_name = row['Name']
    related_patterns = str(row['Related Attack Patterns'])
    
    child_id = None
    follows_id = None
    peer_of_id = None
    can_proceed_id = None
    
    if 'ChildOf' in related_patterns:
        child_id_start = related_patterns.find('ChildOf:CAPEC ID:') + len('ChildOf:CAPEC ID:')
        child_id_end = related_patterns.find(' ', child_id_start)
        child_id_str = related_patterns[child_id_start:child_id_end]
        child_id = int(''.join(filter(str.isdigit, child_id_str)))
        
    if 'CanFollow' in related_patterns:
        follows_id_start = related_patterns.find('CanFollow:CAPEC ID:') + len('CanFollow:CAPEC ID:')
        follows_id_end = related_patterns.find(' ', follows_id_start)
        follows_id_str = related_patterns[follows_id_start:follows_id_end]
        follows_id = int(''.join(filter(str.isdigit, follows_id_str)))
    
    if 'PeerOf' in related_patterns:
        peer_of_id_start = related_patterns.find('PeerOf:CAPEC ID:') + len('PeerOf:CAPEC ID:')
        peer_of_id_end = related_patterns.find(' ', peer_of_id_start)
        peer_of_id_str = related_patterns[peer_of_id_start:peer_of_id_end]
        peer_of_id = int(''.join(filter(str.isdigit, peer_of_id_str)))
        
    if 'CanPrecede' in related_patterns:
        can_proceed_id_start = related_patterns.find('CanPrecede:CAPEC ID:') + len('CanPrecede:CAPEC ID:')
        can_proceed_id_end = related_patterns.find(' ', can_proceed_id_start)
        can_proceed_id_str = related_patterns[can_proceed_id_start:can_proceed_id_end]
        can_proceed_id = int(''.join(filter(str.isdigit, can_proceed_id_str)))
        
    if child_id is not None:
        gds.run_cypher("""
        UNWIND $data AS row
        MATCH (a:CyberAttackPattern {id: $attack_id}), (b:CyberAttackPattern {id: $child_id})
        CREATE (a)-[c:HAS_CHILD]->(b)
        SET c.count = row.count
        """, {"attack_id": attack_id, "child_id": child_id, "data": child.to_dict('records')})

    if follows_id is not None:
        gds.run_cypher("""
        MATCH (a:CyberAttackPattern {id: $attack_id}), (b:CyberAttackPattern {id: $follows_id})
        CREATE (a)-[:FOLLOWS]->(b)
        """, {"attack_id": attack_id, "follows_id": follows_id})
        
    if peer_of_id is not None:
        gds.run_cypher("""
        MATCH (a:CyberAttackPattern {id: $attack_id}), (b:CyberAttackPattern {id: $peer_of_id})
        CREATE (a)-[:PEER_OF]->(b)
        """, {"attack_id": attack_id, "peer_of_id": peer_of_id})
           
    if can_proceed_id is not None:
        gds.run_cypher("""
        MATCH (a:CyberAttackPattern {id: $attack_id}), (b:CyberAttackPattern {id: $can_proceed_id})
        CREATE (a)-[:CAN_PROCEED]->(b)
        """, {"attack_id": attack_id, "can_proceed_id": can_proceed_id})

Create "REQUIRES_SKILL" relationships

In [22]:
for row in attackData:
    attack_id = row['ID']
    skills_required = str(row['Skills Required'])
    
    if skills_required:
        gds.run_cypher(f"""
        MERGE (a:CyberAttackPattern {{id: $attack_id}})
        MERGE (s:Skill {{name: $skill_name}})
        MERGE (a)-[:REQUIRES_SKILL]->(s)
        """, {"attack_id": attack_id, "skill_name": skills_required})

Create "CAUSES" relationships

In [23]:
for row in attackData:
    attack_id = row['ID']
    technical_impact_text = str(row['Consequences']).split('TECHNICAL IMPACT:', 1)[-1].strip()

    if technical_impact_text:
        gds.run_cypher(f"""
        MERGE (a:CyberAttackPattern {{id: $attack_id}})
        MERGE (c:Consequences {{type: $technical_impact_text}})
        MERGE (a)-[:CAUSES]->(c)
        """, {"attack_id": attack_id, "technical_impact_text": technical_impact_text})

In [24]:
def drop_graph(name):
    
    with driver.session() as session:
        
        drop_graph_query = """
        CALL gds.graph.drop('{}');
        """.format(name)
        
        session.run(drop_graph_query)

In [25]:
drop_graph('cyberAttackPatterns')

Create a graph projection

In [33]:
node_projection = ["CyberAttackPattern"]

In [34]:
relationship_projection = {"HAS_CHILD": {"properties": "count"}}

In [35]:
G, result = gds.graph.project("cyberAttackPatterns", node_projection, relationship_projection)

In [36]:
fastRP_stream_query = '''

CALL gds.fastRP.stream('cyberAttackPatterns',
  {
    embeddingDimension: 6,
    randomSeed: 42
  }
)
YIELD nodeId, embedding
RETURN nodeId, embedding
'''

In [37]:
gds.run_cypher(fastRP_stream_query)

Unnamed: 0,nodeId,embedding
0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
1,1,"[-0.5773502588272095, 0.0, 0.0, 0.5773502588272095, -0.5773502588272095, 0.0]"
2,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,3,"[0.0, 0.0, 0.4082487225532532, 0.4082487225532532, 0.8164962530136108, 0.0]"
4,4,"[0.0, -0.7071067690849304, 0.0, 0.7071067690849304, 0.0, 0.0]"
...,...,...
613,2330,"[-0.7071067690849304, 0.0, 0.0, 0.0, -0.7071067690849304, 0.0]"
614,2331,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
615,2332,"[0.0, 0.0, 0.4082487225532532, 0.4082487225532532, 0.8164962530136108, 0.0]"
616,2333,"[0.0, 0.3779664933681488, -0.3779664933681488, 0.7559259533882141, 0.0, -0.3779664933681488]"


In [38]:
pandasDF = driver.execute_query(
    fastRP_stream_query,
    database_= "neo4j",
    result_transformer_=  neo4j.Result.to_df
)

In [39]:
print(pandasDF.head())

   nodeId  \
0       0   
1       1   
2       2   
3       3   
4       4   

                                                                       embedding  
0                                                 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]  
1  [-0.5773502588272095, 0.0, 0.0, 0.5773502588272095, -0.5773502588272095, 0.0]  
2                                                 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]  
3    [0.0, 0.0, 0.4082487225532532, 0.4082487225532532, 0.8164962530136108, 0.0]  
4                  [0.0, -0.7071067690849304, 0.0, 0.7071067690849304, 0.0, 0.0]  


In [40]:
embedding = pandasDF.get("embedding")
print(embedding)

0                                                                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
1                     [-0.5773502588272095, 0.0, 0.0, 0.5773502588272095, -0.5773502588272095, 0.0]
2                                                                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
3                       [0.0, 0.0, 0.4082487225532532, 0.4082487225532532, 0.8164962530136108, 0.0]
4                                     [0.0, -0.7071067690849304, 0.0, 0.7071067690849304, 0.0, 0.0]
                                                   ...                                             
613                                  [-0.7071067690849304, 0.0, 0.0, 0.0, -0.7071067690849304, 0.0]
614                                                                  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
615                     [0.0, 0.0, 0.4082487225532532, 0.4082487225532532, 0.8164962530136108, 0.0]
616    [0.0, 0.3779664933681488, -0.3779664933681488, 0.7559259533882141, 0.0, -0.3779664933681488]


In [43]:
pagerank_query = '''

CALL gds.pageRank.stream('cyberAttackPatterns')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).id AS patternID, score
ORDER BY score DESC, patternID ASC

'''

In [44]:
gds.run_cypher(pagerank_query)

Unnamed: 0,patternID,score
0,224,2.808375
1,22,2.483650
2,416,2.413125
3,169,2.375951
4,248,2.362125
...,...,...
613,646,0.150000
614,647,0.150000
615,648,0.150000
616,649,0.150000


In [45]:
pagerankDF = driver.execute_query(
    pagerank_query,
    database_="neo4j",
    result_transformer_=  neo4j.Result.to_df
)

In [46]:
print(pagerankDF.head())

   patternID     score
0        224  2.808375
1         22  2.483650
2        416  2.413125
3        169  2.375951
4        248  2.362125


In [53]:
merged_df = pd.merge(pagerankDF, df, left_on='patternID', right_on='ID', how='left')

Top 5 most vulnerable attack patterns

In [60]:
print(merged_df['Name'].head(5))

0                Fingerprinting
1    Exploiting Trust in Client
2     Manipulate Human Behavior
3                  Footprinting
4             Command Injection
Name: Name, dtype: object
