## AI 1-hop severity graph and CWE influence analysis

This notebook builds:
- AI 1-hop dependency graph with nodes colored by severity (from merged timeline)
- CWE influence networks and AI-focused CWE propagation metrics

Prerequisites:
- `python_dependencies_edges.csv` in project root
- `outputs/summaries/top_pypi_snyk_timeline_merged.csv` generated by the merge script
- Needed only for the AI-focused CWE section (that cell stops with an error if the file is missing): `outputs/top_pypi_snyk_timeline_20221112_20251112.csv`



In [None]:
# Setup
import os, numpy as np
import pandas as pd, networkx as nx, matplotlib.pyplot as plt
from matplotlib.patches import Patch

os.makedirs('outputs/plots', exist_ok=True)

# Helper to normalize package/node names
def normalize_pkg(name: str) -> str:
    """Return the canonical (PEP 503) spelling of a package name.

    Surrounding whitespace is removed, every run of '-', '_' or '.' collapses
    to a single '-', and the result is lower-cased, so spellings that PyPI
    treats as the same project ('Ruamel.YAML', 'ruamel_yaml', 'ruamel-yaml')
    map to one graph node.

    Args:
        name: Raw package name; non-string values are converted with str().

    Returns:
        The normalized name.
    """
    import re  # local import: the setup cell does not import re itself

    return re.sub(r'[-_.]+', '-', str(name).strip()).lower()



In [None]:
# Focus set of AI libraries — edit this list to change what the notebook analyses.
# One entry per line keeps diffs small when the set changes.
AI_LIBS = [
    'torch',
    'transformers',
    'langchain',
    'llama-index',
    'openai',
    'gradio',
    'fastapi',
    'mlflow',
    'pytorch-lightning',
    'tensorflow',
]
print(f"AI_LIBS: {len(AI_LIBS)} items")



In [None]:
# AI 1-hop severity graph (AI highlighted, all nodes colored by severity)

# Drop results of any previous run so this cell always rebuilds from scratch
for stale_name in ('DG_all', 'UG_all', 'pkg_sev', 'color_map'):
    globals().pop(stale_name, None)

# 1) Directed dependency graph from the edge list, plus an undirected copy
edges_csv = 'python_dependencies_edges.csv'
if not os.path.exists(edges_csv):
    raise SystemExit('python_dependencies_edges.csv not found')
edges = pd.read_csv(edges_csv)
for endpoint in ('source', 'target'):
    # Both endpoints go through the same normalizer so node names use one spelling
    edges[endpoint] = edges[endpoint].astype(str).map(normalize_pkg)
DG_all = nx.DiGraph()
DG_all.add_edges_from(zip(edges['source'], edges['target']))
UG_all = DG_all.to_undirected()

# 2) Build package -> highest-severity map from the vulnerability timeline.
# pkg_sev stays empty (every node is then colored 'unknown') when no usable file is found.
pkg_sev = {}
vuln_path = 'outputs/summaries/top_pypi_snyk_timeline_merged.csv'
if not os.path.exists(vuln_path):
    # Fallback to base timeline
    # NOTE(review): this name differs from the base timeline listed in the notebook
    # prerequisites (..._20221112_20251112.csv) — confirm which export is intended.
    vuln_path = 'outputs/top_pypi_snyk_timeline_20231101_20251101.csv'

if os.path.exists(vuln_path):
    vulns = pd.read_csv(vuln_path)
    if 'package' in vulns.columns:
        vulns['package'] = vulns['package'].astype(str).map(normalize_pkg)
    if {'package', 'severity'} <= set(vulns.columns):
        # Fill missing values BEFORE casting: astype(str) would turn NaN into the literal 'nan'
        vulns['severity'] = (vulns['severity'].fillna('unknown')
                             .astype(str).str.strip().str.lower())
        sev_rank = {'low':1,'medium':2,'moderate':2,'high':3,'critical':4}
        # Explicit rank -> label table: 'medium' and 'moderate' share rank 2, so inverting
        # sev_rank would silently pick whichever synonym happens to come last.
        inv = {1:'low', 2:'medium', 3:'high', 4:'critical'}
        # Unrecognized labels get rank 0, which maps back to 'unknown'
        vulns['sev_rank'] = vulns['severity'].map(lambda s: sev_rank.get(s,0))
        agg = vulns.groupby('package', as_index=False)['sev_rank'].max()
        agg['severity_max'] = agg['sev_rank'].map(lambda r: inv.get(r,'unknown'))
        pkg_sev = dict(zip(agg['package'], agg['severity_max']))
    else:
        print(f"{vuln_path} has no 'package'/'severity' columns — all nodes will be colored as unknown")
else:
    print('No vulnerability timeline found — all nodes will be colored as unknown')

# Severity label -> node color ('moderate' is kept as a synonym of 'medium')
color_map = {'critical':'#d73027','high':'#fc8d59','medium':'#fee08b','moderate':'#fee08b','low':'#91bfdb','unknown':'#bdbdbd'}

# AI nodes + 1 hop neighbors (only AI libraries that actually occur in the graph)
ai_nodes = {normalize_pkg(p) for p in AI_LIBS if normalize_pkg(p) in UG_all}
ai_focus = set(ai_nodes)
for n in ai_nodes:
    ai_focus.update(UG_all.neighbors(n))

# Keep the largest connected component for readability and rebuild it with nodes and
# edges inserted in sorted order. A subgraph taken from a set of strings inherits the
# set's iteration order, which changes between kernel restarts (string hashing is
# randomized per process); the seeded spring layout below assigns start positions by
# node order, so without this step the figure would not be reproducible.
H = nx.Graph()
if ai_focus:
    focus_view = UG_all.subgraph(ai_focus)
    # Largest component first; ties are broken by the smallest node name
    largest = min(nx.connected_components(focus_view), key=lambda comp: (-len(comp), min(comp)))
    H.add_nodes_from(sorted(largest))
    H.add_edges_from(sorted(tuple(sorted(edge)) for edge in UG_all.subgraph(largest).edges()))

if H.number_of_nodes() == 0:
    print('Empty AI 1-hop graph — check AI_LIBS and names present in dependency edges')
else:
    # Spring layout; k grows the ideal edge length relative to the 1/sqrt(n) default
    k = 1/np.sqrt(max(H.number_of_nodes(),1))
    pos = nx.spring_layout(H, k=k*3, iterations=450, seed=23)

    # Node sizes from in-degree in the full directed graph, scaled by the 95th percentile
    indeg = dict(DG_all.in_degree(H.nodes()))
    base = np.array([max(1, indeg.get(n,0)) for n in H.nodes()])
    p95 = np.percentile(base, 95)  # every entry of base is >= 1, so p95 >= 1
    sizes = (base/p95 * 500).clip(10, 900)
    size_of = dict(zip(H.nodes(), sizes))

    # AI highlight: larger scale and thicker border; a sorted list keeps node order,
    # sizes and colors aligned and deterministic
    ai_set = set(ai_nodes) & set(H.nodes())
    ai_order = sorted(ai_set)
    sizes_ai = np.clip([max(1, indeg.get(n,0))/p95 * 900 for n in ai_order], 40, 1400)

    # Colors by severity
    def sev(n):
        """Return the severity label used to color node n ('moderate' is folded into 'medium')."""
        s = str(pkg_sev.get(normalize_pkg(n), 'unknown')).lower()
        return 'medium' if s == 'moderate' else s

    # Draw
    fig, ax = plt.subplots(1,1, figsize=(18,14))
    nx.draw_networkx_edges(H, pos, ax=ax, width=0.35, alpha=0.14, edge_color='#9e9e9e')

    # Context (non-AI) nodes first so the AI nodes are painted on top
    ctx_nodes = [n for n in H.nodes() if n not in ai_set]
    ctx_sizes  = [size_of[n] for n in ctx_nodes]
    ctx_colors = [color_map.get(sev(n), color_map['unknown']) for n in ctx_nodes]
    nx.draw_networkx_nodes(H, pos, nodelist=ctx_nodes, node_size=ctx_sizes, node_color=ctx_colors,
                           edgecolors='#666666', linewidths=0.25, alpha=0.95, ax=ax)

    nx.draw_networkx_nodes(H, pos, nodelist=ai_order, node_size=sizes_ai,
                           node_color=[color_map.get(sev(n), color_map['unknown']) for n in ai_order],
                           edgecolors='black', linewidths=0.9, alpha=0.98, ax=ax)

    # Label main AI hubs (at most 25, highest in-degree first; ties stay alphabetical)
    ai_hubs = sorted([(n, indeg.get(n,0)) for n in ai_order], key=lambda x: x[1], reverse=True)[:25]
    labels = {n:n for n,_ in ai_hubs}
    nx.draw_networkx_labels(H, pos, labels=labels, font_size=9, font_weight='bold', ax=ax)

    legend_handles = [
        Patch(color=color_map['critical'], label='critical'),
        Patch(color=color_map['high'],     label='high'),
        Patch(color=color_map['medium'],   label='medium'),
        Patch(color=color_map['low'],      label='low'),
        # Gray nodes are drawn too, so they need a legend key as well
        Patch(color=color_map['unknown'],  label='unknown / not in timeline'),
        Patch(facecolor='white', edgecolor='black', label='AI (bold border)'),
    ]
    ax.legend(handles=legend_handles, title='Legend', frameon=False, loc='lower left')

    subtitle = f"Nodes={H.number_of_nodes()}  Edges={H.number_of_edges()}  AI nodes={len(ai_set)}"
    ax.set_title(f"AI packages — 1-hop (all dependencies colored by severity)\n{subtitle}", fontsize=14, fontweight='bold')
    ax.axis('off')
    plt.tight_layout()
    plt.savefig('outputs/plots/dependency_severity_ai_1hop_colored.png', dpi=300, bbox_inches='tight')
    plt.show()



## CWE influence network (merged timeline window)

This section filters the merged timeline to the configured disclosure-date window, builds a directed CWE co-occurrence graph (edge A → B weighted by P(B|A)), computes centrality and community metrics, and plots the results.



In [None]:
# Parameters
CWE_SOURCE_CSV = 'outputs/summaries/top_pypi_snyk_timeline_merged.csv'  # timeline CSV read by this cell
DATE_START = '2023-11-01'  # inclusive lower bound applied to disclosed_date
DATE_END   = '2025-11-01'  # inclusive upper bound applied to disclosed_date
TOP_K_CWE  = 30  # only the K most frequent CWEs become graph nodes
MIN_PAIR_COUNT = 3  # an edge needs the two CWEs to co-occur in at least this many rows
MIN_COND_P = 0.15  # an edge a -> b also needs count(a,b) / freq(a) >= this value

import os, re, pandas as pd, numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from collections import Counter

print(f'Loading {CWE_SOURCE_CSV}')
df_src = pd.read_csv(CWE_SOURCE_CSV)

# Parse whichever date columns exist; values that cannot be parsed become NaT
for col in ('disclosed_date', 'first_affected_date', 'mitigation_date'):
    if col not in df_src.columns:
        continue
    df_src[col] = pd.to_datetime(df_src[col], errors='coerce')

# Keep rows disclosed inside the inclusive [DATE_START, DATE_END] window
# (NaT compares False on both sides, so undated rows are dropped)
window_start = pd.to_datetime(DATE_START)
window_end = pd.to_datetime(DATE_END)
disclosed = df_src['disclosed_date']
mask = (disclosed >= window_start) & (disclosed <= window_end)
df = df_src.loc[mask].copy()
print(f'Rows after date filter: {len(df)}')

def parse_cwes(val):
    """Extract the distinct CWE identifiers from one raw cell value.

    The value is split on semicolons, commas and whitespace. Quotes and
    brackets around a token are removed first, so a stringified list such as
    "['CWE-79', 'CWE-89']" is parsed instead of being silently dropped.
    Tokens are upper-cased and kept only when they start with 'CWE-'.

    Args:
        val: Raw cell content (string, NaN or None).

    Returns:
        List of upper-case CWE ids in first-seen order, without duplicates;
        an empty list for missing values.
    """
    if pd.isna(val):
        return []
    found = []
    for token in re.split(r'[;\,\s]+', str(val)):
        ident = token.strip("[](){}'\"").upper()
        if ident.startswith('CWE-'):
            found.append(ident)
    # dict.fromkeys removes duplicates while preserving order
    return list(dict.fromkeys(found))

# Parse the CWE column ('cwes' preferred, then 'CWE'); without either, every row gets []
cwe_source_col = next((name for name in ('cwes', 'CWE') if name in df.columns), None)
if cwe_source_col is None:
    df['cwe_list'] = [[] for _ in range(len(df))]
else:
    df['cwe_list'] = df[cwe_source_col].apply(parse_cwes)

# Row frequency of every CWE, then the TOP_K_CWE most frequent ones
all_cwes = []
for row_cwes in df['cwe_list']:
    all_cwes.extend(row_cwes)
freq = Counter(all_cwes)
top_cwes = [cwe for cwe, _count in freq.most_common(TOP_K_CWE)]
print(f'Top-K CWE size: {len(top_cwes)}')

# Count co-occurrences in both directions for rows holding at least two top CWEs
top_lookup = set(top_cwes)
pair_count = Counter()
co_rows = 0
for row_cwes in df['cwe_list']:
    kept = [c for c in row_cwes if c in top_lookup]
    if len(kept) < 2:
        continue
    co_rows += 1
    for a, b in combinations(sorted(set(kept)), 2):
        pair_count[(a,b)] += 1
        pair_count[(b,a)] += 1
print(f'Rows with >=2 CWEs: {co_rows}')

# Directed graph: edge a -> b carries weight count(a,b)/freq(a), i.e. P(b|a)
G_cwe = nx.DiGraph()
for cwe in top_cwes:
    G_cwe.add_node(cwe, freq=freq[cwe])

for (a, b), cnt in pair_count.items():
    if freq[a] <= 0:
        continue
    p = cnt / freq[a]
    if cnt >= MIN_PAIR_COUNT and p >= MIN_COND_P:
        G_cwe.add_edge(a, b, weight=p, count=cnt)

print(f'CWE graph: nodes={G_cwe.number_of_nodes()} edges={G_cwe.number_of_edges()}')
os.makedirs('outputs/summaries', exist_ok=True)

# Centralities and communities
# PageRank reads edge weight as strength, which matches P(B|A). Best effort: on failure
# every CWE gets 0, but the reason is reported instead of being swallowed.
try:
    pr = nx.pagerank(G_cwe, weight='weight')
except Exception as exc:
    print(f'PageRank failed ({exc!r}); using 0 for every CWE')
    pr = {n:0 for n in G_cwe.nodes()}
out_w = {n: sum(d.get('weight',1.0) for _,_,d in G_cwe.out_edges(n, data=True)) for n in G_cwe.nodes()}
in_w  = {n: sum(d.get('weight',1.0) for _,_,d in G_cwe.in_edges(n, data=True)) for n in G_cwe.nodes()}

# Undirected view built explicitly. DiGraph.to_undirected() keeps the attributes of
# whichever direction it meets last, so the weight of a reciprocal pair would be
# P(b|a) or P(a|b) by accident; here the stronger direction is kept on purpose.
# 'distance' (= 1/weight) exists because shortest-path measures read weights as
# distances: with the raw probability, strong associations would count as long paths.
UG = nx.Graph()
UG.add_nodes_from(G_cwe.nodes(data=True))
for a, b, d in G_cwe.edges(data=True):
    w = d.get('weight', 1.0)
    if UG.has_edge(a, b):
        w = max(w, UG[a][b]['weight'])
    UG.add_edge(a, b, weight=w, count=d.get('count', 0), distance=1.0/max(w, 1e-12))
try:
    btw = nx.betweenness_centrality(UG, weight='distance', normalized=True)
except Exception as exc:
    print(f'Betweenness failed ({exc!r}); using 0 for every CWE')
    btw = {n:0 for n in G_cwe.nodes()}

# Community detection treats weight as strength, so it uses 'weight' directly
from networkx.algorithms import community
comms = list(community.greedy_modularity_communities(UG, weight='weight')) if UG.number_of_edges()>0 else []
node2comm = {n: i for i, comm in enumerate(comms) for n in comm}

# reach: number of OTHER CWEs reachable along directed edges. The start node is
# marked as seen up front so a node on a cycle does not count itself.
reach = {}
for n in G_cwe.nodes():
    seen = {n}; stack = [n]
    while stack:
        u = stack.pop()
        for v in G_cwe.successors(u):
            if v not in seen:
                seen.add(v); stack.append(v)
    reach[n] = len(seen) - 1

metric_cols = ['cwe','freq','pagerank','out_strength','in_strength','betweenness','community','reach_nodes']
rows = []
for n in G_cwe.nodes():
    rows.append({'cwe':n,'freq':G_cwe.nodes[n].get('freq',0),'pagerank':pr.get(n,0),
                 'out_strength':out_w.get(n,0),'in_strength':in_w.get(n,0),'betweenness':btw.get(n,0),
                 'community':node2comm.get(n,-1),'reach_nodes':reach[n]})
# Explicit columns keep sort_values working even when the graph has no nodes
metrics_df_cwe = pd.DataFrame(rows, columns=metric_cols).sort_values(['pagerank','out_strength'], ascending=False)
# Written once, after every column (including reach_nodes) is in place
metrics_df_cwe.to_csv('outputs/summaries/cwe_metrics.csv', index=False)
print(metrics_df_cwe.head())

# Plots (network, heatmap, curves)

# --- Network: node color = community, node size = PageRank, edge width = P(B|A) ---
if G_cwe.number_of_edges() > 0:
    pos = nx.spring_layout(G_cwe, weight='weight', seed=42, k=2)
else:
    # No edges: put the nodes on a line instead of running a layout
    pos = {n: (i, 0) for i, n in enumerate(G_cwe.nodes())}
num_c = (max(node2comm.values())+1) if node2comm else 1
cmap = plt.cm.tab20(np.linspace(0,1,max(1,num_c)))
node_color = [cmap[node2comm.get(n,0)] for n in G_cwe.nodes()]
# Index the metrics table once instead of rebuilding the index for every node
pagerank_by_cwe = metrics_df_cwe.set_index('cwe')['pagerank']
node_size = [1000*(pagerank_by_cwe.loc[n]+1e-4) for n in G_cwe.nodes()]
edge_w = [d.get('weight',0.1)*3 for _,_,d in G_cwe.edges(data=True)]
fig, ax = plt.subplots(1,1, figsize=(18,12))
nx.draw_networkx_edges(G_cwe,pos,alpha=0.25,width=edge_w,arrows=True,arrowstyle='-|>',arrowsize=10,edge_color='gray',ax=ax)
nx.draw_networkx_nodes(G_cwe,pos,node_color=node_color,node_size=node_size,edgecolors='black',linewidths=1.2,ax=ax)
# Only the 12 highest-ranked CWEs are labeled to keep the figure readable
labels = {cwe: cwe for cwe in metrics_df_cwe.head(12)['cwe']}
nx.draw_networkx_labels(G_cwe,pos,labels=labels,font_size=10,font_weight='bold',ax=ax)
ax.set_title('CWE Influence Network (directed, weight=P(B|A))',fontsize=16,fontweight='bold')
ax.axis('off')
plt.tight_layout()
plt.savefig('outputs/plots/cwe_influence_network.png',dpi=300,bbox_inches='tight')
plt.show()

# --- Heatmap top 15: row = given CWE A, column = co-occurring CWE B, cell = P(B|A) ---
N = min(15,len(metrics_df_cwe))
order = metrics_df_cwe.head(N)['cwe'].tolist()
cond = np.zeros((N,N))
for i,a in enumerate(order):
    for j,b in enumerate(order):
        if G_cwe.has_edge(a,b):
            cond[i,j] = G_cwe.edges[a,b].get('weight',0)
if N > 0:
    fig, ax = plt.subplots(1,1, figsize=(12,10))
    sns.heatmap(cond,xticklabels=order,yticklabels=order,cmap='YlOrRd',cbar_kws={'label':'P(B|A)'},ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_xlabel('B (co-occurring CWE)')
    ax.set_ylabel('A (given CWE)')
    ax.set_title('CWE conditional probability (top 15)',fontsize=14,fontweight='bold')
    plt.tight_layout()
    plt.savefig('outputs/plots/cwe_conditional_heatmap.png',dpi=300,bbox_inches='tight')
    plt.show()
else:
    print('No CWEs in the selected window — heatmap skipped')

# --- Curves: out-/in-strength sorted in descending order ---
vals_out = np.sort(metrics_df_cwe['out_strength'].values)[::-1]
vals_in = np.sort(metrics_df_cwe['in_strength'].values)[::-1]
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(16,6))
ax1.scatter(range(len(vals_out)),vals_out,c='#e74c3c',s=30)
ax1.set_title('CWE influence (sorted out-strength)')
ax1.set_xlabel('rank'); ax1.set_ylabel('out-strength')
ax1.grid(alpha=0.3)
ax2.scatter(range(len(vals_in)),vals_in,c='#7ea9e1',s=30)
ax2.set_title('CWE dependency (sorted in-strength)')
ax2.set_xlabel('rank'); ax2.set_ylabel('in-strength')
ax2.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('outputs/plots/cwe_influence_dependency_curves.png',dpi=300,bbox_inches='tight')
plt.show()



## AI-focused CWE propagation and P-Impact

This section scores AI libraries only. The CWE influence graph is built from the whole date-filtered timeline; each AI library's CWE influence is then combined with its dependency reach to derive P-Impact.



In [None]:
# Inputs
SRC_TIMELINE = 'outputs/top_pypi_snyk_timeline_20221112_20251112.csv'  # timeline CSV; the cell aborts if it is missing
DEPS_CSV    = 'python_dependencies_edges.csv'  # dependency edge list ('source'/'target' columns); the cell aborts if it is missing
DATE_START  = '2023-11-01'  # inclusive lower bound applied to disclosed_date
DATE_END    = '2025-11-01'  # inclusive upper bound applied to disclosed_date
TOP_K_CWE   = 30  # only the K most frequent CWEs become graph nodes
MIN_PAIR    = 3  # an edge needs the two CWEs to co-occur in at least this many rows
MIN_P       = 0.10  # an edge a -> b also needs count(a,b) / freq(a) >= this value

import pandas as pd, numpy as np, re, networkx as nx, matplotlib.pyplot as plt, seaborn as sns
from collections import Counter
from itertools import combinations

# Load & filter timeline — both input files must exist before any work starts
for required_path in (SRC_TIMELINE, DEPS_CSV):
    if not os.path.exists(required_path):
        raise SystemExit(f'{required_path} not found')

df = pd.read_csv(SRC_TIMELINE)
# Parse whichever date columns exist; values that cannot be parsed become NaT
for col in ('disclosed_date', 'first_affected_date', 'mitigation_date'):
    if col not in df.columns:
        continue
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Inclusive disclosure-date window (rows with NaT fail both comparisons and are dropped)
window_start, window_end = pd.to_datetime(DATE_START), pd.to_datetime(DATE_END)
mask = (df['disclosed_date'] >= window_start) & (df['disclosed_date'] <= window_end)
df = df.loc[mask].copy()
df['package_lower'] = df['package'].astype(str).str.lower()

# CWE parsing
def parse_cwes(val):
    """Extract the distinct CWE identifiers from one raw cell value.

    The value is split on semicolons, commas and whitespace. Quotes and
    brackets around a token are removed first, so a stringified list such as
    "['CWE-79', 'CWE-89']" is parsed instead of being silently dropped.
    Tokens are upper-cased and kept only when they start with 'CWE-'.

    Args:
        val: Raw cell content (string, NaN or None).

    Returns:
        List of upper-case CWE ids in first-seen order, without duplicates;
        an empty list for missing values.
    """
    if pd.isna(val):
        return []
    out = []
    for p in re.split(r'[;\,\s]+', str(val)):
        up = p.strip("[](){}'\"").upper()
        if up.startswith('CWE-'):
            out.append(up)
    # dict.fromkeys removes duplicates while preserving order
    return list(dict.fromkeys(out))

# Parse the CWE column ('cwes' preferred, then 'CWE'); without either, every row gets []
cwe_col = 'cwes' if 'cwes' in df.columns else ('CWE' if 'CWE' in df.columns else None)
df['cwe_list'] = df[cwe_col].apply(parse_cwes) if cwe_col else [[] for _ in range(len(df))]

# Build CWE influence graph (directed co-occur)
all_cwes = [c for lst in df['cwe_list'] for c in lst]
freq = Counter(all_cwes)
top_cwes = [c for c,_ in freq.most_common(TOP_K_CWE)]
top_lookup = set(top_cwes)

# Count each unordered pair once per row, stored in both directions
pair = Counter()
for lst in df['cwe_list']:
    kept = [c for c in lst if c in top_lookup]
    if len(kept) >= 2:
        for a, b in combinations(sorted(set(kept)), 2):
            pair[(a,b)] += 1
            pair[(b,a)] += 1

# Edge a -> b carries weight count(a,b)/freq(a), i.e. P(b|a)
G_cwe = nx.DiGraph()
for c in top_cwes:
    G_cwe.add_node(c, freq=freq[c])
for (a,b),cnt in pair.items():
    if freq[a] > 0:
        p = cnt/freq[a]
        if cnt >= MIN_PAIR and p >= MIN_P:
            G_cwe.add_edge(a,b,weight=p,count=cnt)

# Best-effort PageRank. `except Exception` (not a bare except) lets KeyboardInterrupt
# and SystemExit through, and the failure is reported instead of silently zeroing scores.
try:
    pr = nx.pagerank(G_cwe, weight='weight')
except Exception as exc:
    print(f'PageRank failed ({exc!r}); using 0 for every CWE')
    pr = {n:0 for n in G_cwe.nodes()}

# normalize PR to [0,1] (min-max; the epsilon avoids 0/0 when all scores are equal)
if pr:
    arr = np.array(list(pr.values()))
    mn, mx = arr.min(), arr.max()
    pr_norm = {k: (v-mn)/(mx-mn+1e-12) for k,v in pr.items()}
else:
    pr_norm = {n:0 for n in G_cwe.nodes()}

from collections import deque

# AI libs only rows. Names go through normalize_pkg (the same helper the dependency
# graph cell uses) rather than a plain lower(), so '_' / '-' spelling variants of one
# package match across AI_LIBS, the timeline and the edge list.
ai_set = {normalize_pkg(p) for p in AI_LIBS}
pkg_norm = df['package'].astype(str).map(normalize_pkg)
ai_df = df.assign(package_lower=pkg_norm).loc[pkg_norm.isin(ai_set)].copy()

# per-package CWE influence score (sum PR of its CWEs)
def influence_for(lst):
    """Return the sum of normalized PageRank over the CWE ids in lst (unknown ids add 0)."""
    return float(np.sum([pr_norm.get(c,0) for c in lst]))

# A CWE counts once for every advisory row it appears in
infl_by_pkg = {pkg: influence_for([c for lst in col for c in lst])
               for pkg, col in ai_df.groupby('package_lower')['cwe_list']}
pkg_infl = (pd.Series(infl_by_pkg, dtype=float)
            .rename_axis('package_lower').reset_index(name='cwe_influence'))
if pkg_infl.empty:
    print('No timeline rows match AI_LIBS in the selected window')

# Dependency graph and reverse reach to dependents
deps = pd.read_csv(DEPS_CSV)
for endpoint in ('source', 'target'):
    deps[endpoint] = deps[endpoint].astype(str).map(normalize_pkg)
DG = nx.DiGraph()
DG.add_edges_from(zip(deps['source'], deps['target']))
# presumably 'source' depends on 'target', so reversed edges run lib -> dependents — verify against the CSV
RG = DG.reverse(copy=True)  # edges: lib -> dependents

def _dependent_reach(graph, root):
    """Breadth-first search from root along successors.

    Returns (count, mean_depth): the number of nodes reachable from root
    (root excluded) and their mean hop distance; (0, 0.0) when root is not in
    the graph or nothing is reachable.
    """
    if root not in graph:
        return 0, 0.0
    depth = {root: 0}
    queue = deque([root])
    while queue:
        u = queue.popleft()
        for v in graph.successors(u):
            if v not in depth:
                depth[v] = depth[u] + 1
                queue.append(v)
    total = len(depth) - 1
    return total, (sum(depth.values())/total if total > 0 else 0.0)

# reach per AI package
reach_rows = [(pkg, *_dependent_reach(RG, pkg)) for pkg in sorted(set(pkg_infl['package_lower']))]
reach_df = pd.DataFrame(reach_rows, columns=['package_lower','dependent_reach','avg_depth'])

# Merge and compute P-Impact
out = pkg_infl.merge(reach_df, on='package_lower', how='left')
out['dependent_reach'] = out['dependent_reach'].fillna(0)
# normalize reach to [0,1] (min-max; the epsilon avoids 0/0 when all reaches are equal)
if len(out) > 0:
    r = out['dependent_reach'].astype(float)
    out['reach_norm'] = (r-r.min())/(r.max()-r.min()+1e-12)
else:
    out['reach_norm'] = 0.0
# NOTE(review): cwe_influence is an unbounded sum while reach_norm lies in [0,1], so the
# equal-weight sum is usually dominated by influence — confirm this is intended.
out['p_impact'] = 0.5*out['cwe_influence'] + 0.5*out['reach_norm']
out = out.sort_values('p_impact', ascending=False)
out['package'] = out['package_lower']
os.makedirs('outputs/summaries', exist_ok=True)
out[['package','cwe_influence','dependent_reach','avg_depth','p_impact']].to_csv('outputs/summaries/ai_cwe_impact.csv', index=False)
print('Saved: outputs/summaries/ai_cwe_impact.csv')

# Plots
# Ranking of the 20 highest P-Impact packages (`out` is already sorted in descending order)
top = out.head(20)
fig, ax = plt.subplots(1,1, figsize=(14,7))
sns.barplot(data=top, x='p_impact', y='package', ax=ax, color='#8ecae6')
# The title spells out the formula that produces p_impact: an equal-weight sum, not a product
ax.set_title('AI libs — P-Impact (0.5·CWE influence + 0.5·normalized dependency reach)', fontsize=14, fontweight='bold')
ax.set_xlabel('P-Impact'); ax.set_ylabel('package')
plt.tight_layout()
plt.savefig('outputs/plots/ai_pimpact_ranking.png', dpi=300, bbox_inches='tight')
plt.show()

# Scatter reach vs influence (all packages plotted, the top 20 annotated)
fig, ax = plt.subplots(1,1, figsize=(8,6))
sns.scatterplot(data=out, x='cwe_influence', y='dependent_reach', ax=ax, color='#e76f51')
for _,row in top.iterrows():
    ax.annotate(row['package'], (row['cwe_influence'], row['dependent_reach']), xytext=(5,5), textcoords='offset points', fontsize=8)
ax.set_xlabel('CWE influence (sum PR)'); ax.set_ylabel('# dependents (reach)')
ax.set_title('AI libs — Influence vs Reach', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('outputs/plots/ai_influence_vs_reach.png', dpi=300, bbox_inches='tight')
plt.show()

# Print short answers
print('Top 10 AI libs to fix (by P-Impact):')
print(out.head(10)[['package','p_impact','dependent_reach','avg_depth','cwe_influence']])



In [None]:
# Additional AI CWE plots: SPOF scatter and out-degree ranking
# NOTE(review): G_cwe is built in the previous cell from the whole date-filtered
# timeline, not from AI-library rows only — confirm the 'AI CWEs' titles are intended.

# Undirected view built explicitly. DiGraph.to_undirected() keeps the attributes of
# whichever direction it meets last, so the weight of a reciprocal pair would be
# P(b|a) or P(a|b) by accident; here the stronger direction is kept on purpose.
# 'distance' (= 1/weight) exists because betweenness reads weights as path lengths:
# with the raw probability, strong associations would count as long paths.
UG = nx.Graph()
UG.add_nodes_from(G_cwe.nodes(data=True))
for a, b, d in G_cwe.edges(data=True):
    w = d.get('weight', 1.0)
    if UG.has_edge(a, b):
        w = max(w, UG[a][b]['weight'])
    UG.add_edge(a, b, weight=w, count=d.get('count', 0), distance=1.0/max(w, 1e-12))
try:
    btw = nx.betweenness_centrality(UG, weight='distance', normalized=True)
except Exception as exc:
    print(f'Betweenness failed ({exc!r}); using 0 for every CWE')
    btw = {n:0 for n in G_cwe.nodes()}
deg_cent = nx.degree_centrality(UG) if UG.number_of_nodes()>0 else {n:0 for n in G_cwe.nodes()}
plot_records = [{'cwe':n,'betweenness':btw.get(n,0.0),'degree_centrality':deg_cent.get(n,0.0)}
                for n in G_cwe.nodes()]
# Explicit columns keep sort_values working even when the graph has no nodes
plot_df = (pd.DataFrame(plot_records, columns=['cwe','betweenness','degree_centrality'])
           .sort_values('degree_centrality', ascending=False))

# SPOF scatter (the 12 CWEs with the highest degree centrality are annotated)
fig, ax = plt.subplots(1,1, figsize=(12,6))
sns.scatterplot(data=plot_df, x='betweenness', y='degree_centrality', color='#ee6c4d', ax=ax)
for _,row in plot_df.head(12).iterrows():
    ax.annotate(row['cwe'], (row['betweenness'], row['degree_centrality']), xytext=(5,5), textcoords='offset points', fontsize=8)
ax.set_title('AI CWEs — Single point of failure risk', fontsize=13, fontweight='bold')
ax.set_xlabel('Betweenness Centrality'); ax.set_ylabel('Degree Centrality')
plt.tight_layout()
plt.savefig('outputs/plots/ai_cwe_spof_scatter.png', dpi=300, bbox_inches='tight')
plt.show()

# Out-degree ranking curve (rank 1 = highest out-degree; the first 12 are annotated)
out_deg = sorted(((n, G_cwe.out_degree(n)) for n in G_cwe.nodes()), key=lambda x: x[1], reverse=True)
x = list(range(1, len(out_deg)+1)); y = [v for _,v in out_deg]
fig, ax = plt.subplots(1,1, figsize=(14,6))
ax.scatter(x, y, color='#e76f51')
for i,(n,v) in enumerate(out_deg[:12]):
    ax.annotate(n, (i+1, v), xytext=(5,5), textcoords='offset points', fontsize=8)
ax.set_title('AI CWEs most vulnerable (out-degree)', fontsize=13, fontweight='bold')
ax.set_xlabel('CWE (sorted by out-degree)'); ax.set_ylabel('Out-degree')
plt.tight_layout()
plt.savefig('outputs/plots/ai_cwe_outdegree_curve.png', dpi=300, bbox_inches='tight')
plt.show()

