# 🕸️ Network Analysis of Python Package Vulnerabilities

Vulnerability analysis using **Network Theory**:

- **Temporal Analysis**: evolution over time, survival curves, active vulnerabilities
- **Network Metrics**: degree, betweenness, eigenvector centrality, modularity
- **Dependency Analysis**: direct/indirect exposure, depth
- **Comparison**: General vs AI Libraries

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from lifelines import KaplanMeierFitter
from scipy import stats
# Suppress every Python warning so library notices do not clutter the notebook output.
warnings.simplefilter('ignore')

# Figure defaults, applied in this order: base style, colour palette, default figure size.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette(palette='husl')
plt.rcParams.update({'figure.figsize': (14, 6)})

print('✅ Setup complete')

## 📂 1. Load Data

In [None]:
# Read the vulnerability timeline and the dependency edge list from disk.
vulns = pd.read_csv('outputs/top_pypi_snyk_timeline_20231101_20251101.csv')
deps = pd.read_csv('python_dependencies_edges.csv')

n_packages = vulns["package"].nunique()
print(f'Vulnerabilidades: {len(vulns)}, Pacotes: {n_packages}')
print(f'Dependências: {len(deps)} arestas')

# Convert whichever of the date columns are present to datetimes;
# values that cannot be parsed become NaT (errors='coerce').
date_columns = ('first_affected_date', 'disclosed_date', 'mitigation_date')
for date_col in (c for c in date_columns if c in vulns.columns):
    vulns[date_col] = pd.to_datetime(vulns[date_col], errors='coerce')

# Ordinal severity score. Labels are matched case-insensitively; 'moderate', 'medium'
# and 'unknown' all score 2, and any label missing from the map also falls back to 2.
severity_map = {
    'low': 1,
    'moderate': 2,
    'medium': 2,
    'high': 3,
    'critical': 4,
    'unknown': 2,
}
severity_lower = vulns['severity'].str.lower()
vulns['severity_norm'] = severity_lower.map(severity_map).fillna(2)

vulns.head()

## 🤖 2. AI Libraries

In [None]:
# Lower-case package names that this notebook classifies as AI libraries.
AI_LIBS = {
    'torch',
    'tensorflow',
    'keras',
    'pytorch-lightning',
    'lightning',
    'scikit-learn',
    'xgboost',
    'lightgbm',
    'catboost',
    'transformers',
    'sentence-transformers',
    'tokenizers',
    'langchain',
    'langchain-core',
    'langchain-community',
    'llama-index',
    'llama-index-core',
    'vllm',
    'litellm',
    'openai',
    'mlflow',
    'wandb',
    'clearml',
    'sagemaker',
    'gradio',
    'streamlit',
    'opencv-python',
    'pillow',
}

# Flag each vulnerability whose package name (compared in lower case) is in AI_LIBS.
package_lower = vulns['package'].str.lower()
vulns['is_ai_lib'] = package_lower.isin(AI_LIBS)

n_ai_vulns = vulns["is_ai_lib"].sum()
print(f'AI vulnerabilities: {n_ai_vulns}')

## 🕸️ 3. Build Network

In [None]:
# Directed dependency graph: one edge per (source, target) row of the edge list,
# plus one node for every package that appears in the vulnerability table.
G = nx.DiGraph()

# Build the edges column-wise instead of iterating row by row.
# Rows with a missing endpoint are dropped; names are lower-cased; empty strings
# and the literal 'nan' are rejected, as in the original row-wise check.
edge_names = deps[['source', 'target']].dropna().astype(str)
src_names = edge_names['source'].str.lower()
tgt_names = edge_names['target'].str.lower()
valid_edge = ~src_names.isin(['', 'nan']) & ~tgt_names.isin(['', 'nan'])
G.add_edges_from(zip(src_names[valid_edge], tgt_names[valid_edge]))

# Vulnerable packages become nodes even when they have no dependency edge,
# so that every vulnerability can later be joined to a row of graph metrics.
vuln_names = vulns['package'].dropna().astype(str)
vuln_names = vuln_names[~vuln_names.isin(['', 'nan'])].str.lower()
G.add_nodes_from(vuln_names.unique())

print(f'Nós: {G.number_of_nodes()}, Arestas: {G.number_of_edges()}')
print(f'Densidade: {nx.density(G):.6f}')

# Undirected view of the same graph, used for connected-component counting.
UG = G.to_undirected()
print(f'Componentes: {nx.number_connected_components(UG)}')

## 📊 4. Network Metrics

In [None]:
# Compute per-node centrality metrics on the dependency graph and collect them
# into `metrics_df` (one row per node).
print('Calculando métricas...')

# Fix the node order once so every metric column lines up with 'package'.
node_list = list(G.nodes())

degree_dict = dict(G.degree())
in_degree_dict = dict(G.in_degree())
out_degree_dict = dict(G.out_degree())

print('  Betweenness...')
betweenness_dict = nx.betweenness_centrality(G)

print('  Eigenvector...')
try:
    # Computed on the undirected graph UG.
    eigenvector_dict = nx.eigenvector_centrality_numpy(UG, max_iter=1000)
except Exception as exc:
    # Best effort: the solver can fail (e.g. non-convergence; NOTE(review): recent
    # NetworkX releases reportedly also reject disconnected graphs — confirm).
    # Report the failure instead of hiding it; the column then falls back to 0.
    print(f'  ⚠️ Eigenvector centrality failed ({type(exc).__name__}: {exc}); using 0 for all nodes')
    eigenvector_dict = {}

print('  PageRank...')
pagerank_dict = nx.pagerank(G)

# Each column looks a node up in its metric dict, defaulting to 0 when absent.
metric_sources = {
    'degree': degree_dict,
    'in_degree': in_degree_dict,
    'out_degree': out_degree_dict,
    'betweenness': betweenness_dict,
    'eigenvector': eigenvector_dict,
    'pagerank': pagerank_dict,
}
metrics_df = pd.DataFrame({
    'package': node_list,
    **{
        name: [values.get(node, 0) for node in node_list]
        for name, values in metric_sources.items()
    },
})

# Node names were lower-cased when the graph was built, matching AI_LIBS.
metrics_df['is_ai_lib'] = metrics_df['package'].isin(AI_LIBS)

print('✅ Done')
metrics_df.describe()

## 🏆 5. Top Packages

In [None]:
# Show the 15 nodes with the highest in-degree, with their AI-library flag.
print('Top 15 by IN-DEGREE (most depended-upon):')
metrics_df[['package', 'in_degree', 'is_ai_lib']].nlargest(15, columns='in_degree')

In [None]:
# Show the 15 nodes with the highest betweenness centrality, with their AI-library flag.
print('Top 15 by BETWEENNESS (connectors):')
metrics_df[['package', 'betweenness', 'is_ai_lib']].nlargest(15, columns='betweenness')

## 🔗 6. Merge with Vulnerabilities

In [None]:
# Join key: graph node names are lower-case, so match on a lower-cased package name.
vulns['package_lower'] = vulns['package'].str.lower()

# Left join keeps every vulnerability row. Columns present on both sides keep
# their name for the vulnerability table and get a '_net' suffix for the metrics table.
vulns_net = pd.merge(
    vulns,
    metrics_df,
    how='left',
    left_on='package_lower',
    right_on='package',
    suffixes=('', '_net'),
)

# Vulnerabilities with no matching graph node get 0 for every network metric.
net_cols = ['degree', 'in_degree', 'out_degree', 'betweenness', 'eigenvector', 'pagerank']
filled_metrics = vulns_net[net_cols].fillna(0)
vulns_net[net_cols] = filled_metrics

print(f'Merged: {len(vulns_net)} rows')
preview_cols = ['package', 'cve', 'severity', 'degree', 'in_degree']
vulns_net[preview_cols].head(10)

## 📈 7. TEMPORAL ANALYSIS

In [None]:
# Temporal evolution: vulnerabilities disclosed per year (left) and the same
# counts stacked by severity (right). Saved to outputs/plots/temporal_evolution.png.
os.makedirs('outputs/plots', exist_ok=True)  # savefig does not create directories

vulns_net['disclosed_year'] = vulns_net['disclosed_date'].dt.year

# groupby drops rows without a disclosure year, so the remaining keys can be cast
# to int; the year column may be float when some dates are missing, which would
# otherwise label the bars "2024.0".
temporal_total = vulns_net.groupby('disclosed_year').size().reset_index(name='total')
temporal_total['disclosed_year'] = temporal_total['disclosed_year'].astype(int)
grand_total = temporal_total['total'].sum()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Total per year
ax1.bar(temporal_total['disclosed_year'], temporal_total['total'], color='steelblue', edgecolor='black')
ax1.set_xticks(temporal_total['disclosed_year'].tolist())  # one tick per year, no fractional years
for year, total in zip(temporal_total['disclosed_year'], temporal_total['total']):
    # zip over the columns keeps the integer dtype, so the count prints as "12", not "12.0"
    ax1.text(year, total + 5,
             f"{total}\n({total / grand_total * 100:.0f}%)",
             ha='center', va='bottom', fontsize=9)
ax1.set_xlabel('Year', fontweight='bold')
ax1.set_ylabel('# Vulnerabilities', fontweight='bold')
ax1.set_title('Temporal Evolution', fontsize=14, fontweight='bold')
ax1.grid(axis='y', alpha=0.3)

# By severity
# Colours are bound to severity names rather than to column position: the pivot's
# columns come out in alphabetical order, which would otherwise paint 'critical' green.
severity_colors = {'low': 'green', 'moderate': 'blue', 'high': 'orange', 'critical': 'red'}
# Same normalisation as the severity score: case-insensitive, 'medium' == 'moderate'.
severity_label = vulns_net['severity'].str.lower().replace({'medium': 'moderate'}).rename('severity')
pivot = vulns_net.groupby(['disclosed_year', severity_label]).size().unstack(fill_value=0)
known_levels = [s for s in severity_colors if s in pivot.columns]
other_levels = [s for s in pivot.columns if s not in severity_colors]
pivot = pivot[known_levels + other_levels]  # low -> critical, then any other label
if not pivot.empty:
    pivot.index = pivot.index.astype(int)
    pivot.plot(kind='bar', stacked=True, ax=ax2,
               color=[severity_colors.get(s, 'gray') for s in pivot.columns])
    ax2.legend(title='Severity', bbox_to_anchor=(1.05, 1))
ax2.set_xlabel('Year', fontweight='bold')
ax2.set_ylabel('# Vulnerabilities', fontweight='bold')
ax2.set_title('By Severity Over Time', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('outputs/plots/temporal_evolution.png', dpi=300, bbox_inches='tight')
plt.show()

print('✅ Saved: outputs/plots/temporal_evolution.png')

## ⏱️ 8. SURVIVAL ANALYSIS

In [None]:
# Kaplan-Meier curves of time-to-fix (days from disclosure), split by severity (left)
# and by AI / non-AI library (right). Saved to outputs/plots/survival_analysis.png.
os.makedirs('outputs/plots', exist_ok=True)  # savefig does not create directories


def _plot_km_groups(ax, fitter, data, group_masks, title, legend_title=None):
    """Fit and draw one Kaplan-Meier curve on *ax* for every non-empty group.

    *group_masks* is an iterable of (label, boolean mask over *data*) pairs;
    groups whose mask selects no rows are skipped.
    """
    for label, mask in group_masks:
        if not mask.any():
            continue
        fitter.fit(
            durations=data.loc[mask, 'time_to_fix_from_disclosure_days'],
            event_observed=data.loc[mask, 'event'],
            label=label,
        )
        # Plot right after fitting: the fitter is reused for the next group.
        fitter.plot_survival_function(ax=ax, ci_show=True)

    ax.set_xlabel('Delay (days)', fontweight='bold')
    ax.set_ylabel('Survival Probability', fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(title=legend_title)
    ax.grid(alpha=0.3)


# Keep only rows with a known, non-negative time-to-fix.
# NOTE(review): if 'time_to_fix_from_disclosure_days' is empty for vulnerabilities that
# were never mitigated, this filter removes every censored observation and 'event' is
# always 1 — confirm against the dataset.
vulns_surv = vulns_net[
    vulns_net['time_to_fix_from_disclosure_days'].notna() &
    (vulns_net['time_to_fix_from_disclosure_days'] >= 0)
].copy()

# event = 1 when a mitigation date exists, 0 otherwise.
vulns_surv['event'] = vulns_surv['mitigation_date'].notna().astype(int)

print(f'For survival: {len(vulns_surv)}')
print(f'Fixed: {vulns_surv["event"].sum()}, Not fixed: {(vulns_surv["event"] == 0).sum()}')

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
kmf = KaplanMeierFitter()

# By severity. 'medium' is folded into 'moderate' (the severity score treats them as
# equal); without this, rows labelled 'medium' would be silently left out of the plot.
surv_severity = vulns_surv['severity'].str.lower().replace({'medium': 'moderate'})
_plot_km_groups(
    axes[0], kmf, vulns_surv,
    [(level, surv_severity == level) for level in ['low', 'moderate', 'high', 'critical']],
    title='Time to Fix by Severity',
    legend_title='Severity',
)

# AI vs Non-AI
_plot_km_groups(
    axes[1], kmf, vulns_surv,
    [(label, vulns_surv['is_ai_lib'] == is_ai) for is_ai, label in [(True, 'AI'), (False, 'Non-AI')]],
    title='AI vs Non-AI',
)

plt.tight_layout()
plt.savefig('outputs/plots/survival_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print('✅ Saved: outputs/plots/survival_analysis.png')

## 🚨 9. ACTIVE VULNERABILITIES

In [None]:
# Active (unmitigated) vulnerabilities: overall share, breakdown by severity, and
# breakdown by AI / non-AI library. Saved to outputs/plots/active_vulnerabilities.png.
os.makedirs('outputs/plots', exist_ok=True)  # savefig does not create directories

# A vulnerability counts as active when it has no mitigation date.
vulns_net['is_active'] = vulns_net['mitigation_date'].isna()
active_vulns = vulns_net[vulns_net['is_active']].copy()

active_pct = len(active_vulns) / len(vulns_net) * 100 if len(vulns_net) else 0.0
print(f'Active: {len(active_vulns)} ({active_pct:.1f}%)')
print(f'AI: {active_vulns["is_ai_lib"].sum()}, Non-AI: {(~active_vulns["is_ai_lib"]).sum()}')

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Pie chart
# value_counts() orders by frequency, so pin the order to [mitigated, active]; otherwise
# the labels swap whenever active vulnerabilities outnumber mitigated ones, and a
# single-class result would not match the two labels.
status_counts = vulns_net['is_active'].value_counts().reindex([False, True], fill_value=0)
if status_counts.sum() > 0:
    axes[0].pie(status_counts.values, labels=['Mitigated', 'Active'], autopct='%1.1f%%',
                colors=['lightgreen', 'lightcoral'], startangle=90)
axes[0].set_title('Active vs Mitigated', fontsize=14, fontweight='bold')

# Same severity normalisation as the severity score: case-insensitive, 'medium' == 'moderate'.
severity_order = ['low', 'moderate', 'high', 'critical']
active_severity = active_vulns['severity'].str.lower().replace({'medium': 'moderate'}).rename('severity')

# By severity
active_sev = active_severity.value_counts().reindex(severity_order, fill_value=0)
axes[1].bar(range(len(active_sev)), active_sev.values, color=['green', 'blue', 'orange', 'red'], edgecolor='black')
axes[1].set_xticks(range(len(active_sev)))
axes[1].set_xticklabels(active_sev.index, rotation=45)
axes[1].set_ylabel('# Active', fontweight='bold')
axes[1].set_title('Active by Severity', fontsize=14, fontweight='bold')

# AI vs Non-AI
active_ai = active_vulns.groupby(['is_ai_lib', active_severity]).size().unstack(fill_value=0)
# Pin rows to [Non-AI, AI] so the colours and the legend below stay correct even when
# one of the two groups has no active vulnerability.
active_ai = active_ai.reindex(index=[False, True], fill_value=0)
ordered_levels = [s for s in severity_order if s in active_ai.columns]
ordered_levels += [s for s in active_ai.columns if s not in severity_order]
active_ai = active_ai[ordered_levels]
if not active_ai.empty:
    active_ai.T.plot(kind='bar', stacked=True, ax=axes[2], color=['lightblue', 'lightcoral'])
    axes[2].legend(['Non-AI', 'AI'])
axes[2].set_xlabel('Severity', fontweight='bold')
axes[2].set_ylabel('# Active', fontweight='bold')
axes[2].set_title('AI vs Non-AI', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('outputs/plots/active_vulnerabilities.png', dpi=300, bbox_inches='tight')
plt.show()

print('✅ Saved: outputs/plots/active_vulnerabilities.png')

## 📊 10. COMPARISON: AI vs Non-AI

In [None]:
# Compare network metrics of AI vs non-AI packages with a two-sided Mann-Whitney U
# test per metric, and write the table to outputs/summaries/network_comparison.csv.
os.makedirs('outputs/summaries', exist_ok=True)  # to_csv does not create directories


def _significance_label(p_value):
    """Return '***' / '**' / '*' for p < 0.001 / 0.01 / 0.05, otherwise 'ns' (also for NaN)."""
    if p_value < 0.001:
        return '***'
    if p_value < 0.01:
        return '**'
    if p_value < 0.05:
        return '*'
    return 'ns'


metrics_ai = metrics_df[metrics_df['is_ai_lib']]
metrics_non_ai = metrics_df[~metrics_df['is_ai_lib']]

comparison_metrics = ['degree', 'in_degree', 'betweenness', 'pagerank']
comparison_data = []

for metric in comparison_metrics:
    ai_values = metrics_ai[metric]
    non_ai_values = metrics_non_ai[metric]
    if len(ai_values) and len(non_ai_values):
        stat, pvalue = stats.mannwhitneyu(ai_values, non_ai_values, alternative='two-sided')
    else:
        # The test is undefined when a group is empty (e.g. no AI package in the graph);
        # keep the row so the table always has one entry per metric.
        stat, pvalue = float('nan'), float('nan')
    comparison_data.append({
        'metric': metric,
        'ai_mean': ai_values.mean(),
        'non_ai_mean': non_ai_values.mean(),
        'p_value': pvalue,
        'sig': _significance_label(pvalue),
    })

comparison_df = pd.DataFrame(comparison_data)
comparison_df.to_csv('outputs/summaries/network_comparison.csv', index=False)

print('Statistical Comparison:')
comparison_df

In [None]:
# One box plot per compared metric (Non-AI vs AI), annotated with the p-value from
# comparison_df. Saved to outputs/plots/network_comparison.png.
os.makedirs('outputs/plots', exist_ok=True)  # savefig does not create directories

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
axes = axes.flatten()

for idx, metric in enumerate(comparison_metrics):
    ax = axes[idx]
    data = [metrics_non_ai[metric], metrics_ai[metric]]
    # Tick labels are set explicitly rather than through boxplot's `labels=` keyword,
    # which newer Matplotlib releases deprecate in favour of `tick_labels=`;
    # this form works with both old and new versions.
    bp = ax.boxplot(data, patch_artist=True, showmeans=True)
    ax.set_xticks([1, 2])  # default box positions
    ax.set_xticklabels(['Non-AI', 'AI'])
    bp['boxes'][0].set_facecolor('lightblue')
    bp['boxes'][1].set_facecolor('lightcoral')

    # p-value annotation, centred between the two boxes near the top of the axis
    row = comparison_df[comparison_df['metric'] == metric].iloc[0]
    ax.text(1.5, ax.get_ylim()[1] * 0.9, f"p={row['p_value']:.4f} {row['sig']}",
            ha='center', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    ax.set_ylabel(metric.replace('_', ' ').title(), fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

plt.suptitle('Network Metrics: AI vs Non-AI', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('outputs/plots/network_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print('✅ Saved: outputs/plots/network_comparison.png')

## 🎓 11. SUMMARY

In [None]:
# Final textual recap of the graph, the vulnerability counts and the AI / non-AI split.
banner = '=' * 80

print(banner)
print('SUMMARY')
print(banner)

# Graph size
print('\nNETWORK:')
print(f'  Nodes: {G.number_of_nodes()}')
print(f'  Edges: {G.number_of_edges()}')
print(f'  Density: {nx.density(G):.6f}')

# Vulnerability counts
print('\nVULNERABILITIES:')
print(f'  Total: {len(vulns_net)}')
n_active = vulns_net["is_active"].sum()
print(f'  Active: {n_active} ({n_active / len(vulns_net) * 100:.1f}%)')

# Split by AI-library flag
print('\nAI vs NON-AI:')
ai_mask = vulns_net["is_ai_lib"]
print(f'  AI vulns: {ai_mask.sum()}')
print(f'  Non-AI vulns: {(~ai_mask).sum()}')

# Median time-to-fix per group (pandas' median skips missing values)
print('\nTIME-TO-FIX (median):')
days_to_fix = vulns_net["time_to_fix_from_disclosure_days"]
print(f'  AI: {days_to_fix[ai_mask].median():.1f} days')
print(f'  Non-AI: {days_to_fix[~ai_mask].median():.1f} days')

# Five nodes with the highest in-degree
print('\nTOP 5 MOST IMPORTANT:')
top5 = metrics_df.nlargest(5, 'in_degree')[['package', 'in_degree']]
for pkg_name, n_dependents in zip(top5['package'], top5['in_degree']):
    print(f'  {pkg_name}: {n_dependents:.0f} dependents')

print('\n' + banner)
print('✅ ANALYSIS COMPLETE!')
print(banner)