# Data Collection for Package Risk Analysis

This notebook collects package metadata from PyPI for training ML models.

## Goals
1. Collect metadata for popular Python packages
2. Build a sample set of malicious packages (synthetic for now, since most real ones have been removed from PyPI)
3. Create training dataset for package risk scoring

In [21]:
# Install required packages
!pip install httpx pandas tqdm




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
import json
import random
import time
from datetime import datetime, timezone
from pathlib import Path

import httpx
import pandas as pd
from tqdm import tqdm

## 1. Define Package Lists

We need:
- **Popular packages**: Top PyPI packages (legitimate)
- **Malicious packages**: Known bad packages from security reports (the typosquat pairs listed below are illustrative examples only)

In [23]:
# Widely used PyPI projects that act as the legitimate examples.
# The whitespace-separated literal is split into a list of names; the order
# of the names is the order in which they are later fetched.
POPULAR_PACKAGES = """
    requests       numpy          pandas         matplotlib     scipy
    django         flask          tensorflow     torch          keras
    pillow         sqlalchemy     beautifulsoup4 selenium       pytest
    boto3          pyyaml         redis          celery         cryptography
    httpx          fastapi        pydantic       black          mypy
    pylint         setuptools     pip            wheel          virtualenv
    tqdm           click          colorama       rich           typer
    poetry         jupyter        notebook       ipython        scikit-learn
    xgboost        lightgbm       opencv-python  transformers   huggingface-hub
    aiohttp        uvicorn        gunicorn       werkzeug       jinja2
""".split()

# Example typosquat names, grouped by the legitimate project they imitate.
# These are illustrative patterns for training; in reality the list would
# come from security databases.
_TYPOSQUATS_BY_TARGET = {
    "requests": ["python-requests", "python3-requests", "reqeusts", "request"],
    "numpy": ["python-numpy", "numppy"],
    "pandas": ["python-pandas", "panads"],
}

# Flattened into (malicious_name, target_name) pairs, in the order listed above.
KNOWN_TYPOSQUATS = [
    (squat_name, target_name)
    for target_name, squat_names in _TYPOSQUATS_BY_TARGET.items()
    for squat_name in squat_names
]

## 2. PyPI API Functions

In [24]:
def fetch_package_metadata(package_name: str) -> dict | None:
    """
    Fetch package metadata from PyPI API.
    
    Returns metadata dict or None if package not found.
    """
    url = f"https://pypi.org/pypi/{package_name}/json"
    try:
        with httpx.Client(timeout=10.0) as client:
            response = client.get(url)
            if response.status_code == 200:
                return response.json()
            return None
    except Exception as e:
        print(f"Error fetching {package_name}: {e}")
        return None


def extract_features(metadata: dict) -> dict:
    """
    Extract ML features from package metadata.
    """
    info = metadata.get("info", {})
    releases = metadata.get("releases", {})
    
    # Calculate days since creation (use first release date)
    first_release_date = None
    for version, files in releases.items():
        if files:
            upload_time = files[0].get("upload_time")
            if upload_time:
                if first_release_date is None or upload_time < first_release_date:
                    first_release_date = upload_time
    
    days_since_creation = 0
    if first_release_date:
        try:
            first_date = datetime.fromisoformat(first_release_date.replace("Z", "+00:00"))
            days_since_creation = (datetime.now(first_date.tzinfo) - first_date).days
        except:
            pass
    
    return {
        "name": info.get("name", ""),
        "version": info.get("version", ""),
        "days_since_creation": days_since_creation,
        "version_count": len(releases),
        "has_homepage": bool(info.get("home_page")),
        "has_repository": bool(info.get("project_urls", {}).get("Repository") or 
                                info.get("project_urls", {}).get("Source")),
        "description_length": len(info.get("description", "") or ""),
        "summary_length": len(info.get("summary", "") or ""),
        "author": info.get("author", ""),
        "license": info.get("license", ""),
        "requires_python": info.get("requires_python", ""),
        "classifiers_count": len(info.get("classifiers", [])),
    }

## 3. Collect Package Data

In [25]:
def collect_package_data(package_list: list, label: int = 0, delay: float = 0.5) -> pd.DataFrame:
    """
    Collect metadata for a list of packages.

    Each name is looked up with fetch_package_metadata and turned into a
    feature row with extract_features. Names for which no metadata comes
    back are left out of the result and listed in a printed summary.

    Args:
        package_list: List of package names
        label: 0 for legitimate, 1 for suspicious; stored in the
            "is_malicious" column of every row
        delay: Seconds to wait between consecutive requests (rate limiting).
            Use 0 to disable the pause.

    Returns:
        DataFrame with one row per package that was fetched successfully.
    """
    data = []
    skipped = []

    for index, package in enumerate(tqdm(package_list, desc="Fetching packages")):
        # Pause before every request except the first, so no time is wasted
        # sleeping after the final package.
        if index > 0 and delay > 0:
            time.sleep(delay)

        metadata = fetch_package_metadata(package)
        if metadata:
            features = extract_features(metadata)
            features["is_malicious"] = label
            data.append(features)
        else:
            skipped.append(package)

    if skipped:
        # Make dropped packages visible instead of silently shrinking the dataset.
        print(f"Skipped {len(skipped)} package(s) with no metadata: {', '.join(map(str, skipped))}")

    return pd.DataFrame(data)

In [26]:
# Fetch the legitimate (label 0) class: one lookup per entry in POPULAR_PACKAGES.
print("Collecting popular package data...")
popular_df = collect_package_data(package_list=POPULAR_PACKAGES, label=0)
legit_count = len(popular_df)
print(f"Collected {legit_count} legitimate packages")
popular_df.head()

Collecting popular package data...


Fetching packages: 100%|███████████████████████████████████████████████████████████████| 50/50 [02:10<00:00,  2.60s/it]

Collected 50 legitimate packages





Unnamed: 0,name,version,days_since_creation,version_count,has_homepage,has_repository,description_length,summary_length,author,license,requires_python,classifiers_count,is_malicious
0,requests,2.32.5,5452,157,True,True,2927,23,Kenneth Reitz,Apache-2.0,>=3.9,19,0
1,numpy,2.4.1,6987,141,False,False,4199,49,Travis E. Oliphant et al.,,>=3.11,19,0
2,pandas,2.3.3,5867,111,False,False,11533,71,,BSD 3-Clause License\n \n Copy...,>=3.9,16,0
3,matplotlib,3.10.8,7314,137,False,False,3741,23,"John D. Hunter, Michael Droettboom",License agreement for matplotlib versions 1.3....,>=3.10,13,0
4,scipy,1.17.0,5654,103,False,False,3720,57,,"Copyright (c) 2001-2002 Enthought, Inc. 2003, ...",>=3.11,18,0


In [27]:
# For typosquat packages, we'll create synthetic data
# since most are removed from PyPI
# In practice, you would collect real malicious package data from security reports

def create_synthetic_malicious_data(n_samples: int = 50) -> pd.DataFrame:
    """
    Create synthetic malicious package data for training.
    Based on common patterns observed in real attacks.
    """
    import random
    
    data = []
    for i in range(n_samples):
        data.append({
            "name": f"fake-package-{i}",
            "version": "0.1.0",
            "days_since_creation": random.randint(1, 30),  # Very new
            "version_count": random.randint(1, 3),  # Few versions
            "has_homepage": random.random() < 0.2,  # Usually no homepage
            "has_repository": random.random() < 0.1,  # Usually no repo
            "description_length": random.randint(0, 100),  # Short descriptions
            "summary_length": random.randint(0, 50),
            "author": "unknown",
            "license": "",
            "requires_python": "",
            "classifiers_count": random.randint(0, 2),
            "is_malicious": 1,
        })
    
    return pd.DataFrame(data)

# Build the synthetic malicious class and preview the first rows.
malicious_df = create_synthetic_malicious_data(n_samples=50)
synthetic_count = len(malicious_df)
print(f"Created {synthetic_count} synthetic malicious samples")
malicious_df.head()

Created 50 synthetic malicious samples


Unnamed: 0,name,version,days_since_creation,version_count,has_homepage,has_repository,description_length,summary_length,author,license,requires_python,classifiers_count,is_malicious
0,fake-package-0,0.1.0,15,2,False,False,27,46,unknown,,,1,1
1,fake-package-1,0.1.0,18,1,True,False,58,15,unknown,,,0,1
2,fake-package-2,0.1.0,12,3,False,False,84,17,unknown,,,1,1
3,fake-package-3,0.1.0,18,1,False,False,48,22,unknown,,,0,1
4,fake-package-4,0.1.0,8,2,False,False,70,28,unknown,,,0,1


## 4. Combine and Save Dataset

In [28]:
# Combine datasets: legitimate rows first, synthetic malicious rows after,
# with a fresh 0..n-1 index.
full_dataset = pd.concat([popular_df, malicious_df], ignore_index=True)

# Fail early if a class is missing (e.g. every PyPI lookup failed), because a
# single-class dataset is useless for training.
assert full_dataset["is_malicious"].isin([0, 1]).all(), "unexpected label value"
assert full_dataset["is_malicious"].nunique() == 2, "dataset must contain both classes"

print(f"Total dataset size: {len(full_dataset)}")
print("\nLabel distribution:")
print(full_dataset["is_malicious"].value_counts())

Total dataset size: 100

Label distribution:
is_malicious
0    50
1    50
Name: count, dtype: int64


In [29]:
# Write the combined dataset to data/pypi_packages.csv, creating the
# directory if it does not exist yet.
data_dir = Path("data")
output_path = data_dir / "pypi_packages.csv"
data_dir.mkdir(exist_ok=True)

full_dataset.to_csv(output_path, index=False)
print(f"Saved dataset to {output_path}")

Saved dataset to data\pypi_packages.csv


In [30]:
# Quick look at the summary statistics of the combined dataset.
summary_stats = full_dataset.describe()
print("\nDataset Statistics:")
print(summary_stats)


Dataset Statistics:
       days_since_creation  version_count  description_length  summary_length  \
count           100.000000     100.000000          100.000000      100.000000   
mean           2390.230000      95.240000         5495.510000       36.020000   
std            2630.136379     216.022311        13999.186073       22.218829   
min               1.000000       1.000000            0.000000        0.000000   
25%              15.000000       2.000000           50.750000       21.000000   
50%             941.000000       3.500000          263.500000       32.500000   
75%            4874.750000     137.750000         5261.000000       46.250000   
max            7314.000000    1956.000000        88210.000000       99.000000   

       classifiers_count  is_malicious  
count          100.00000    100.000000  
mean             8.29000      0.500000  
std              9.07243      0.502519  
min              0.00000      0.000000  
25%              1.00000      0.000000  
50%

## Next Steps

1. Open `02_feature_engineering.ipynb` to process and transform features
2. Open `03_model_training.ipynb` to train the ML models