# Data Extraction


In [1]:
import re
import os
from typing import Dict, List

import pandas as pd
import numpy as np
from dotenv import load_dotenv
from github import Github, Commit, Repository

# Let python-dotenv populate the process environment before the token is read.
load_dotenv()

# Access token handed to the GitHub client; ``None`` when the variable is unset.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN")

## Extract Commits from Git


### Local Git Repository


In [None]:
# Placeholder cell: extraction from a local Git repository is not implemented here.
...

### GitHub Repository

Create a file named `github_commits.csv` in the `data/sources` directory,
using the following format:

- Columns: **remote, hash, label, ...metadata_columns**
  - remote: `str`
  - hash: `str`
  - label: `int`
    - 0: not a vulnerability fix
    - 1: vulnerability fix
  - ...metadata_columns: `any`
    - every additional column is copied into the output's `metadata` field, alongside `remote` and `hash`

**Example**

`data/sources/github_commits.csv`:

```csv
remote,hash,label,cve
https://github.com/torvalds/linux.git,9d2231c5d74e13b2a0546fee6737ee4446017903,1,CVE-2022-0847
```

```
input:
remote,hash,label,...metadata_columns

output:
message,label,metadata
```


#### Extract Commit Message from Hashes
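- This consumes your GitHub API rate limit (5,000 requests per hour with an access token): at least one request per commit row, plus one per distinct repository.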

In [3]:
# Load the list of commits to look up. ``remote`` and ``hash`` are forced to
# ``str`` so that pandas never infers a numeric dtype for them: a hash column
# made up only of digits (or digits around a single ``e``, such as ``12345e6``)
# would otherwise be parsed as a number, losing leading zeros or precision
# before the value is sent to the GitHub API.
df = pd.read_csv(
    "data/sources/github_commits.csv",
    dtype={"remote": str, "hash": str},
)

# Cache of repository objects keyed by repository name, filled by ``get_repo``.
repos = {}


def get_repo_name(remote: str) -> str:
    """Extract the ``owner/name`` part of a GitHub remote URL.

    Everything up to and including ``github.com/`` (or ``github.com:`` for
    scp-style SSH remotes such as ``git@github.com:owner/name.git``) is
    removed, then a single trailing ``.git`` suffix is dropped.

    Only a *trailing* ``.git`` is removed. Splitting on the first ``.git``
    anywhere in the string would truncate repository names that contain it,
    e.g. ``user/user.github.io``.

    Args:
        remote: Remote URL of the repository.

    Returns:
        The repository path without host prefix, trailing slash or ``.git``
        suffix.
    """
    # Drop surrounding whitespace and trailing slashes first so the suffix
    # check below sees the real end of the path.
    cleaned = remote.strip().rstrip("/")
    name = re.sub(r".*github\.com[/:]", "", cleaned)
    if name.endswith(".git"):
        name = name[: -len(".git")]
    return name


def get_repo(repo_name: str) -> Repository.Repository:
    """Return the repository object for ``repo_name``, looking it up at most once.

    Results are memoised in the module-level ``repos`` dict. On a cache miss a
    ``Github`` client is created with ``GITHUB_ACCESS_TOKEN`` and the
    repository is fetched through it, then stored under ``repo_name``.
    """
    if repo_name not in repos:
        repos[repo_name] = Github(GITHUB_ACCESS_TOKEN).get_repo(repo_name)
    return repos[repo_name]


def get_commit(row) -> Commit.Commit:
    """Fetch the commit identified by ``row["hash"]`` from ``row["repo"]``.

    ``row`` only needs to support item access for the keys ``"repo"`` and
    ``"hash"``; in this notebook it is a DataFrame row supplied by
    ``df.apply(get_commit, axis=1)``.
    """
    return row["repo"].get_commit(row["hash"])


# Each step adds one column derived from the previous one:
# remote URL -> repository name -> repository object -> commit -> message.
df["repo_name"] = df["remote"].map(get_repo_name)
df["repo"] = df["repo_name"].map(get_repo)
df["commit"] = df.apply(get_commit, axis=1)
df["message"] = df["commit"].map(lambda fetched: fetched.commit.message)

In [4]:
# Work on a copy without the helper columns added during extraction.
output_df = df.drop(columns=["repo", "commit", "repo_name"])

output_columns = ["message", "label"]
# Every column that is not a final output column is folded into ``metadata``.
columns_to_drop = [col for col in output_df.columns if col not in output_columns]


# One dict per row, mapping each folded column name to that row's value.
output_df["metadata"] = output_df[columns_to_drop].apply(
    lambda x: dict(zip(columns_to_drop, x)), axis=1
)


output_df = output_df.drop(columns=columns_to_drop)

# ``to_csv`` does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs("data/processed", exist_ok=True)
output_df.to_csv("data/processed/commits.csv", index=False)