# Pull Request Effect

バグ混入にプルリクエストが影響するかを調査する．

### PR と bug の検定

In [11]:
import math

import pandas as pd
from scipy.stats import chi2_contingency

# Load the commit-count data used for the pull-request analysis.
df = pd.read_csv("data/pull_request_effect.csv")

# Build the 2x2 contingency table from the column totals.
# Rows: whether the commit belongs to a pull request; columns: buggy / clean.
table = pd.DataFrame(
    [
        [df["#cmt-pr+bi"].sum(), df["#cmt-pr-bi"].sum()],
        [df["#cmt+pr+bi"].sum(), df["#cmt+pr-bi"].sum()],
    ],
    index=["not_in_pr", "in_pr"],
    columns=["buggy", "clean"],
)

# Chi-square test of independence, called with SciPy's default arguments.
# NOTE(review): by default SciPy applies Yates' continuity correction to 2x2
# tables, so the effect size below is derived from the corrected statistic —
# confirm that this is intended.
chi2, p, dof, expected = chi2_contingency(table)

# Effect size: sqrt(chi2 / N), where N is the grand total of the table.
cramer_v = math.sqrt(chi2 / table.to_numpy().sum())

# Cells in row-major order: a, b = not_in_pr (buggy, clean); c, d = in_pr.
a, b, c, d = table.to_numpy().ravel()
odds_ratio = a * d / (b * c)

# Share of buggy commits per row, rendered as a percentage string.
buggy_ratio = (
    table["buggy"]
    .div(table["buggy"] + table["clean"])
    .mul(100)
    .map("({:.2f}%)".format)
)


# Report the table, the per-row ratios and the test statistics.
print("--- chi2 test ---")
display(table)
print(table.to_latex(escape=True))
print("buggy ratio:", buggy_ratio, sep="\n")
print("--- result ---")
print(f"chi2: {chi2}")
print(f"p-value: {p}")
print(f"degree of freedom: {dof}")
print(f"Cramer's V: {cramer_v}")
print(f"Odds Ratio: {odds_ratio}")
print("expected:", expected, sep="\n")

--- chi2 test ---


Unnamed: 0,buggy,clean
not_in_pr,40596,214532
in_pr,4819,32454


\begin{tabular}{lrr}
\toprule
 & buggy & clean \\
\midrule
not\_in\_pr & 40596 & 214532 \\
in\_pr & 4819 & 32454 \\
\bottomrule
\end{tabular}

buggy ratio:
not_in_pr    (15.91%)
in_pr        (12.93%)
dtype: object
--- result ---
chi2: 220.36497862519337
p-value: 7.529614466941755e-50
degree of freedom: 1
Cramer's V: 0.02745249752451855
Odds Ratio: 1.2743903312169087
expected:
[[ 39625.84984319 215502.15015681]
 [  5789.15015681  31483.84984319]]


## 既存研究の検証
既存研究で報告されているレビューと不具合混入の関係を，同じカイ二乗検定で確認する．

In [None]:
import math

import pandas as pd
from scipy.stats import chi2_contingency

# One 2x2 contingency table per project, in the order AOSP, LibreOffice, Scilab.
# Each list holds the counts for the rows [not_reviewed, reviewed].
tables = [
    pd.DataFrame(counts, index=["not_reviewed", "reviewed"])
    for counts in (
        {"buggy": [8322, 81], "clean": [23439, 1521]},  # AOSP
        {"buggy": [2547, 147], "clean": [43216, 5190]},  # LibreOffice
        {"buggy": [15, 129], "clean": [119, 2370]},  # Scilab
    )
]

for table in tables:
    # Chi-square test of independence, called with SciPy's default arguments.
    chi2, p, dof, expected = chi2_contingency(table)

    # Effect size: sqrt(chi2 / N), where N is the grand total of the table.
    cramer_v = math.sqrt(chi2 / table.to_numpy().sum())

    # Cells in row-major order: a, b = not_reviewed (buggy, clean); c, d = reviewed.
    a, b, c, d = table.to_numpy().ravel()
    odds_ratio = a * d / (b * c)

    print("--- chi2 test ---")
    display(table)
    print("buggy ratio:")
    # Share of buggy commits per row, rendered as a percentage string.
    print(
        table["buggy"]
        .div(table["buggy"] + table["clean"])
        .mul(100)
        .map("({:.2f}%)".format)
    )
    print("--- result ---")
    print(f"chi2: {chi2}")
    print(f"p-value: {p}")
    print(f"degree of freedom: {dof}")
    print(f"Cramer's V: {cramer_v}")
    print(f"Odds Ratio: {odds_ratio}")

--- chi2 test ---


Unnamed: 0,buggy,clean
not_reviewed,8322,23439
reviewed,81,1521


buggy ratio:
not_reviewed    (26.20%)
reviewed         (5.06%)
dtype: object
--- result ---
chi2: 360.779568221373
p-value: 1.904740104435273e-80
degree of freedom: 1
Cramer's V: 0.10398924366371357
Odds Ratio: 6.667036420780181
--- chi2 test ---


Unnamed: 0,buggy,clean
not_reviewed,2547,43216
reviewed,147,5190


buggy ratio:
not_reviewed    (5.57%)
reviewed        (2.75%)
dtype: object
--- result ---
chi2: 75.07656084113596
p-value: 4.52810185377297e-18
degree of freedom: 1
Cramer's V: 0.03833025733057503
Odds Ratio: 2.0808194621795404
--- chi2 test ---


Unnamed: 0,buggy,clean
not_reviewed,15,119
reviewed,129,2370


buggy ratio:
not_reviewed    (11.19%)
reviewed         (5.16%)
dtype: object
--- result ---
chi2: 7.82188457010165
p-value: 0.005161734448216758
degree of freedom: 1
Cramer's V: 0.054504237928664295
Odds Ratio: 2.3158100449482117
