# Pull Request Effect

バグ混入にプルリクエストが影響するかを調査する．

### PR と bug の検定

In [None]:
import math

import pandas as pd
from scipy.stats import chi2_contingency

# Chi-square test of independence between "commit is part of a pull request"
# and "commit is buggy", with Cramér's V reported as the effect size.

# Set to True to add formatted per-row percentage columns to the printed
# LaTeX table. The statistical test always runs on the raw counts.
SHOW_RATIO_COLUMNS = False

# Load the data
df = pd.read_csv("data/pull_request_effect.csv")

# Build the contingency table by summing each count column over all rows.
# NOTE(review): the column names presumably encode commits with/without a
# pull request (+pr / -pr) and with/without bug introduction (+bi / -bi) --
# confirm against the CSV schema.
table = pd.DataFrame(
    {
        "buggy": [
            df["#cmt+pr+bi"].sum(),
            df["#cmt-pr+bi"].sum(),
        ],
        "clean": [
            df["#cmt+pr-bi"].sum(),
            df["#cmt-pr-bi"].sum(),
        ],
    },
    index=["in_pr", "not_in_pr"],
)

# Chi-square test
# NOTE(review): scipy's default `correction=True` applies Yates' continuity
# correction when dof == 1, so chi2 (and the V derived from it) are the
# corrected values -- confirm this is intended.
chi2, p, dof, expected = chi2_contingency(table)

# Cramér's V = sqrt(chi2 / (n * min(rows - 1, cols - 1))). The normaliser is
# 1 for a 2x2 table, so this equals sqrt(chi2 / n) here, but it stays correct
# if the table ever gains rows or columns.
n_observations = table.sum().sum()
min_dim = min(table.shape) - 1
cramer_v = math.sqrt(chi2 / (n_observations * min_dim))

# Optionally add ratio columns to a display copy, leaving `table` numeric.
latex_table = table
if SHOW_RATIO_COLUMNS:
    row_totals = table["buggy"] + table["clean"]
    latex_table = table.assign(
        buggy_ratio=(table["buggy"] / row_totals * 100).apply(
            lambda x: f"({x:.2f}%)"
        ),
        clean_ratio=(table["clean"] / row_totals * 100).apply(
            lambda x: f"({x:.2f}%)"
        ),
    )[["buggy", "buggy_ratio", "clean", "clean_ratio"]]

# Show the results
print("--- chi2 test ---")
print(latex_table.to_latex(escape=True))
print(f"chi2: {chi2}")
print(f"p-value: {p}")
print(f"degree of freedom: {dof}")
print(f"Cramer's V: {cramer_v}")
print("expected:")
print(expected)

--- chi2 test ---
\begin{tabular}{lrlrl}
\toprule
 & buggy & buggy\_ratio & clean & clean\_ratio \\
\midrule
in\_pr & 4819 & (12.93\%) & 32454 & (87.07\%) \\
not\_in\_pr & 40596 & (15.91\%) & 214532 & (84.09\%) \\
\bottomrule
\end{tabular}

chi2: 220.3649786251934
p-value: 7.529614466941645e-50
degree of freedom: 1
Cramer's V: 0.027452497524518553
expected:
[[  5789.15015681  31483.84984319]
 [ 39625.84984319 215502.15015681]]
