In [None]:
"""
import libs and create statistics dataset and helper methods
"""

import pandas as pd
from matplotlib import pyplot as plt
from pandas import DataFrame
from scipy import stats
from scipy.stats._stats_py import SignificanceResult
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

# collects the correlation results of all analysis cells; rows are appended by add_statistics_row
statistics = DataFrame(columns=[
    "project",
    "kendal_v", "kendal_p",
    "spearman_v", "spearman_p",
    "y_column", "x_column",
])


def add_statistics_row(project, kendal_v, kendal_p, spearman_v, spearman_p, y_column, x_column):
    """
    appends one result row to the global statistics dataframe
    the global name is rebound to a new dataframe; the row index is renumbered from 0
    :param project: value for the "project" column
    :param kendal_v: value for the "kendal_v" column
    :param kendal_p: value for the "kendal_p" column
    :param spearman_v: value for the "spearman_v" column
    :param spearman_p: value for the "spearman_p" column
    :param y_column: value for the "y_column" column
    :param x_column: value for the "x_column" column
    :return: None
    """
    global statistics

    cells = {
        "project": project,
        "kendal_v": kendal_v,
        "kendal_p": kendal_p,
        "spearman_p": spearman_p,
        "spearman_v": spearman_v,
        "y_column": y_column,
        "x_column": x_column,
    }
    # a one-row frame; concat matches its columns to the existing ones by name
    new_row = DataFrame({column: [value] for column, value in cells.items()})
    statistics = pd.concat([statistics, new_row], ignore_index=True)


def load_masterdata():
    """
    loads the master dataset from data/master-dataset.csv, adds the column "total_cumsum"
    (running sum of "total" within each project, in file order) and converts the timestamp
    into a pandas interpretable datetime
    :return: the dataframe, a sorted list of all projects from the dataframe
    """

    df = pd.read_csv("data/master-dataset.csv")
    # accumulate per project: a plain cumsum() over the whole file would carry the running
    # total of one project over into the rows of the next one
    df["total_cumsum"] = df.groupby("project")["total"].cumsum()
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    return df, sorted(df["project"].unique())


def regression_and_stats(df: DataFrame, project: str, y_axis_label: str, plot_path: str, x_column_name: str,
                         y_column_names: list[str], show_legend=False):
    """
    computes rank correlations and a cubic regression for one project and plots the result

    note on naming: every column in y_column_names is drawn on the horizontal axis and used as
    the regression input, while x_column_name is drawn on the vertical axis and used as the
    regression target

    for every column in y_column_names:
    - kendall and spearman correlation (value and p-value) are stored via add_statistics_row
    - a degree 3 polynomial regression is fitted and its mean squared error on the fitted
      data is printed
    - the data points and the fitted curve (red) are added to the current figure
    afterwards the figure is saved as <plot_path>/<project>.png and shown

    :param df: dataframe with a "project" column and all columns named in the other parameters
    :param project: only rows whose "project" equals this value are used; also the plot title
    :param y_axis_label: label of the vertical axis
    :param plot_path: directory the png is written to
    :param x_column_name: column used as regression target (vertical axis)
    :param y_column_names: columns used, one after the other, as regression input (horizontal axis)
    :param show_legend: if true, a legend with the names from y_column_names is drawn
    :return: None
    """
    project_df = df[df["project"] == project]

    plt.xlabel("number of satds in code")
    plt.ylabel(y_axis_label)
    plt.title(project)

    for y_column_name in y_column_names:
        kendal: SignificanceResult = stats.kendalltau(project_df[y_column_name], project_df[x_column_name])
        spearman: SignificanceResult = stats.spearmanr(project_df[y_column_name], project_df[x_column_name])
        add_statistics_row(project, *kendal, *spearman, y_column_name, x_column_name)

        # sort whole rows instead of only the input column, otherwise the sorted inputs are
        # paired with targets that are still in their original row order
        # (the sorting itself is only needed to draw the fitted curve from left to right)
        ordered_df = project_df.sort_values(by=y_column_name)
        inputs = ordered_df[y_column_name].to_numpy().reshape(-1, 1)
        target = ordered_df[x_column_name].to_numpy().reshape(-1, 1)

        poly = PolynomialFeatures(degree=3)
        x_poly = poly.fit_transform(inputs)
        model = LinearRegression()
        model.fit(x_poly, target)
        predicted = model.predict(x_poly)
        print("MSE", mean_squared_error(target, predicted))

        plt.scatter(project_df[y_column_name], project_df[x_column_name], s=3, label=y_column_name)
        plt.plot(ordered_df[y_column_name], predicted, color="red")

    if show_legend:
        plt.legend()
    plt.savefig(f"{plot_path}/{project}.png")
    plt.show()




# RQ1

In [None]:
"""
dependence on commit frequency
"""

df, projects = load_masterdata()

# one plot per project: "commits_per_14days" (vertical axis) against "total_cumsum" (horizontal axis)
for project in projects:
    regression_and_stats(
        df=df,
        project=project,
        x_column_name="commits_per_14days",
        y_column_names=["total_cumsum"],
        y_axis_label="number of commits per 2 weeks",
        plot_path="plots/rq1/commit_frequency",
    )

# display all rows collected so far
statistics

In [None]:
"""
dependence on release speed
"""

df, projects = load_masterdata()
# discard every row that has a missing value in any column
df = df.dropna()

# one plot per project: "releases_per_60days" (vertical axis) against "total_cumsum" (horizontal axis)
for project in projects:
    regression_and_stats(
        df=df,
        project=project,
        x_column_name="releases_per_60days",
        y_column_names=["total_cumsum"],
        y_axis_label="number of releases per 60 days",
        plot_path="plots/rq1/release_frequency",
    )

# display all rows collected so far
statistics

In [None]:
"""
dependence on exact number of open pull requests
"""

df, projects = load_masterdata()
# discard every row that has a missing value in any column
df = df.dropna()

# one plot per project: "open_prs_exact" (vertical axis) against "total_cumsum" (horizontal axis)
for project in projects:
    regression_and_stats(
        df=df,
        project=project,
        x_column_name="open_prs_exact",
        y_column_names=["total_cumsum"],
        y_axis_label="number of open pull requests",
        plot_path="plots/rq1/prs",
    )

# display all rows collected so far
statistics


In [None]:
# """
# dependence on number of opened pull requests in a kernel of 7 days
# """
# 
# df, projects = load_masterdata()
# df = df.dropna()
# 
# for project in projects:
#     regression_and_stats(df, project, "open_prs_per7", "opened pull requests in a kernel of 7 days", "plots/prs-per7")


# RQ2

In [None]:
"""
prepare masterdataset for rq2
"""

df, projects = load_masterdata()

# replace the values of every debt type by their running sum; accumulate within each project,
# a plain cumsum() over the whole file would carry the running total of one project over into
# the rows of the next one
debt_columns = ["code|design-debt", "documentation-debt", "test-debt", "requirement-debt"]
df[debt_columns] = df.groupby("project")[debt_columns].cumsum()

# discard every row that has a missing value in any column; the following rq2 cells use this df
df.dropna(inplace=True)

In [None]:
"""
dependence of exact number of open pull requests on every type of satd
"""

# df and projects come from the rq2 preparation cell
# one plot per project: "open_prs_exact" (vertical axis) against each debt type (horizontal axis)
for project in projects:
    regression_and_stats(
        df=df,
        project=project,
        x_column_name="open_prs_exact",
        y_column_names=["code|design-debt", "test-debt", "requirement-debt", "documentation-debt"],
        y_axis_label="number of open pull requests",
        plot_path="plots/rq2/prs",
    )

# display all rows collected so far
statistics

In [None]:
"""
dependence of commit frequency on every type of satd
"""

# df and projects come from the rq2 preparation cell
# one plot per project: "commits_per_14days" (vertical axis) against each debt type (horizontal axis)
for project in projects:
    regression_and_stats(
        df=df,
        project=project,
        x_column_name="commits_per_14days",
        y_column_names=["code|design-debt", "test-debt", "requirement-debt", "documentation-debt"],
        y_axis_label="number of commits per 14 days",
        plot_path="plots/rq2/commit_frequency",
    )

# display all rows collected so far
statistics

In [None]:
"""
dependence of release frequency on every type of satd
"""

# df and projects come from the rq2 preparation cell
# one plot per project: "releases_per_60days" (vertical axis) against each debt type (horizontal axis)
for project in projects:
    regression_and_stats(
        df=df,
        project=project,
        x_column_name="releases_per_60days",
        y_column_names=["code|design-debt", "test-debt", "requirement-debt", "documentation-debt"],
        y_axis_label="number of releases per 60 days",
        plot_path="plots/rq2/release_frequency",
    )

# display all rows collected so far
statistics

In [None]:
"""
Execute all cells sequentially before running this cell.
It stores all Kendall and Spearman values together in the file data/statistics.csv
"""

# statistics holds every row appended since it was created in the first cell, so re-running an
# analysis cell adds its rows a second time
statistics.to_csv(path_or_buf="data/statistics.csv")