In [None]:
import csv

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas import DataFrame
from scipy import stats
from scipy.stats._stats_py import SignificanceResult
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures


def load_masterdata():
    """
    Load the master dataset and prepare it for the per-project analyses.

    Reads ``data/master-dataset.csv``, adds a ``total_cumsum`` column containing the
    running sum of ``total`` (accumulated separately for every project, in file
    order) and converts ``timestamp`` into a pandas interpretable datetime.

    :return: the dataframe, a sorted list of all projects from the dataframe
    """
    df = pd.read_csv("data/master-dataset.csv")
    # Accumulate per project: a plain cumsum over the whole file would carry the
    # totals of all preceding projects into every later project's rows.
    df["total_cumsum"] = df.groupby("project", sort=False)["total"].cumsum()
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    return df, sorted(df["project"].unique())


def regression_and_stats(df: DataFrame, project: str, y_axis_label: str, plot_path: str, x_column_name: str, y_column_names: list[str]):
    """
    Correlate and plot one metric against one or more SATD columns for a single project.

    For every column in ``y_column_names`` the function computes Kendall's tau and
    Spearman's rank correlation against ``x_column_name``, fits a degree-3 polynomial
    regression, prints the statistics and the regression MSE, and draws a scatter
    plot plus the fitted curve onto the current matplotlib figure. The figure is
    saved as ``{plot_path}/{project}.png``, shown and then closed.

    Note on naming: despite their names, the ``y_column_names`` columns are the
    regression input and are drawn on the horizontal axis, while ``x_column_name``
    is the regression target and is drawn on the vertical axis.

    :param df: dataframe containing a ``project`` column and all referenced columns
    :param project: value of the ``project`` column selecting the rows to analyse
    :param y_axis_label: label of the vertical axis
    :param plot_path: directory the figure is written to (must already exist)
    :param x_column_name: column used as regression target / vertical axis
    :param y_column_names: columns used, one after another, as regression input / horizontal axis
    :return: dict mapping every entry of ``y_column_names`` to ``[kendalltau result, spearmanr result]``
    """
    project_df = df[df["project"] == project]

    # Uses the pyplot state machine on purpose: callers configure the current axes
    # (e.g. plt.xscale("log")) before calling this function.
    plt.xlabel("number of satds in code")
    plt.ylabel(y_axis_label)
    plt.title(project)

    result = {}

    for y_column_name in y_column_names:
        print(y_column_name)
        kendal: SignificanceResult = stats.kendalltau(project_df[y_column_name], project_df[x_column_name])
        spearman: SignificanceResult = stats.spearmanr(project_df[y_column_name], project_df[x_column_name])
        result[y_column_name] = [kendal, spearman]
        print("Kendal",  kendal)
        print("Spearman", spearman)

        # Sort whole rows by the regression input so that input and target stay
        # paired; sorting only the input column would fit the model on mismatched
        # (input, target) pairs. The sorted order also lets the fitted curve be
        # drawn left to right.
        ordered = project_df.sort_values(y_column_name)
        regression_input = ordered[y_column_name].to_numpy().reshape(-1, 1)
        target = ordered[x_column_name].to_numpy().reshape(-1, 1)

        poly = PolynomialFeatures(degree=3)
        x_poly = poly.fit_transform(regression_input)
        model = LinearRegression()
        model.fit(x_poly, target)
        predicted = model.predict(x_poly)
        print("MSE", mean_squared_error(target, predicted))

        plt.scatter(ordered[y_column_name], ordered[x_column_name], s=3, label=y_column_name)
        plt.plot(ordered[y_column_name], predicted)

    plt.legend()
    plt.savefig(f"{plot_path}/{project}.png")
    plt.show()
    # Release the figure so the next call starts on fresh axes even with backends
    # where show() does not close it.
    plt.close()
    return result

# RQ1

In [None]:
df, projects = load_masterdata()

# RQ1: commits per 14 days against the cumulative total, one saved figure per project.
for project in projects:
    regression_and_stats(
        df,
        project,
        y_axis_label="number of commits per 2 weeks",
        plot_path="plots/commit_frequency",
        x_column_name="commits_per_14days",
        y_column_names=["total_cumsum"],
    )

In [None]:
"""
dependence on release speed
"""

df = pd.read_csv("data/master-dataset.csv").dropna()
# Accumulate per project so that one project's total does not carry over into the
# rows of the projects that follow it in the file.
df["total_cumsum"] = df.groupby("project", sort=False)["total"].cumsum()

for project in sorted(df["project"].unique()):
    # Sort by the regression input so the fitted curve is drawn left to right
    # instead of jumping back and forth in file order.
    project_df = df[df["project"] == project].sort_values("total_cumsum")
    poly = PolynomialFeatures(degree=3)
    x_poly = poly.fit_transform(project_df["total_cumsum"].to_numpy().reshape(-1, 1))
    y = project_df["releases_per_60days"].to_numpy().reshape(-1, 1)

    model = LinearRegression()
    model.fit(x_poly, y)
    predicted = model.predict(x_poly)
    print("MSE", mean_squared_error(y, predicted))

    plt.xlabel("total number of satds in code")
    plt.ylabel("number of releases per 60 days")
    plt.title(project)
    plt.scatter(project_df["total_cumsum"], project_df["releases_per_60days"], s=3)
    plt.plot(project_df["total_cumsum"], predicted, color="red")
    # Release figures go into their own directory; writing them to
    # plots/commit_frequency would overwrite the commit-frequency figures.
    plt.savefig(f"plots/release_frequency/{project}.png")
    plt.show()

In [None]:
"""
dependence on exact number of open pull requests
"""

df, projects = load_masterdata()
df = df.dropna()

# Keyword arguments matching regression_and_stats' signature: the analysed column
# is passed as x_column_name and the SATD column list is given explicitly.
for project in projects:
    regression_and_stats(
        df,
        project,
        y_axis_label="exact number of open pull requests",
        plot_path="plots/prs-exact",
        x_column_name="open_prs_exact",
        y_column_names=["total_cumsum"],
    )


In [None]:
"""
dependence on number of opened pull requests in a kernel of 7 days
"""

df, projects = load_masterdata()
df = df.dropna()

# Keyword arguments matching regression_and_stats' signature: the analysed column
# is passed as x_column_name and the SATD column list is given explicitly.
for project in projects:
    regression_and_stats(
        df,
        project,
        y_axis_label="opened pull requests in a kernel of 7 days",
        plot_path="plots/prs-per7",
        x_column_name="open_prs_per7",
        y_column_names=["total_cumsum"],
    )


# RQ2

In [None]:
"""
dependency of commit frequency on all four types of SATD
"""
df, projects = load_masterdata()

# Running totals of the four debt columns, restarted for every project so that one
# project's totals do not carry over into the next project's rows.
debt_columns = ["test-debt", "requirement-debt", "documentation-debt", "code|design-debt"]
df[debt_columns] = df.groupby("project", sort=False)[debt_columns].cumsum()

# newline="" lets the csv module control line endings itself (otherwise extra
# blank rows can appear on platforms that translate "\n").
with open("data/stats-commits.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["project", "kendal_v", "kendal_p", "spearman_v", "spearman_p", "y_column", "x_column"])
    x_column = "commits_per_14days"
    for project in projects:
        # Set on the current axes before regression_and_stats draws onto them.
        plt.xscale("log")
        # NOTE(review): plots/commit_frequency is also the target of the RQ1
        # commit-frequency cell, so these figures replace the RQ1 ones - confirm
        # that this is intended.
        statistic = regression_and_stats(df, project, y_axis_label="number of commits per 2 weeks", plot_path="plots/commit_frequency", x_column_name=x_column, y_column_names=debt_columns)
        print(statistic)
        # One CSV row per debt column: Kendall (value, p) followed by Spearman (value, p).
        for y_column in statistic:
            writer.writerow([project, *statistic[y_column][0], *statistic[y_column][1], y_column, x_column])
    
    

In [None]:
"""
dependency of release frequency on all four types of SATD
"""
df, projects = load_masterdata()

# Running totals of the four debt columns, restarted for every project so that one
# project's totals do not carry over into the next project's rows.
debt_columns = ["test-debt", "requirement-debt", "documentation-debt", "code|design-debt"]
df[debt_columns] = df.groupby("project", sort=False)[debt_columns].cumsum()

df.dropna(inplace=True)

# newline="" lets the csv module control line endings itself (otherwise extra
# blank rows can appear on platforms that translate "\n").
with open("data/stats-releases.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["project", "kendal_v", "kendal_p", "spearman_v", "spearman_p", "y_column", "x_column"])
    x_column = "releases_per_60days"
    for project in projects:
        # Set on the current axes before regression_and_stats draws onto them.
        plt.xscale("log")
        statistic = regression_and_stats(df, project, y_axis_label="number of releases per 60 days", plot_path="plots/release_frequency", x_column_name=x_column, y_column_names=debt_columns)
        print(statistic)
        # One CSV row per debt column: Kendall (value, p) followed by Spearman (value, p).
        for y_column in statistic:
            writer.writerow([project, *statistic[y_column][0], *statistic[y_column][1], y_column, x_column])

In [None]:
"""
dependency of open pull requests on all four types of SATD
"""
df, projects = load_masterdata()

# Running totals of the four debt columns, restarted for every project so that one
# project's totals do not carry over into the next project's rows.
debt_columns = ["test-debt", "requirement-debt", "documentation-debt", "code|design-debt"]
df[debt_columns] = df.groupby("project", sort=False)[debt_columns].cumsum()

df.dropna(inplace=True)

# newline="" lets the csv module control line endings itself (otherwise extra
# blank rows can appear on platforms that translate "\n").
with open("data/stats-prs.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["project", "kendal_v", "kendal_p", "spearman_v", "spearman_p", "y_column", "x_column"])
    x_column = "open_prs_exact"
    for project in projects:
        # plt.xscale("log")
        # The vertical axis shows open_prs_exact, so it is labelled as pull requests.
        statistic = regression_and_stats(df, project, y_axis_label="exact number of open pull requests", plot_path="plots/rq2/prs", x_column_name=x_column, y_column_names=debt_columns)
        print(statistic)
        # One CSV row per debt column: Kendall (value, p) followed by Spearman (value, p).
        for y_column in statistic:
            writer.writerow([project, *statistic[y_column][0], *statistic[y_column][1], y_column, x_column])