In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas import DataFrame
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures


def load_masterdata(path: str = "data/master-dataset.csv"):
    """
    loads the master dataset, adds a per-project running total of the ``total`` column and
    converts the timestamp into a pandas interpretable datetime

    :param path: location of the CSV file; it has to provide the columns ``project``,
        ``total`` and ``timestamp``. Defaults to the dataset shipped with the notebook.
    :return: the dataframe (with the added ``total_cumsum`` column), a sorted list of all
        projects from the dataframe
    """
    df = pd.read_csv(path)
    # Accumulate inside each project: a plain cumsum over the whole file would carry the
    # totals of other projects into every project's running count. The result keeps the
    # original row index, so it lines up with the frame row by row.
    df["total_cumsum"] = df.groupby("project", sort=False)["total"].cumsum()
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    return df, sorted(df["project"].unique())


def regression_and_stats(df: DataFrame, project: str, column_name: str, y_axis_label: str, plot_path: str):
    """
    relates one metric of a single project to its cumulative SATD count

    Prints the Kendall and Spearman rank correlations between ``total_cumsum`` and
    ``column_name``, fits a degree-3 polynomial regression of ``column_name`` on
    ``total_cumsum``, prints its mean squared error, and saves and shows a scatter plot
    with the fitted curve.

    :param df: dataframe with at least the columns ``project``, ``total_cumsum`` and ``column_name``
    :param project: value of the ``project`` column that selects the rows to analyse
    :param column_name: column used as the dependent variable
    :param y_axis_label: label of the y axis in the plot
    :param plot_path: directory that receives ``<project>.png``; created if it does not exist
    :return: None
    """
    # Sort the rows once so x and y stay paired. Sorting only the x column (as was done
    # before) pairs each sorted x with the y of a different row whenever the cumulative
    # total is not already monotonic. A stable sort keeps ties in their original order.
    project_df = df[df["project"] == project].sort_values("total_cumsum", kind="stable")
    if project_df.empty:
        # Nothing to correlate or fit (e.g. all rows of the project were dropped upstream).
        print(f"no rows for project {project!r}, skipping")
        return

    x_values = project_df["total_cumsum"]
    y_values = project_df[column_name]

    print("Kendal", stats.kendalltau(x_values, y_values))
    print("Spearman", stats.spearmanr(x_values, y_values))

    poly = PolynomialFeatures(degree=3)
    x_poly = poly.fit_transform(x_values.to_numpy().reshape(-1, 1))
    y = y_values.to_numpy().reshape(-1, 1)
    model = LinearRegression()
    model.fit(x_poly, y)
    predicted = model.predict(x_poly)
    print("MSE", mean_squared_error(y, predicted))

    plt.xlabel("total number of satds in code")
    plt.ylabel(y_axis_label)
    plt.title(project)
    plt.scatter(x_values, y_values, s=3)
    # x_values is ascending, so the fitted curve is drawn left to right without zigzags.
    plt.plot(x_values, predicted, color="red")
    # savefig does not create missing directories.
    Path(plot_path).mkdir(parents=True, exist_ok=True)
    plt.savefig(f"{plot_path}/{project}.png")
    plt.show()

In [None]:
df, projects = load_masterdata()

# Cell output: the span from the smallest to the largest cumulative total in the dataset.
total_cumsum = df["total_cumsum"]
range(total_cumsum.min(), total_cumsum.max())

In [None]:
# dependence on commit frequency: one regression and one plot per project
df, projects = load_masterdata()

for project in projects:
    regression_and_stats(
        df,
        project,
        column_name="commits_per_14days",
        y_axis_label="number of commits per 2 weeks",
        plot_path="plots/commit_frequency",
    )

In [None]:
"""
dependence on release speed
"""

# Load through the shared helper so the cumulative SATD count is built from the complete
# history before incomplete rows are removed, exactly like the pull-request cells below.
df, projects = load_masterdata()
df = df.dropna()

# Own output directory: writing to plots/commit_frequency would overwrite the
# commit-frequency figures produced by the previous cell.
release_plot_dir = Path("plots/release_frequency")
release_plot_dir.mkdir(parents=True, exist_ok=True)

# Iterate over the projects that still have rows after dropna().
for project in sorted(df["project"].unique()):
    # Sort by x so the fitted curve is drawn left to right; x and y come from the same
    # sorted frame and therefore stay paired.
    project_df = df[df["project"] == project].sort_values("total_cumsum", kind="stable")
    poly = PolynomialFeatures(degree=3)
    x_poly = poly.fit_transform(project_df["total_cumsum"].to_numpy().reshape(-1, 1))
    y = project_df["releases_per_60days"].to_numpy().reshape(-1, 1)

    model = LinearRegression()
    model.fit(x_poly, y)
    predicted = model.predict(x_poly)
    print("MSE", mean_squared_error(y, predicted))

    plt.xlabel("total number of satds in code")
    plt.ylabel("number of releases per 60 days")
    plt.title(project)
    plt.scatter(project_df["total_cumsum"], project_df["releases_per_60days"], s=3)
    plt.plot(project_df["total_cumsum"], predicted, color="red")
    plt.savefig(release_plot_dir / f"{project}.png")
    plt.show()

In [None]:
"""
dependence on exact number of open pull requests
"""

df, projects = load_masterdata()
df = df.dropna()
# The project list returned above was built before rows with missing values were removed.
# Rebuild it from the cleaned frame so a project without any complete row is not passed
# to the regression with an empty selection.
projects = sorted(df["project"].unique())

for project in projects:
    regression_and_stats(df, project, "open_prs_exact", "exact number of open pull requests", "plots/prs-exact")


In [None]:
"""
dependence on number of opened pull requests in a kernel of 7 days
"""

df, projects = load_masterdata()
df = df.dropna()
# The project list returned above was built before rows with missing values were removed.
# Rebuild it from the cleaned frame so a project without any complete row is not passed
# to the regression with an empty selection.
projects = sorted(df["project"].unique())

for project in projects:
    regression_and_stats(df, project, "open_prs_per7", "opened pull requests in a kernel of 7 days", "plots/prs-per7")
