In [None]:
import os
import json
import pandas as pd
from neo4j import GraphDatabase
from pandas import DataFrame

In [None]:
# Root data directory: later cells list its sub-directories and read
# "success_project.csv" from it. Left blank here; set it before running.
# NOTE(review): this name shadows the built-in `dir()`; it is kept because
# other cells (and ComparedResult) reference it by this name.
dir = ""

In [None]:
def list_directories(path):
    """Return the paths (``path`` joined with the entry name) of the
    immediate sub-directories of ``path``; plain files are skipped."""
    found = []
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        if os.path.isdir(candidate):
            found.append(candidate)
    return found

def list_files(directory):
    """Return the bare names (not full paths) of the regular files that sit
    directly inside ``directory``; sub-directories are skipped."""
    names = []
    for entry in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, entry)):
            names.append(entry)
    return names

def read_json_file(file_path):
    """Parse the JSON document stored at ``file_path`` and return it.

    The file is decoded explicitly as UTF-8 (the interchange encoding of
    JSON) instead of the platform's locale default, so non-ASCII content is
    read the same way on every machine.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)  # Parse JSON data

def convert_dependency_format_to_list(data):
    """Reduce dependency records to their distinct ``group:artifact`` ids.

    Each item of ``data`` is indexed with ``['artifact']``. Values without a
    ``:`` are ignored; for the others only the first two ``:``-separated
    parts are kept. The result is de-duplicated and its order is unspecified.
    """
    unique_ids = {
        ":".join(item['artifact'].split(':')[0:2])
        for item in data
        if ":" in item['artifact']
    }
    return list(unique_ids)

In [None]:
from datetime import datetime
import ast

class ComparedResult:
    """Release-timestamp lookups backed by a "compared result" CSV file.

    The CSV is read once in ``__init__``. A row is located through its
    ``artifact_id`` column (``group:artifact``); its ``versions`` and ``tags``
    columns hold Python-literal lists of dicts. ``versions`` entries are read
    through their ``name`` and ``ts`` keys, ``tags`` entries through ``name``
    and ``date`` (``YYYY-MM-DDTHH:MM:SSZ``). Tag dates are converted to epoch
    milliseconds; ``ts`` values are presumably already in that unit
    (TODO confirm against the CSV producer).

    ``get_first_timestamp`` and ``get_current_version_index`` additionally
    rely on the notebook-level ``dir`` variable and ``list_files`` helper.
    """

    # ``%z`` accepts the literal ``Z`` suffix (Python 3.7+) and yields an aware
    # UTC datetime, so the epoch value no longer depends on the local timezone.
    _TAG_DATE_FORMAT = "%Y-%m-%dT%H:%M:%S%z"

    def __init__(self, csv_file):
        """Load ``csv_file`` into ``self.df`` with ``pandas.read_csv``."""
        self.csv_file = csv_file
        self.df = pd.read_csv(csv_file)

    @staticmethod
    def _split_release_id(release_id):
        """Split ``group:artifact:...:version`` into (``group:artifact``, version)."""
        parts = release_id.split(":")
        return ":".join(parts[0:2]), parts[-1]

    @classmethod
    def _tag_timestamp_ms(cls, iso_date):
        """Convert a ``...Z`` tag date into whole-second epoch milliseconds (UTC)."""
        return int(datetime.strptime(iso_date, cls._TAG_DATE_FORMAT).timestamp()) * 1000

    def _load_release_lists(self, artifact_id):
        """Return ``(versions, tags)`` parsed from the first row of ``artifact_id``.

        Raises:
            IndexError: if no row has that ``artifact_id`` (same exception type
                as the former bare ``iloc[0]``, now with a readable message).
        """
        matches = self.df[self.df['artifact_id'] == artifact_id]
        if matches.empty:
            raise IndexError(f"artifact {artifact_id!r} not found in {self.csv_file!r}")
        row = matches.iloc[0]
        return ast.literal_eval(row['versions']), ast.literal_eval(row['tags'])

    def _sorted_analysed_versions(self, artifact_id):
        """Return ``{"name", "ts"}`` dicts of the analysed releases, oldest first.

        A release counts as analysed when a file named ``<version>.json``
        (more precisely: whose name up to ``.json`` equals the version) exists
        in ``dir + "/" + artifact_id``. Entries from ``versions`` win over
        ``tags`` entries of the same name.
        """
        versions, tags = self._load_release_lists(artifact_id)
        analysed = {file.split(".json")[0] for file in list_files(dir + "/" + artifact_id)}

        stamped = []
        seen_names = set()
        for entry in versions:
            if entry["name"] in analysed:
                stamped.append({"name": entry["name"], "ts": int(entry["ts"])})
                seen_names.add(entry["name"])

        # Fall back to tags only for releases not already covered above.
        for entry in tags:
            name = entry["name"]
            if name in analysed and name not in seen_names:
                stamped.append({"name": name, "ts": self._tag_timestamp_ms(entry["date"])})
                seen_names.add(name)

        return sorted(stamped, key=lambda item: item["ts"])

    def get_timestamp(self, release_id):
        """Return the timestamp of ``release_id`` or ``None`` if it is unknown.

        ``versions`` is searched first (its ``ts`` is returned as ``int``, in
        line with the other methods of this class), then ``tags`` (date
        converted to epoch milliseconds, interpreted as UTC).

        Raises:
            IndexError: if the artifact itself has no row in the CSV.
        """
        artifact_id, version = self._split_release_id(release_id)
        versions, tags = self._load_release_lists(artifact_id)

        for entry in versions:
            if entry["name"] == version:
                return int(entry["ts"])

        # Check in older_tags
        for entry in tags:
            if entry["name"] == version:
                return self._tag_timestamp_ms(entry["date"])

        # If not found
        return None

    def get_first_timestamp(self, release_id):
        """Return the timestamp of the oldest analysed release of the artifact.

        Returns ``None`` when no analysed release can be matched to a known
        version or tag (previously this raised ``IndexError`` on an empty list).
        """
        artifact_id, _ = self._split_release_id(release_id)
        ordered = self._sorted_analysed_versions(artifact_id)
        if not ordered:
            return None
        return ordered[0]["ts"]

    def get_current_version_index(self, release_id):
        """Return the 0-based position of ``release_id`` among the analysed
        releases of its artifact ordered by timestamp, or ``None`` if absent."""
        artifact_id, version = self._split_release_id(release_id)
        for position, entry in enumerate(self._sorted_analysed_versions(artifact_id)):
            if entry["name"] == version:
                return position
        return None

In [None]:
class Neo4jDriver:
    """Small wrapper around a Neo4j driver used to look up release timestamps."""

    def __init__(self, uri, user, password):
        """Create the underlying driver for ``uri`` with basic ``(user, password)`` auth."""
        credentials = (user, password)
        self.driver = GraphDatabase.driver(uri, auth=credentials)

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def get_timestamp(self, release_id):
        """Return ``r.timestamp`` of the first ``Release`` node whose ``id``
        equals ``release_id``.

        All records are consumed before the first one is picked; an
        ``IndexError`` is raised when the query returns no record.
        """
        cypher = """
        MATCH (r:Release {id: $release_id})
        RETURN r.timestamp
        """
        with self.driver.session() as session:
            timestamps = []
            for record in session.run(cypher, release_id=release_id):
                timestamps.append(record["r.timestamp"])
            return timestamps[0]

In [None]:
class ProjectReport:
    """Dependency-usage summary of one analysed release.

    Attributes set by ``__init__`` (each a list of ``group:artifact`` ids,
    except ``artifact``):
        artifact: release id, ``group:artifact:version``.
        introduce_unused_dependency: unused dependencies new in this release.
        introduce_unused_dependency_but_import: subset of the above that also
            appears among the unused imports.
        unused_previously_used: unused dependencies that the previous release used.
        unused_previously_used_but_import: subset of the above that also
            appears among the unused imports.
        unused_dependency: all unused dependencies of the release.
        unused_but_import: unused dependencies that appear among the unused imports.
        use_dependency: used dependencies.
        use_transitive_dependency: used transitive dependencies.

    The list attributes are deliberately NOT declared at class level any more:
    a class-level ``[]`` is a single object shared by every instance.
    """

    # Immutable class-level defaults for attributes ``__init__`` does not set;
    # ``timestamp`` is assigned on the instances by later cells.
    artifact = ""
    dir = ""
    timestamp = ""

    def __init__(self, artifact, introduce_unused_dependency, introduce_unused_dependency_but_import, unused_previously_used, unused_previously_used_but_import, unused_dependency, unused_but_import, use_dependency, use_transitive_dependency):
        self.artifact = artifact
        self.introduce_unused_dependency = introduce_unused_dependency
        self.introduce_unused_dependency_but_import = introduce_unused_dependency_but_import
        self.unused_previously_used = unused_previously_used
        self.unused_previously_used_but_import = unused_previously_used_but_import
        self.unused_dependency = unused_dependency
        self.unused_but_import = unused_but_import
        self.use_dependency = use_dependency
        self.use_transitive_dependency = use_transitive_dependency

    def convert_to_df(self):
        """Return a one-row DataFrame holding the artifact id, the version and
        the length of every dependency list of this report."""
        artifact_id = ":".join(self.artifact.split(":")[0:2])
        version = self.artifact.split(":")[2]
        data = {
            "artifact": artifact_id,
            "version": version,
            "introduce_unused_dependency": len(self.introduce_unused_dependency),
            "introduce_unused_dependency_but_import": len(self.introduce_unused_dependency_but_import),
            "introduce_unused_previously_used": len(self.unused_previously_used),
            "introduce_unused_but_import_previously_used": len(self.unused_previously_used_but_import),
            "unused_dependency": len(self.unused_dependency),
            "unused_dependency_but_import": len(self.unused_but_import),
            "use_dependency": len(self.use_dependency),
            "use_transitive_dependency": len(self.use_transitive_dependency),
        }
        return pd.DataFrame(data, index=[0])
        

In [None]:
# Show the sub-directories found directly under the data root `dir`.
list_directories(dir)

In [None]:
# Table of the projects to analyse, read from the data root.
success_project = pd.read_csv(dir + "/success_project.csv")
# success_project = pd.read_csv("temp.csv")

# The artifact ids, in file order; taken straight from the column instead of
# looping over the rows with iterrows().
success_path_list = success_project["artifact_id"].tolist()

success_path_list

In [None]:
# Number of artifact ids taken from success_project.
len(success_path_list)

In [None]:
# Display the loaded success_project table.
success_project

In [None]:
# success_project.to_csv("temp.csv")

In [None]:
# driver = Neo4jDriver("bolt://localhost:7687", "neo4j", "12345678")
# NOTE(review): the CSV path is blank in this notebook - set it before running.
compare_result = ComparedResult("")


def _artifact_ids(dependencies):
    """Distinct ``group:artifact`` ids of a dependency list, sorted so that the
    order of the generated reports does not depend on set/hash ordering."""
    return sorted(convert_dependency_format_to_list(dependencies))


# Keep only the project directories whose folder name is a successful artifact id.
# os.path.basename is used instead of split("/") so this also works with
# non-POSIX path separators.
success_artifacts = set(success_path_list)
filtered_directories = [
    directory
    for directory in list_directories(dir)
    if os.path.basename(directory) in success_artifacts
]

# project id ("group:artifact") -> list of ProjectReport, oldest release first
project_result = {}
for project_dir in filtered_directories:
    list_of_files = list_files(project_dir)
    if not list_of_files:
        # No per-release report in this directory; indexing the first file
        # used to raise IndexError here.
        continue

    # Load every per-release report together with its release timestamp.
    data_with_timestamp = []
    for file in list_of_files:
        data = read_json_file(os.path.join(project_dir, file))
        timestamp = compare_result.get_timestamp(data["projectArtifact"])
        data_with_timestamp.append((data, timestamp))

    # The project key comes from the first listed file (taken before sorting,
    # and without reading that file a second time).
    project_id = ":".join(data_with_timestamp[0][0]["projectArtifact"].split(":")[0:2])

    # Oldest release first. NOTE: a release without a timestamp (None) makes
    # this sort raise TypeError, exactly as before.
    data_with_timestamp.sort(key=lambda x: x[1])

    project_report = []
    # State of the previous release; empty for the first one, which makes every
    # unused dependency of the first release count as "introduced".
    previously_declared = set()
    previously_used = set()
    for current_data, _ in data_with_timestamp:
        artifact_id = current_data["projectArtifact"]
        converted_unused_dependency = _artifact_ids(current_data["projectUnusedDependencies"])
        converted_use_dependency = _artifact_ids(current_data["projectUseDependencies"])
        converted_use_transitive_dependency = _artifact_ids(current_data["projectUseTransitiveDependencies"])

        # Artifacts referenced by at least one unused import in any file.
        unused_import_artifact = set(convert_dependency_format_to_list([
            unused_import["artifact"]
            for import_report in current_data["fileImportReports"]
            for unused_import in import_report["unusedImportReport"]
        ]))

        # Count unused but imported
        unused_but_imported = [
            unused_dependency
            for unused_dependency in converted_unused_dependency
            if unused_dependency in unused_import_artifact
        ]

        # Count introduce unused dependency
        introduce_unused = []
        introduce_unused_but_import = []
        unused_previously_used = []
        unused_but_import_previously_used = []
        for unused_dep in converted_unused_dependency:
            if unused_dep not in previously_declared:
                # Not declared at all in the previous release: newly introduced.
                introduce_unused.append(unused_dep)
                if unused_dep in unused_import_artifact:
                    introduce_unused_but_import.append(unused_dep)
            elif unused_dep in previously_used:
                # Was used (directly or transitively) before, unused now.
                unused_previously_used.append(unused_dep)
                if unused_dep in unused_import_artifact:
                    unused_but_import_previously_used.append(unused_dep)

        report = ProjectReport(artifact_id, introduce_unused, introduce_unused_but_import, unused_previously_used, unused_but_import_previously_used, converted_unused_dependency, unused_but_imported, converted_use_dependency, converted_use_transitive_dependency)
        project_report.append(report)

        # Remember this release for the comparison done on the next one.
        previously_used = set(converted_use_dependency) | set(converted_use_transitive_dependency)
        previously_declared = previously_used | set(converted_unused_dependency)
    project_result[project_id] = project_report

print(len(project_result))

In [None]:
# One single-row frame per release report, across all projects.
df_list = [
    release_report.convert_to_df()
    for release_reports in project_result.values()
    for release_report in release_reports
]

# Stack them and add two derived totals per release.
concat_df = pd.concat(df_list, axis=0, ignore_index=True).assign(
    all_introduce=lambda frame: frame["introduce_unused_dependency"] + frame["introduce_unused_previously_used"],
    all_dependency=lambda frame: frame["use_dependency"] + frame["use_transitive_dependency"] + frame["unused_dependency"],
)
concat_df

In [None]:
# Per-artifact aggregation over all of its analysed releases.
# Each keyword is an output column given as (source column, aggregation).
# The former `concat_df_result["version_count"] = concat_df_result["version_count"]`
# self-assignment was a no-op and has been dropped.
concat_df_result = concat_df.groupby("artifact").agg(
    introduce_unused_dependency_sum=("introduce_unused_dependency", "sum"),
    introduce_unused_dependency_but_import_sum=("introduce_unused_dependency_but_import", "sum"),
    introduce_unused_previously_use=("introduce_unused_previously_used", "sum"),
    introduce_unused_but_import_previously_used=("introduce_unused_but_import_previously_used", "sum"),
    total_unused_dependency=("unused_dependency", "sum"),
    total_unused_dependency_but_import=("unused_dependency_but_import", "sum"),
    total_use_transitive_dependency=("use_transitive_dependency", "sum"),
    median_unused_dependency=("unused_dependency", "median"),
    average_unused_dependency=("unused_dependency", "mean"),
    average_unused_dependency_but_import=("unused_dependency_but_import", "mean"),
    average_use_transitive_dependency=("use_transitive_dependency", "mean"),
    dependency_count=("all_dependency", "mean"),
    version_count=("version", "count"),
)

# Display the result
# concat_df_result.to_csv("data/temp.csv")
concat_df_result

In [None]:
# Column-wise summary of the per-artifact table; each entry maps the output
# column label to the DataFrame reduction that produces it.
summary_reducers = {"min": "min", "mean": "mean", "median": "median", "max": "max", "total": "sum"}
statistics = {
    label: getattr(concat_df_result, reducer)(numeric_only=True)
    for label, reducer in summary_reducers.items()
}
stats_df = pd.DataFrame(statistics)

# Display the statistics table
stats_df

In [None]:
# Column-wise summary of the per-release table (one row per analysed release);
# each entry maps the output column label to the DataFrame reduction used.
summary_reducers = {"min": "min", "mean": "mean", "median": "median", "max": "max", "total": "sum"}
statistics = {
    label: getattr(concat_df, reducer)(numeric_only=True)
    for label, reducer in summary_reducers.items()
}

stats_df = pd.DataFrame(statistics)

# Display the statistics table
stats_df

In [None]:
# Per-artifact totals. numeric_only=True keeps the string "version" column out
# of the sum (recent pandas would otherwise concatenate the version strings).
group_df = concat_df.groupby("artifact").sum(numeric_only=True)
# Share of artifacts that introduced at least one unused dependency.
print(len(group_df[group_df["all_introduce"] > 0])/len(group_df))

In [None]:
# Fraction of the rows of concat_df (one row per analysed release) whose
# all_introduce count is greater than zero.
len(concat_df[concat_df["all_introduce"] > 0])/len(concat_df)

In [None]:
# Number of rows of group_df, i.e. of distinct "artifact" groups.
print(len(group_df))

In [None]:
# Keep only the projects that have at least one row in concat_df.
# The artifact ids are collected once into a set instead of rebuilding and
# scanning a list for every project.
artifacts_with_rows = set(concat_df["artifact"])
rq2_project_result = {
    k: v
    for k, v in project_result.items()
    if k in artifacts_with_rows
}

## RQ2

In [None]:
class ArtifactExistReport:
    """Fate of one unused dependency from the release where it became unused.

    Attributes:
        project_id: ``group:artifact`` of the project.
        artifact_id: ``group:artifact`` of the unused dependency.
        from_ts / to_ts: timestamps of the release where it became unused and
            of the release where it was resolved (``to_ts`` is ``None`` while
            unresolved).
        from_version / to_version: the matching version strings.
        version_different: number of releases between the two (``None`` while
            unresolved).
        last_state: state label supplied by the caller.
    """

    def __init__(self, project_id, artifact_id, from_ts, to_ts, from_version, to_version, version_different, last_state):
        self.project_id = project_id
        self.artifact_id = artifact_id
        self.from_ts = from_ts
        self.to_ts = to_ts
        self.last_state = last_state
        self.from_version = from_version
        self.to_version = to_version
        self.version_different = version_different

    def convert_to_df(self):
        """Return the report as a one-row DataFrame with an extra ``duration``
        column (``to_ts - from_ts``, or ``None`` when ``to_ts`` is ``None``)."""
        duration = None
        # Identity check: `!= None` would dispatch to a custom __ne__.
        if self.to_ts is not None:
            duration = self.to_ts - self.from_ts
        data = {
            "project_id": self.project_id,
            "artifact": self.artifact_id,
            "from_ts": self.from_ts,
            "to_ts": self.to_ts,
            "last_state": self.last_state,
            "from_version": self.from_version,
            "to_version": self.to_version,
            "version_different": self.version_different,
            "duration": duration,
        }
        return pd.DataFrame(data, index=[0])
    

In [None]:
# driver = Neo4jDriver("bolt://localhost:7687", "neo4j", "12345678")


def find_evolution(current_index, last_index, project_result, unused_artifact_id, project_id):
    """Follow one unused dependency through the releases after ``current_index``.

    ``project_result`` is indexed by position; positions
    ``current_index + 1 .. last_index`` are scanned in order. The first
    release in which the dependency is no longer listed as unused decides the
    outcome:

    * ``"use later"`` - it is among the used or used-transitive dependencies;
    * ``"removed"``   - it is in none of the three lists.

    If it stays unused in every scanned release the state is ``"stay there"``
    and the end timestamp, end version and version distance are ``None``.

    Returns:
        ArtifactExistReport describing the start release and the outcome.
    """
    origin = project_result[current_index]
    from_ts = origin.timestamp
    from_version = origin.artifact.split(":")[2]

    for position in range(current_index + 1, last_index + 1):
        later = project_result[position]
        if unused_artifact_id in later.unused_dependency:
            # Still declared but unused - keep looking.
            continue
        is_used = (
            unused_artifact_id in later.use_dependency
            or unused_artifact_id in later.use_transitive_dependency
        )
        to_version = later.artifact.split(":")[2]
        last_state = "use later" if is_used else "removed"
        return ArtifactExistReport(project_id, unused_artifact_id, from_ts, later.timestamp, from_version, to_version, position - current_index, last_state)

    return ArtifactExistReport(project_id, unused_artifact_id, from_ts, None, from_version, None, None, "stay there")
        

# project id -> ArtifactExistReport list, one per dependency that became unused
how_long_exist = {}
for project_id, project_reports in rq2_project_result.items():
    # Attach the release timestamp to every report first; find_evolution reads it.
    for project_report in project_reports:
        project_report.timestamp = compare_result.get_timestamp(project_report.artifact)

    last_index = len(project_reports) - 1
    # Per release: newly introduced unused dependencies first, then the ones
    # that were used in the previous release.
    result = [
        find_evolution(index, last_index, project_reports, current_unused, project_id)
        for index, project_report in enumerate(project_reports)
        for current_unused in project_report.introduce_unused_dependency + project_report.unused_previously_used
    ]
    if result:
        how_long_exist[project_id] = result

In [None]:
# Display the mapping of project id to its ArtifactExistReport list.
how_long_exist

In [None]:
# Flatten every ArtifactExistReport of every project into one single-row frame each.
report_df_list = [
    artifact_report.convert_to_df()
    for project_reports_for_one in how_long_exist.values()
    for artifact_report in project_reports_for_one
]

artifact_reports = pd.concat(report_df_list, axis=0, ignore_index=True)
artifact_reports

In [None]:
# Number of reports per final state; the keyword is the output column,
# given as (source column, aggregation).
group_by_state_artifact_report = (
    artifact_reports
    .groupby("last_state")
    .agg(count=("artifact", "count"))
)

In [None]:
# Display the per-state report counts.
group_by_state_artifact_report

# Find first introduction date

In [None]:
from datetime import datetime

In [None]:
# One record per (release, dependency that became unused in that release).
result = []
compare_result = ComparedResult("data_with_date/success-compared-result-2.csv")
for project_id, project_reports in rq2_project_result.items():
    # Refresh the release timestamps from the CSV loaded above.
    for project_report in project_reports:
        project_report.timestamp = compare_result.get_timestamp(project_report.artifact)

    for project_report in project_reports:
        # Newly introduced unused dependencies first, then previously used ones.
        newly_unused = project_report.introduce_unused_dependency + project_report.unused_previously_used
        if not newly_unused:
            # Nothing to record: skip the two lookups below, which re-parse the
            # CSV row and list the artifact directory.
            continue

        current_index = compare_result.get_current_version_index(project_report.artifact)
        first_ts = compare_result.get_first_timestamp(project_report.artifact)
        # 1-based position of the release; stays None when the release cannot
        # be located (adding 1 to None used to raise TypeError).
        version_number = None if current_index is None else current_index + 1

        for current_unused in newly_unused:
            result.append({"project_id": project_id, "artifact_id": project_report.artifact, "unused_dependency": current_unused, "timestamp": project_report.timestamp, "first_timestamp": first_ts, "version": version_number})
    
    

In [None]:
# Turn the list of record dicts in `result` into a table, one row per record.
df = pd.DataFrame(result)
df

In [None]:
# Elapsed time between the first analysed release ("first_timestamp") and the
# release that introduced the unused dependency ("timestamp"); both columns are
# epoch milliseconds. The conversion is vectorized and done in UTC, so the
# result no longer depends on the timezone of the machine running the notebook
# (the previous row-by-row version used local time via datetime.fromtimestamp).
first_seen = pd.to_datetime(pd.to_numeric(df["first_timestamp"]), unit="ms")
introduced = pd.to_datetime(pd.to_numeric(df["timestamp"]), unit="ms")

year_diff = introduced.dt.year - first_seen.dt.year
month_diff = introduced.dt.month - first_seen.dt.month

# Whole days between the two releases.
df["diff_days"] = (introduced - first_seen).dt.days
# 1-based calendar-year bucket: 1 means "same calendar year as the first release".
df["diff_year"] = year_diff + 1
# Calendar-month difference (days of the month are ignored).
df["diff_month"] = year_diff * 12 + month_diff

In [None]:
# Display the table including the diff_days / diff_year / diff_month columns.
df

In [None]:
# Percentage of rows with diff_year == 1, i.e. the introducing release falls in
# the same calendar year as the first analysed release (diff_year is year difference + 1).
len(df[df["diff_year"] == 1])/len(df) * 100

In [None]:
# Percentage of rows with version == 1, i.e. the introducing release is the
# first analysed one (version is the 0-based release index + 1).
len(df[df["version"] == 1]) / len(df) * 100

## Resolved artifact report

In [None]:
# Reports whose dependency was eventually resolved (removed or used later),
# reduced to the duration in days and the number of releases it took.
# .assign() builds a new frame, so no column is written into a filtered slice
# of artifact_reports (which triggered SettingWithCopyWarning).
MS_PER_DAY = 1000 * 60 * 60 * 24  # "duration" is in milliseconds
resolved_artifact_report = (
    artifact_reports.loc[artifact_reports["last_state"].isin(["removed", "use later"])]
    .assign(duration_day=lambda frame: frame["duration"] / MS_PER_DAY)
    [["duration_day", "version_different"]]
)
resolved_artifact_report

In [None]:
# min / mean / median / max of each column of resolved_artifact_report ...
stats_table = resolved_artifact_report.agg(['min', 'mean', 'median', 'max'])

# ... transposed so that each original column becomes a row.
stats_table_transposed = stats_table.T
stats_table_transposed

## Removed artifact report

In [None]:
# Display the full rows of the reports whose final state is "removed".
artifact_reports[artifact_reports["last_state"] == "removed"]

In [None]:
# Reports whose dependency was removed without ever being used, reduced to the
# duration in days and the number of releases it took.
# .assign() builds a new frame, so no column is written into a filtered slice
# of artifact_reports (which triggered SettingWithCopyWarning).
MS_PER_DAY = 1000 * 60 * 60 * 24  # "duration" is in milliseconds
removed_artifact_report = (
    artifact_reports.loc[artifact_reports["last_state"] == "removed"]
    .assign(duration_day=lambda frame: frame["duration"] / MS_PER_DAY)
    [["duration_day", "version_different"]]
)
removed_artifact_report

In [None]:
# Write the table (including its index) to a CSV file in the working directory.
removed_artifact_report.to_csv("removed_artifact_report.csv")

In [None]:
# min / mean / median / max of each column of removed_artifact_report ...
stats_table = removed_artifact_report.agg(['min', 'mean', 'median', 'max'])

# ... transposed so that each original column becomes a row.
stats_table_transposed = stats_table.T
stats_table_transposed

## Use later artifact report

In [None]:
# Display the full rows of the reports whose final state is "use later".
artifact_reports[artifact_reports["last_state"] == "use later"]

In [None]:
# Reports whose dependency ended up being used in a later release, reduced to
# the duration in days and the number of releases it took.
# .assign() builds a new frame, so no column is written into a filtered slice
# of artifact_reports (which triggered SettingWithCopyWarning).
MS_PER_DAY = 1000 * 60 * 60 * 24  # "duration" is in milliseconds
use_later_artifact_report = (
    artifact_reports.loc[artifact_reports["last_state"] == "use later"]
    .assign(duration_day=lambda frame: frame["duration"] / MS_PER_DAY)
    [["duration_day", "version_different"]]
)
use_later_artifact_report

In [None]:
# Write the table (including its index) to a CSV file in the working directory.
use_later_artifact_report.to_csv("use_later_artifact_report.csv")

In [None]:
# min / mean / median / max of each column of use_later_artifact_report ...
stats_table = use_later_artifact_report.agg(['min', 'mean', 'median', 'max'])

# ... transposed so that each original column becomes a row.
stats_table_transposed = stats_table.T
stats_table_transposed

## Mann-Whitney U test

In [None]:
!pip install scipy

In [None]:
import scipy.stats._mannwhitneyu

In [None]:
from scipy.stats import mannwhitneyu

# Two-sided Mann-Whitney U test on the resolution time in days:
# "use later" dependencies versus "removed" dependencies.
use_later_days = use_later_artifact_report['duration_day'].tolist()
removed_days = removed_artifact_report['duration_day'].tolist()
u_statictic, p_value = mannwhitneyu(use_later_days, removed_days, alternative='two-sided')
print(u_statictic, p_value)

In [None]:
# Two-sided Mann-Whitney U test on the number of releases until resolution:
# "use later" dependencies versus "removed" dependencies.
use_later_versions = use_later_artifact_report['version_different'].tolist()
removed_versions = removed_artifact_report['version_different'].tolist()
u_statictic, p_value = mannwhitneyu(use_later_versions, removed_versions, alternative='two-sided')
print(u_statictic, p_value)

## Plot graph

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Wide-form frame with one column per outcome; the two Series are aligned on
# their index, so each row carries a value in only one of the columns.
version_different_violin_plot = DataFrame({
    "Never used and removed": removed_artifact_report["version_different"],
    "Use later": use_later_artifact_report["version_different"],
})

# The theme must be set before the figure is created for it to apply.
sns.set_theme(style="whitegrid")
# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
sns.violinplot(data=version_different_violin_plot, ax=ax)
# Label the value axis so the figure can be read on its own.
ax.set_ylabel("Releases until resolved")
ax.grid(axis='y', linestyle='--', alpha=0.5)
sns.despine(ax=ax)
# The count cannot be negative; start the axis at 0.
ax.set_ylim(0, None)
fig.tight_layout()
plt.show()

In [None]:
# Wide-form frame with one column per outcome; the two Series are aligned on
# their index, so each row carries a value in only one of the columns.
version_duration_plot = DataFrame({
    "Never used and removed": removed_artifact_report["duration_day"],
    "Use later": use_later_artifact_report["duration_day"],
})

# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
sns.violinplot(data=version_duration_plot, ax=ax)
# Label the value axis so the figure can be read on its own.
ax.set_ylabel("Days until resolved")
# Start the axis at 0.
ax.set_ylim(0, None)
plt.show()

In [None]:
# Sanity check: rows with a negative release distance. find_evolution only
# scans releases after the starting one, so this is expected to be empty.
removed_artifact_report[removed_artifact_report["version_different"] < 0]