In [None]:
import pandas as pd
import io
import csv

In [None]:
# Read the CSV by hand so that rows whose field count differs from the header
# are repaired (truncated or padded) instead of aborting the whole load.
rows = []
# newline='' hands newline handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields are misparsed.
with open('/home/mariya/Documents/Year 4/Data Mining/analysis/aggregated_repo_metrics_with_types.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    expected_fields = len(header)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line: there is no row[0] to
            # report and no data to keep, so skip it instead of raising
            # IndexError in the "missing fields" branch.
            continue
        if len(row) > expected_fields:
            print(f"Row with extra fields: {row[0]}")
            row = row[:expected_fields]  # Truncate extra fields
        elif len(row) < expected_fields:
            print(f"Row with missing fields: {row[0]}")
            row = row + [None] * (expected_fields - len(row))  # Pad with None
        rows.append(row)

In [None]:
# Assemble the working frame from the parsed records. Every cell is whatever
# csv.reader produced: a string, or None where a short row was padded.
df = pd.DataFrame(data=rows, columns=header)

In [None]:
# Derive a numeric 'Lines of Code' column from 'Lines of Code/Size' by taking
# the first run of digits in each value; values with no digits become NaN.
# The pattern is a raw string so that \d is not treated as an (invalid) string
# escape, and expand=False makes extract return a Series rather than a
# one-column DataFrame.
# NOTE(review): a value written with thousands separators (e.g. '1,234') would
# yield 1 -- confirm the column's format.
df['Lines of Code'] = (
    df['Lines of Code/Size'].str.extract(r'(\d+)', expand=False).astype(float)
)
# Timestamp columns: anything that does not parse as a number becomes NaN.
df['Average Commit Timestamp (Epoch Sec)'] = pd.to_numeric(df['Average Commit Timestamp (Epoch Sec)'], errors='coerce')
df['Average Refactoring Commit Timestamp (Epoch Sec)'] = pd.to_numeric(df['Average Refactoring Commit Timestamp (Epoch Sec)'], errors='coerce')

In [None]:
# Split the repositories into two frames by their 'Human/AI-Coauthored' label.
# NOTE: these subsets are taken before the metric columns are converted with
# pd.to_numeric further down, so their metric columns still hold the raw CSV
# strings; rebuild them after that conversion if numeric metrics are needed.
ai_repos_df = df.loc[df['Human/AI-Coauthored'].eq('AI-Coauthored')]
human_repos_df = df.loc[df['Human/AI-Coauthored'].eq('Human Written')]

In [None]:
def descriptive_stats(dataframe, group_column, metrics_columns):
    """Summarise the selected metrics for each group.

    Args:
        dataframe: Frame holding both the grouping column and the metrics.
        group_column: Column whose distinct values define the groups.
        metrics_columns: List of column names to summarise.

    Returns:
        A frame indexed by group, with two-level columns of the form
        (metric, statistic) where statistic is one of count, mean,
        median, std.
    """
    summary_functions = ['count', 'mean', 'median', 'std']
    return dataframe.groupby(group_column)[metrics_columns].agg(summary_functions)

In [None]:
# Columns summarised in the descriptive statistics below.
metrics_to_analyze = [
    'Total Refactorings',
    'Refactoring Commits Percentage',
    'Average Time-to-Refactor (sec)',
    'Refactoring Timestamp Difference (days)',
    'Number of Refactoring Contributors',
    'Naming Improvements',
    'Parameter Modifications',
    'Method Composition',
    'Method Movement',
]

# The frame was built from csv.reader output, so these columns start out as
# strings; coerce each one to numbers, with unparseable entries becoming NaN.
for column in metrics_to_analyze:
    df[column] = pd.to_numeric(arg=df[column], errors='coerce')

In [None]:
# Sanity check: how many rows carry each 'Human/AI-Coauthored' label.
label_counts = df['Human/AI-Coauthored'].value_counts()
print(label_counts)

In [None]:
# Summarise every metric per 'Human/AI-Coauthored' label and show the table.
descriptive_stats_df = descriptive_stats(
    dataframe=df,
    group_column='Human/AI-Coauthored',
    metrics_columns=metrics_to_analyze,
)
print(descriptive_stats_df)

In [None]:
# Write the grouped summary to disk as-is (the hierarchical column labels are
# kept), then confirm on stdout.
descriptive_stats_df.to_csv(
    path_or_buf='/home/mariya/Documents/Year 4/Data Mining/analysis/descriptive_stats.csv'
)
print("Descriptive statistics saved to descriptive_stats.csv")

In [None]:
# Second export with single-level column names: each hierarchical column label
# is collapsed into one string by joining its parts with '_'. Work on a copy so
# descriptive_stats_df keeps its original columns.
flat_descriptive_stats = descriptive_stats_df.copy()
flat_descriptive_stats.columns = [
    '_'.join(level_labels).strip()
    for level_labels in flat_descriptive_stats.columns
]
flat_descriptive_stats.to_csv(
    path_or_buf='/home/mariya/Documents/Year 4/Data Mining/analysis/descriptive_stats_flat.csv'
)
print("Flattened descriptive statistics saved to descriptive_stats_flat.csv")