In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.preprocessing import MinMaxScaler

In [None]:
def read_csv(file):
    """Load a header-less metrics CSV and label its thirteen columns.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per CSV line, with the columns named (in file order)
        RESOLUTION, CHM, CHD, IFN, IRN, OPN, SMQ, SCOH, SCOP, CMQ, CCOH,
        CCOP and SERVICES.
    """
    column_names = [
        'RESOLUTION', 'CHM', 'CHD', 'IFN', 'IRN', 'OPN', 'SMQ',
        'SCOH', 'SCOP', 'CMQ', 'CCOH', 'CCOP', 'SERVICES',
    ]
    metrics_frame = pd.read_csv(file, names=column_names)
    return metrics_frame
# Load a single project's metrics file; the next cells use it as a worked example.
df = read_csv('398907877__AppPortal_21_07_22_06_56_K12.csv')

df

In [None]:
# Normalize data
# Work on a copy so the raw values loaded into `df` are left untouched.
df_norm = df.copy()

def normalize(df):
    """Min-max scale a single column to the [0, 1] interval.

    Despite the parameter name, ``df`` is one column (a ``pandas.Series``).

    Parameters
    ----------
    df : pandas.Series
        Numeric values to scale.

    Returns
    -------
    pandas.Series
        ``(x - min) / (max - min)`` for every element, with the index of the
        input preserved. When all values are equal the range is zero, so a
        series of ones (same index and name) is returned instead.
    """
    df_min = df.min()
    df_max = df.max()
    if df_min != df_max:
        # Vectorised arithmetic gives the same result as an element-wise
        # apply without calling a Python lambda once per row.
        return (df - df_min) / (df_max - df_min)
    # Constant column: scaling would divide by zero, so map every value to 1.
    return pd.Series(1, index=df.index, name=df.name)

def normalize_data(df):
    """Rescale the IFN, IRN, SMQ and CMQ columns of ``df`` with ``normalize``.

    The columns are overwritten in place, so the frame passed in is modified;
    the same frame is returned for convenience.
    """
    for metric in ('IFN', 'IRN', 'SMQ', 'CMQ'):
        df[metric] = normalize(df[metric])
    return df

    
# normalize_data rewrites the IFN/IRN/SMQ/CMQ columns of df_norm in place and returns it.
df_norm = normalize_data(df_norm)


In [None]:
def adjust_values(df):
    """Invert IFN, IRN, SCOP and CCOP by replacing every value x with 1 - x.

    For these four metrics lower raw values mean better results, so the
    inversion makes "higher is better" hold for them as well.

    The columns are overwritten in place and the same frame is returned.
    """
    for metric in ('IFN', 'IRN', 'SCOP', 'CCOP'):
        df[metric] = 1 - df[metric]
    return df
# NOTE: adjust_values overwrites the columns of df_norm in place, so re-running
# this cell on its own applies 1 - x a second time and undoes the inversion.
df_norm = adjust_values(df_norm)
df_norm

In [None]:
def calculate_total(df):
    """Add the aggregate score columns ``TOTAL`` and ``Total`` to ``df``.

    * ``TOTAL`` = IFN + IRN + SMQ + CMQ
    * ``Total`` = SCOH + SCOP + CCOH + CCOP

    The frame is modified in place and returned.
    """
    # Name IFN and IRN explicitly: a label slice 'IFN':'IRN' would silently
    # pull in any column that happens to sit between them.
    df['TOTAL'] = df[['IFN', 'IRN']].sum(axis=1) + df['SMQ'] + df['CMQ']
    # A weighted CHM/CHD/IFN/IRN/SMQ/CMQ combination used to be written to
    # 'Total' and was overwritten by this assignment before anything could
    # read it; that dead store has been removed.
    df['Total'] = df['SCOH'] + df['SCOP'] + df['CCOH'] + df['CCOP']
    return df

# Adds the TOTAL and Total score columns to df_norm (in place) and displays the result.
df_norm = calculate_total(df_norm)
df_norm


In [None]:
def choose_best_iteration_for_resolution(df_norm):
    """Keep, for every RESOLUTION value, only the row with the highest TOTAL.

    When several rows of the same resolution share the highest TOTAL, the one
    that appears last is kept. Surviving rows keep their relative order.

    Parameters
    ----------
    df_norm : pandas.DataFrame
        Must contain the columns ``RESOLUTION`` and ``TOTAL``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per resolution and a fresh 0..n-1 index.
    """
    best = {}          # resolution -> (index label, TOTAL) of the best row so far
    drop_labels = []   # index labels of every row that lost to another one

    for label, res, total in zip(df_norm.index, df_norm['RESOLUTION'], df_norm['TOTAL']):
        if res not in best:
            best[res] = (label, total)
        elif total >= best[res][1]:
            # The current row wins (ties included): discard the former best.
            drop_labels.append(best[res][0])
            best[res] = (label, total)
        else:
            drop_labels.append(label)

    # Drop by label. Treating the collected labels as positions
    # (df.index[labels]) is only correct for a default 0..n-1 index.
    return df_norm.drop(index=drop_labels).reset_index(drop=True)

# Collapse df_norm to one row per RESOLUTION (the row with the highest TOTAL) and display it.
df_norm = choose_best_iteration_for_resolution(df_norm)
df_norm
            
        
        

In [None]:
def plot(df_norm):
    """Draw a two-panel overview of the metrics against RESOLUTION.

    Left panel: TOTAL versus RESOLUTION, overlaid with a dashed red
    third-degree polynomial fit. Right panel: one labelled line each for
    CHM, CHD, CMQ, SMQ, IRN and IFN.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    resolution = df_norm['RESOLUTION']
    total = df_norm['TOTAL']

    fig, (total_ax, metrics_ax) = plt.subplots(1, 2)

    # Left panel: raw TOTAL curve plus its cubic trend line.
    total_ax.plot(resolution, total)
    trend = np.poly1d(np.polyfit(resolution, total, 3))
    total_ax.plot(resolution, trend(resolution), 'r--')

    # Right panel: the individual metrics, in the order used for the legend.
    for metric in ('CHM', 'CHD', 'CMQ', 'SMQ', 'IRN', 'IFN'):
        metrics_ax.plot(resolution, df_norm[metric], label=metric)

    # plt.legend acts on pyplot's current axes (presumably the right panel,
    # which holds the only labelled lines) and places the legend outside it.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
    # To keep the figure on disk, call plt.savefig(..., bbox_inches='tight') before plt.show().
    plt.show()
    


# Visualise the example project: TOTAL with trend line, and the individual metrics.
plot(df_norm)

In [None]:
projects_data = pd.read_csv('merged_data.csv')

# Plot metrics
# Collect every metrics CSV that sits directly in the working directory.
# next(os.walk(...)) yields only the top-level listing, so sub-directories are
# never traversed, and no variable named `dir` shadows the builtin.
metrics_dir = '.'
csvs = [f for f in next(os.walk(metrics_dir))[2] if 'csv' in f and 'merged_data' not in f]
total = len(csvs)

executed_projects = set()
for f in csvs:
    # File names look like "<owner>__<repo>_<timestamp...>.csv"; everything
    # before the first "_<digit>" is the project identifier.
    project_name = re.split(r'_\d', f)[0]
    if project_name in executed_projects:
        # Only the first file found for a project is plotted.
        continue
    executed_projects.add(project_name)

    try:
        project_data = projects_data[projects_data.name.eq(project_name.replace('__','/'))]
        print(f"\n\n\nMetrics for {project_name}")
        print(f"Total classes: {project_data.classes.item()}")
        print(f"Total controllers: {project_data.controllers.item()}")
        print(f"Open issues : {project_data.open_issues.item()}")
        print(f"Stars : {project_data.stars.item()}")

        df = read_csv(f)
        df = normalize_data(df)
        df = adjust_values(df)
        df = calculate_total(df)
        df = choose_best_iteration_for_resolution(df)
        plot(df)
        print(f"Max val resolution: {df.loc[df['TOTAL'].idxmax()].RESOLUTION}")
    except KeyError:
        print(f"KeyError on {project_name}")
        
# projects_data[projects_data.name.eq('AnonymousCyberWarrior/guoan_interface_1.0')]



In [None]:
# Identify projects and histogram
projects = {re.split(r'_\d', p)[0] for p in set(next(os.walk('/home/mbrito/git/thesis/data/metrics_backup'))[2]) if 'csv' in p and 'merged' not in p}
projects_data = pd.read_csv('merged_data.csv')

print(len(csvs))                
for f in csvs:
    try:
        project_name = re.split(r'_\d', f)[0]
    except TypeError as e:
        print(f"[ERROR] Failed for project: {project_name}") # should only happen for other csvs in the folder other than metrics

projects = {p.replace('__', '/') for p in projects}
final_projects = set()
drop_indexes = []

# Identify and remove the projects that did not get executed due to parser complications (projects that did not compile or had other issues)
for index, row in projects_data.iterrows():
    if row['name'] not in projects:        
        drop_indexes.append(index)
    else:
        final_projects.add(row['name'])

    

# Filter out the top 10% outliers
projects_data = projects_data.drop(projects_data.index[drop_indexes]).reset_index(drop=True)
#q = projects_data["classes"].quantile(0.9)
# projects_hist = projects_data.loc[(projects_data.classes < q)]
projects_data

## Histogram of projects by classes

In [None]:
# USE THIS ONE AS HISTOGRAM
# Bin edges for the class-count histogram.
ran = np.arange(30,800,75)
print(ran)
# Class counts outside the bin range are clipped into the first / last bin so
# that every project is counted.
_, bins, patches = plt.hist(np.clip(projects_data['classes'], ran[0], ran[-1]), bins=ran, rwidth=0.9)

# One "<lower> -\n<upper>" label per bin. The loop variables are deliberately
# not called min / max, which would shadow the builtins for the rest of the
# notebook session.
labels = []
for lower, upper in zip(ran[:-1], ran[1:]):
    print(f"{lower} {upper}")
    labels.append(f"{lower} -\n{upper}")

# The last bin also holds the clipped values, so it is open-ended.
labels[-1] = f"{ran[-2]}+"

# One tick per bin (offset 35 from the bin's lower edge). Using all of `ran`
# would give one more tick than there are labels, which recent matplotlib
# versions reject.
rans_2 = [r + 35 for r in ran[:-1]]
plt.xticks(rans_2, labels, fontsize=8)
plt.ylabel('Frequency')
plt.xlabel('Classes', labelpad=5)
plt.xlim([0, 800])


plt.savefig('new_histogram.png')
plt.show()


## Data analysis by range of classes

In [None]:
# Map project identifier (file name up to the first "_<digit>") -> metrics file name,
# for every file in the metrics directory whose name contains 'csv' but not 'merged'.
# If several files share an identifier, whichever is iterated last wins.
# NOTE(review): absolute, machine-specific path.
metrics = {re.split(r'_\d', p)[0] : p for p in set(next(os.walk('/home/mbrito/git/thesis/data/metrics'))[2]) if 'csv' in p and 'merged' not in p}
def copy_items(df_dst, index, df_src, labels):
    """Copy the scalar stored under each label of ``df_src`` into row ``index`` of ``df_dst``.

    ``df_src[label]`` must expose ``.item()`` (e.g. a numpy scalar); the
    unwrapped value is written to ``df_dst.loc[index, label]``. ``df_dst`` is
    modified in place and nothing is returned.
    """
    for label in labels:
        value = df_src[label].item()
        df_dst.loc[index, label] = value
        
# For every project with a metrics file, find the configuration with the
# highest TOTAL and copy its *raw* (un-normalised) metric values into
# projects_data.
for index, row in projects_data.iterrows():
    name = row['name'].replace('/', '__')
    if name not in metrics:
        continue

    df = read_csv(f"/home/mbrito/git/thesis/data/metrics/{metrics[name]}")
    df_temp = df.copy()
    df_temp = normalize_data(df_temp)
    df_temp = adjust_values(df_temp)
    df_temp = calculate_total(df_temp)

    # Remember each row's position in the raw frame. The next step drops rows
    # and renumbers the index, so the index of the winning row no longer
    # identifies the matching row of `df`.
    df_temp['_raw_pos'] = np.arange(len(df_temp))
    df_temp = choose_best_iteration_for_resolution(df_temp)

    df_best = df_temp.sort_values('TOTAL', ascending=False)[0:1]
    raw_pos = int(df_best['_raw_pos'].item())
    copy_items(projects_data, index, df.iloc[raw_pos], ['RESOLUTION', 'CHM', 'CHD', 'IRN', 'IFN', 'OPN', 'CMQ', 'SMQ', 'SERVICES'])
        
def get_df_in_range(df, col, min, max):
    """Return the rows of ``df`` whose ``col`` value lies in the half-open interval (min, max].

    The lower bound is exclusive and the upper bound inclusive. The selected
    rows are returned sorted by ``col`` in descending order; ``df`` itself is
    left unchanged.
    """
    values = df[col]
    above_lower = values > min
    within_upper = values <= max
    selected = df[above_lower & within_upper]
    return selected.sort_values(col, ascending=False)

# Create the actual groups
# Edges: 0, 150, ..., 900, then 1000 and 10000 for the large projects.
ran = np.arange(0,1001, 150)
ran = np.append(ran, [1000,10000])

# Pair consecutive edges into (lower, upper] class-count ranges and select the
# projects that fall into each one.
df_groups = [
    ([lower, upper], get_df_in_range(projects_data, 'classes', lower, upper))
    for lower, upper in zip(ran[:-1], ran[1:])
]

# Dedicated loop names keep the notebook-level `ran` and `df` from being
# overwritten by the last group.
for class_range, group in df_groups:
    print(f"Class range {class_range[0]} - {class_range[1]} - {len(group)} applications")
    group.boxplot(column=['CHM', 'CHD', 'SMQ', 'CMQ'])
    plt.show()

print("Overall boxplot")
projects_data.boxplot(column=['CHM', 'CHD', 'SMQ', 'CMQ'])
plt.savefig('boxplot.png')





In [None]:
# IFN is drawn in its own boxplot, separately from the CHM/CHD/SMQ/CMQ boxplot above,
# and written to boxplot_ifn.png.
projects_data.boxplot(column=['IFN'])
plt.savefig('boxplot_ifn.png')


# Correlation analysis
Identify ratio of classes/methods and correlate it to metrics

In [None]:
import json

# Work on a copy so that columns added to `projects` do not appear in projects_data.
projects = projects_data.copy()
# NOTE(review): absolute, machine-specific path; the files in it are opened by name
# (owner__repo, no extension) and parsed as JSON.
project_stats_base_dir = "/home/mbrito/git/thesis/data/projectstats"
# Problems met while reading project statistics are collected here.
error = []

def get_project_data(file):
    """Aggregate the per-class statistics stored in a project-stats JSON file.

    The file must hold an object with a ``totalClasses`` value and a
    ``classes`` mapping of class name -> ``methodInvocations`` (list),
    ``methodDeclarations``, ``totalDependencies`` and
    ``totalUniqueDependencies`` (numbers).

    Parameters
    ----------
    file : str
        Path of the JSON file.

    Returns
    -------
    tuple or None
        ``(total_classes, method_declarations, method_invocations,
        total_dependencies, total_unique_dependencies)``, where
        ``method_invocations`` counts only the entries that differ from the
        name of the class they are listed under. ``None`` when the file does
        not exist; a "File Not Found" message is appended to ``error``.

    Raises
    ------
    ValueError
        When the summed method declarations are zero. The file path is
        appended to ``error`` before raising.
    """
    try:
        with open(file) as json_file:
            data = json.load(json_file)
    except FileNotFoundError:
        error.append(f"File Not Found: {file}")
        return None

    project_total_classes = data['totalClasses']
    project_method_declarations = 0
    project_method_invocations = 0
    project_total_dependencies = 0
    project_total_unique_dependencies = 0

    for class_name, classe in data['classes'].items():
        # Skip invocation entries equal to the owning class's own name.
        external_invocations = [c for c in classe['methodInvocations'] if c != class_name]
        project_method_declarations += classe['methodDeclarations']
        project_method_invocations += len(external_invocations)
        project_total_dependencies += classe['totalDependencies']
        project_total_unique_dependencies += classe['totalUniqueDependencies']

    if project_method_declarations == 0:
        # Record the file before raising; a statement placed after the raise
        # would never run.
        error.append(file)
        raise ValueError("File does not contain any method declarations")

    return project_total_classes, project_method_declarations, project_method_invocations, \
            project_total_dependencies, project_total_unique_dependencies
    
    
# Attach four "classes per X" ratios to every project:
#   ratio1 = classes / method declarations
#   ratio2 = classes / method invocations
#   ratio3 = classes / dependencies
#   ratio4 = classes / unique dependencies
# Start from NaN so the columns exist even for projects that get skipped.
for ratio_name in ('ratio1', 'ratio2', 'ratio3', 'ratio4'):
    projects[ratio_name] = np.nan

for index, row in projects.iterrows():
    name = row['name'].replace('/','__')
    project_stats = get_project_data(f"{project_stats_base_dir}/{name}")
    if project_stats is None:
        # get_project_data returns None for a missing stats file (and records
        # it in `error`); leave this project's ratios as NaN.
        continue

    classes, method_declarations, method_invocations, dependencies, unique_dependencies = project_stats
    denominators = (
        ('ratio1', method_declarations),
        ('ratio2', method_invocations),
        ('ratio3', dependencies),
        ('ratio4', unique_dependencies),
    )
    for ratio_name, denominator in denominators:
        # A zero denominator leaves the ratio undefined (NaN) instead of
        # aborting the whole loop with ZeroDivisionError.
        projects.loc[index, ratio_name] = classes / denominator if denominator else np.nan


# Remove 3 huge outliers. Rows whose IRN is NaN fail the comparison and are
# dropped as well. Taking an explicit copy of the filtered rows lets the
# column assignments below operate on an independent frame.
corr = projects.loc[projects['IRN'] < 10000].copy()

corr_variables = ['ratio1', 'ratio2',  'CHM', 'CHD', 'IRN', 'IFN', 'CMQ', 'SMQ']
# NOTE(review): pct_change() replaces each value with its relative change from
# the previous *row*. The first row becomes NaN and a zero predecessor gives
# +/-inf. Confirm that row-to-row change between projects is the intended
# input for the correlation.
for var in corr_variables:
    corr[var] = corr[var].pct_change()


plt.scatter(corr['ratio2'], corr['IRN'])
plt.show()

corr_results = corr[corr_variables].corr(method='spearman')
corr_results.style.background_gradient(cmap='Blues')




## Group metrics by ratio of external method invocations
1. Attach method_invocations ratio to the corresponding project
2. Create groups of ranges of ratio
3. Boxplot of metrics for each range

In [None]:
# The ratio columns were already attached to `projects` by the correlation
# section, so the project-stats files are not read again here.
#
# ratio4 is classes / unique dependencies (ratio2 is classes / method invocations).
# NOTE(review): the section heading speaks of method invocations; confirm which
# ratio is meant and change ratio_col if needed.
ratio_col = 'ratio4'
stats = projects[ratio_col].describe()
# Range created according to quartiles of the chosen ratio
ranges = [[stats['min'],stats['25%']], [stats['25%'],stats['50%']], [stats['50%'],stats['75%']], [stats['75%'], stats['max']]]
for position, r in enumerate(ranges):
    values = projects[ratio_col]
    # Ranges are (lower, upper], except the first one, which also includes its
    # lower bound so that the project with the minimum ratio is not lost.
    above_lower = values >= r[0] if position == 0 else values > r[0]
    projects_range = projects[above_lower & (values <= r[1])].sort_values(ratio_col, ascending=False)
    print(f"Range {r[0]} - {r[1]} - project count: {len(projects_range)}")
    projects_range.boxplot(column=['CHM', 'CHD', 'SMQ', 'CMQ'])
    plt.show()
    projects_range.boxplot(column=['IFN'])
    plt.show()
    


In [None]:
# Display the projects that pass the IRN < 10000 filter also used for the correlation frame.
projects[projects['IRN'] < 10000]


In [None]:
# Distribution of the two columns of `corr`, which at this point hold pct_change() values.
# NOTE(review): both calls run in one cell, so the histograms presumably land on the
# same axes; confirm that an overlay is intended.
corr['IRN'].hist(bins=25)
corr['ratio2'].hist(bins=25)