In [1]:
import sys, os, time
import numpy as np
from scipy import stats
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utility import *

In [2]:
df_list = []
data_folder_path = '../output'
for file in list_files(data_folder_path, all=False, extension = 'csv'):
    tmp_df = pd.read_csv(f'{data_folder_path}/{file}')
    df_list.append(tmp_df)
df = pd.concat(df_list, ignore_index = True)

df['Python'] = df['Python'].astype(str)
df.round({'Score': 2})

Unnamed: 0,OS,Python,Hardware,Run,Score,Processing_Time,Project
0,Linux-Xenial,3.8,amd64,1.0,1.00,49.0,seglearn
1,Linux-Xenial,3.6,amd64,1.0,1.00,53.0,seglearn
2,Linux-Xenial,3.7,amd64,1.0,1.00,44.0,seglearn
3,Linux-Bionic,3.7,amd64,1.0,1.00,42.0,seglearn
4,Linux-Focal,3.7,amd64,1.0,1.00,44.0,seglearn
...,...,...,...,...,...,...,...
5995,Linux-Bionic,3.7,amd64,50.0,5.15,1188.0,delfi
5996,Linux-Focal,3.7,amd64,50.0,5.21,1240.0,delfi
5997,Linux-Xenial,3.7,arm64,50.0,5.14,1534.0,delfi
5998,MacOS,3.7,amd64,50.0,5.15,2978.0,delfi


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   OS               6000 non-null   object 
 1   Python           6000 non-null   object 
 2   Hardware         6000 non-null   object 
 3   Run              6000 non-null   float64
 4   Score            6000 non-null   float64
 5   Processing_Time  6000 non-null   float64
 6   Project          6000 non-null   object 
dtypes: float64(3), object(4)
memory usage: 328.2+ KB


In [4]:
def project_wise_analysis(proj_df):

    ### OS
    sub_df_os = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                             (proj_df['Python'] == '3.7') &
                             (proj_df['Hardware'] == 'amd64')) |
                            (proj_df['OS'] == 'MacOS') |
                            (proj_df['OS'] == 'Windows')]
    
    comp_df_OS = pd.DataFrame(sub_df_os.groupby('OS')['Processing_Time'].mean())
    comp_df_OS['pct_change'] = [
        calculate_pct_diff(comp_df_OS.loc['Linux-Xenial'][0], comp_df_OS.loc['Linux-Xenial'][0]),
        calculate_pct_diff(comp_df_OS.loc['Linux-Xenial'][0], comp_df_OS.loc['MacOS'][0]),
        calculate_pct_diff(comp_df_OS.loc['Linux-Xenial'][0], comp_df_OS.loc['Windows'][0])
    ]

    comp_df_OS['p-value'] = [1.0,
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Processing_Time'], proj_df.loc[proj_df['OS'] == 'MacOS']['Processing_Time']),
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Processing_Time'], proj_df.loc[proj_df['OS'] == 'Windows']['Processing_Time'])
                         ]
    
    ### Dist

    sub_df_dist = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                               (proj_df['Python'] == '3.7') &
                               (proj_df['Hardware'] == 'amd64')) |
                              (proj_df['OS'] == 'Linux-Bionic') |
                              (proj_df['OS'] == 'Linux-Focal')]
    
    comp_df_dist = pd.DataFrame(sub_df_dist.groupby('OS')['Processing_Time'].mean())
    
    comp_df_dist['pct_change'] = [
        calculate_pct_diff(comp_df_dist.loc['Linux-Xenial'][0], comp_df_dist.loc['Linux-Bionic'][0]),
        calculate_pct_diff(comp_df_dist.loc['Linux-Xenial'][0], comp_df_dist.loc['Linux-Focal'][0]),
        calculate_pct_diff(comp_df_dist.loc['Linux-Xenial'][0], comp_df_dist.loc['Linux-Xenial'][0])
    ]

    comp_df_dist['p-value'] = [
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Processing_Time'], proj_df.loc[proj_df['OS'] == 'Linux-Bionic']['Processing_Time']),
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Processing_Time'], proj_df.loc[proj_df['OS'] == 'Linux-Focal']['Processing_Time']), 1.0
                         ]
    
    
    
    
    
    ### Hardware

    sub_df_hw = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                             (proj_df['Python'] == '3.7') &
                             (proj_df['Hardware'] == 'amd64')) |
                            (proj_df['Hardware'] == 'arm64')]
    
    
    comp_df_hw = pd.DataFrame(sub_df_hw.groupby('Hardware')['Processing_Time'].mean())

    #print(comp_df_hw)
    
    comp_df_hw['pct_change'] = [
        calculate_pct_diff(comp_df_hw.loc['amd64'][0], comp_df_hw.loc['amd64'][0]),
        calculate_pct_diff(comp_df_hw.loc['amd64'][0], comp_df_hw.loc['arm64'][0])
    ]

    comp_df_hw['p-value'] = [1.0,
                          extract_p_value(proj_df.loc[proj_df['Hardware'] == 'amd64']['Processing_Time'], proj_df.loc[proj_df['Hardware'] == 'arm64']['Processing_Time'])
                         ]

    ### Python Versions

    sub_df_py = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                             (proj_df['Python'] == '3.7') &
                             (proj_df['Hardware'] == 'amd64')) |
                            (proj_df['Python'] == '3.8') |
                            (proj_df['Python'] == '3.6')]

    comp_df_py = pd.DataFrame(sub_df_py.groupby('Python')['Processing_Time'].mean())

    #print(comp_df_py)
    
    comp_df_py['pct_change'] = [
        calculate_pct_diff(comp_df_py.loc['3.7'][0], comp_df_py.loc['3.6'][0]),
        calculate_pct_diff(comp_df_py.loc['3.7'][0], comp_df_py.loc['3.7'][0]),
        calculate_pct_diff(comp_df_py.loc['3.7'][0], comp_df_py.loc['3.8'][0])
    ]

    #print(comp_df_py)

    comp_df_py['p-value'] = [
        extract_p_value(proj_df.loc[proj_df['Python'] == '3.7']['Processing_Time'], proj_df.loc[proj_df['Python'] == '3.6']['Processing_Time']),
        1.0,
                          extract_p_value(proj_df.loc[proj_df['Python'] == '3.7']['Processing_Time'], proj_df.loc[proj_df['Python'] == '3.8']['Processing_Time'])
                         ]
    
    
    
    return comp_df_OS, comp_df_dist, comp_df_hw, comp_df_py

In [20]:
data_list_os = []
data_list_hw = []
data_list_py = []
data_list_dist = []
for proj in df['Project'].unique().tolist():
    print(proj)
    comp_df_OS, comp_df_dist, comp_df_hw, comp_df_py = project_wise_analysis(df.loc[df['Project'] == proj])
    
    # print("OS:")
    # print(comp_df_OS)
    # data_list_os.append([proj, comp_df_OS.loc['MacOS']['pct_change'], 'MacOS'])
    # data_list_os.append([proj, comp_df_OS.loc['Windows']['pct_change'], 'Windows'])
    # print()
    
    print("CPU:")
    print(comp_df_hw)
    data_list_hw.append([proj, comp_df_hw.loc['amd64']['pct_change'], 'AMD64'])
    data_list_hw.append([proj, comp_df_hw.loc['arm64']['pct_change'], 'ARM64'])
    print()
    
    # print("Distributions:")
    # print(comp_df_dist)
    # data_list_dist.append([proj, comp_df_dist.loc['Linux-Bionic']['pct_change'], 'Linux-Bionic'])
    # data_list_dist.append([proj, comp_df_dist.loc['Linux-Focal']['pct_change'], 'Linux-Focal'])
    # print()
    
    # print("Python:")
    # print(comp_df_py)
    # data_list_py.append([proj, comp_df_py.loc['3.6']['pct_change'], '3.6'])
    # data_list_py.append([proj, comp_df_py.loc['3.8']['pct_change'], '3.8'])
    print("\n\n\n")

seglearn
CPU:
          Processing_Time  pct_change       p-value
Hardware                                           
amd64               44.28    0.000000  1.000000e+00
arm64               98.22  121.815718  1.153803e-20





tfdiffeq
CPU:
          Processing_Time  pct_change   p-value
Hardware                                       
amd64              235.42    0.000000  1.000000
arm64              278.10   18.129301  0.007117





pescador
CPU:
          Processing_Time  pct_change   p-value
Hardware                                       
amd64              979.22    0.000000  1.000000
arm64             1592.68   62.647822  0.954169





entropy
CPU:
          Processing_Time  pct_change   p-value
Hardware                                       
amd64               64.68    0.000000  1.000000
arm64               71.78   10.977118  0.007198





setka
CPU:
          Processing_Time  pct_change   p-value
Hardware                                       
amd64              336.74    0.000

In [10]:
pd.DataFrame(data_list_hw, columns=['project', 'pct_change', 'CPU Architecture']).describe() #.to_csv('pct_change_pt_hw.csv')

Unnamed: 0,pct_change
count,30.0
mean,20.689254
std,44.543815
min,-11.000774
25%,0.0
50%,0.0
75%,16.602117
max,190.758874


In [11]:
pd.DataFrame(data_list_os, columns=['project', 'pct_change', 'Operating System']).describe() #.to_csv('pct_change_pt_os.csv')

Unnamed: 0,pct_change
count,30.0
mean,118.624458
std,191.606601
min,-0.554552
25%,36.802899
50%,78.595511
75%,133.655697
max,1085.514046


In [12]:
pd.DataFrame(data_list_dist, columns=['project', 'pct_change', 'Linux Distribution']).describe() #.to_csv('pct_change_pt_dist.csv')

Unnamed: 0,pct_change
count,30.0
mean,-1.191605
std,4.85115
min,-13.81323
25%,-2.182593
50%,-0.265874
75%,1.220679
max,7.604627


In [13]:
pd.DataFrame(data_list_py, columns=['project', 'pct_change', 'Python Version']).describe() #.to_csv('pct_change_pt_py.csv')

Unnamed: 0,pct_change
count,30.0
mean,27.248448
std,90.380799
min,-13.580088
25%,0.737232
50%,5.435665
75%,16.106421
max,495.379224
