In [1]:
import sys, os, time
import numpy as np
from scipy import stats
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utility import *

In [2]:
df_list = []
data_folder_path = '../output'
for file in list_files(data_folder_path, all=False, extension = 'csv'):
    tmp_df = pd.read_csv(f'{data_folder_path}/{file}')
    df_list.append(tmp_df)
df = pd.concat(df_list, ignore_index = True)

df['Python'] = df['Python'].astype(str)
df.round({'Score': 2})

Unnamed: 0,OS,Python,Hardware,Run,Score,Processing_Time,Project
0,Linux-Xenial,3.8,amd64,1.0,1.00,49.0,seglearn
1,Linux-Xenial,3.6,amd64,1.0,1.00,53.0,seglearn
2,Linux-Xenial,3.7,amd64,1.0,1.00,44.0,seglearn
3,Linux-Bionic,3.7,amd64,1.0,1.00,42.0,seglearn
4,Linux-Focal,3.7,amd64,1.0,1.00,44.0,seglearn
...,...,...,...,...,...,...,...
10395,Linux-Bionic,3.7,amd64,50.0,5.15,1188.0,delfi
10396,Linux-Focal,3.7,amd64,50.0,5.21,1240.0,delfi
10397,Linux-Xenial,3.7,arm64,50.0,5.14,1534.0,delfi
10398,MacOS,3.7,amd64,50.0,5.15,2978.0,delfi


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10400 entries, 0 to 10399
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   OS               10400 non-null  object 
 1   Python           10400 non-null  object 
 2   Hardware         10400 non-null  object 
 3   Run              10400 non-null  float64
 4   Score            10400 non-null  float64
 5   Processing_Time  10400 non-null  float64
 6   Project          10400 non-null  object 
dtypes: float64(3), object(4)
memory usage: 568.9+ KB


In [4]:
def project_wise_analysis(proj_df):

    ### OS
    sub_df_os = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                             (proj_df['Python'] == '3.7') &
                             (proj_df['Hardware'] == 'amd64')) |
                            (proj_df['OS'] == 'MacOS') |
                            (proj_df['OS'] == 'Windows')]
    
    comp_df_OS = pd.DataFrame(sub_df_os.groupby('OS')['Score'].mean())
    comp_df_OS['pct_change'] = [
        calculate_pct_diff(comp_df_OS.loc['Linux-Xenial'][0], comp_df_OS.loc['Linux-Xenial'][0]),
        calculate_pct_diff(comp_df_OS.loc['Linux-Xenial'][0], comp_df_OS.loc['MacOS'][0]),
        calculate_pct_diff(comp_df_OS.loc['Linux-Xenial'][0], comp_df_OS.loc['Windows'][0])
    ]

    comp_df_OS['p-value'] = [1.0,
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Score'], proj_df.loc[proj_df['OS'] == 'MacOS']['Score']),
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Score'], proj_df.loc[proj_df['OS'] == 'Windows']['Score'])
                         ]
    
    ### Dist

    sub_df_dist = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                               (proj_df['Python'] == '3.7') &
                               (proj_df['Hardware'] == 'amd64')) |
                              (proj_df['OS'] == 'Linux-Bionic') |
                              (proj_df['OS'] == 'Linux-Focal')]
    
    comp_df_dist = pd.DataFrame(sub_df_dist.groupby('OS')['Score'].mean())
    
    comp_df_dist['pct_change'] = [
        calculate_pct_diff(comp_df_dist.loc['Linux-Xenial'][0], comp_df_dist.loc['Linux-Bionic'][0]),
        calculate_pct_diff(comp_df_dist.loc['Linux-Xenial'][0], comp_df_dist.loc['Linux-Focal'][0]),
        calculate_pct_diff(comp_df_dist.loc['Linux-Xenial'][0], comp_df_dist.loc['Linux-Xenial'][0])
    ]

    comp_df_dist['p-value'] = [
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Score'], proj_df.loc[proj_df['OS'] == 'Linux-Bionic']['Score']),
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Score'], proj_df.loc[proj_df['OS'] == 'Linux-Focal']['Score']), 1.0
                         ]
    
    
    
    
    
    ### Hardware

    sub_df_hw = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                             (proj_df['Python'] == '3.7') &
                             (proj_df['Hardware'] == 'amd64')) |
                            (proj_df['Hardware'] == 'arm64')]
    
    
    comp_df_hw = pd.DataFrame(sub_df_hw.groupby('Hardware')['Score'].mean())

    #print(comp_df_hw)
    
    comp_df_hw['pct_change'] = [
        calculate_pct_diff(comp_df_hw.loc['amd64'][0], comp_df_hw.loc['amd64'][0]),
        calculate_pct_diff(comp_df_hw.loc['amd64'][0], comp_df_hw.loc['arm64'][0])
    ]

    comp_df_hw['p-value'] = [1.0,
                          extract_p_value(proj_df.loc[proj_df['Hardware'] == 'amd64']['Score'], proj_df.loc[proj_df['Hardware'] == 'arm64']['Score'])
                         ]

    ### Python Versions

    sub_df_py = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                             (proj_df['Python'] == '3.7') &
                             (proj_df['Hardware'] == 'amd64')) |
                            (proj_df['Python'] == '3.8') |
                            (proj_df['Python'] == '3.6')]

    comp_df_py = pd.DataFrame(sub_df_py.groupby('Python')['Score'].mean())

    #print(comp_df_py)
    
    comp_df_py['pct_change'] = [
        calculate_pct_diff(comp_df_py.loc['3.7'][0], comp_df_py.loc['3.6'][0]),
        calculate_pct_diff(comp_df_py.loc['3.7'][0], comp_df_py.loc['3.7'][0]),
        calculate_pct_diff(comp_df_py.loc['3.7'][0], comp_df_py.loc['3.8'][0])
    ]

    #print(comp_df_py)

    comp_df_py['p-value'] = [
        extract_p_value(proj_df.loc[proj_df['Python'] == '3.7']['Score'], proj_df.loc[proj_df['Python'] == '3.6']['Score']),
        1.0,
                          extract_p_value(proj_df.loc[proj_df['Python'] == '3.7']['Score'], proj_df.loc[proj_df['Python'] == '3.8']['Score'])
                         ]
    
    
    
    return comp_df_OS, comp_df_dist, comp_df_hw, comp_df_py

In [5]:
data_list = []
for proj in df['Project'].unique().tolist():
    print(proj)
    comp_df_OS, comp_df_dist, comp_df_hw, comp_df_py = project_wise_analysis(df.loc[df['Project'] == proj])
    print("CPU:")
    print(comp_df_hw)
    data_list.append([proj, comp_df_hw.loc['amd64']['pct_change'], 'AMD64'])
    data_list.append([proj, comp_df_hw.loc['arm64']['pct_change'], 'ARM64'])
    # print()
    # print("Distributions:")
    # print(comp_df_dist)
    # print()
    # print("Hardware:")
    # print(comp_df_hw)
    # print()
    # print("Python:")
    # print(comp_df_py)
    print("\n\n\n")

seglearn
CPU:
             Score  pct_change  p-value
Hardware                               
amd64     0.996967         0.0      1.0
arm64     0.996967         0.0      1.0




netharn 
CPU:
             Score  pct_change   p-value
Hardware                                
amd64     0.046252    0.000000  1.000000
arm64     0.045660   -1.279945  0.419656




Feature-Selection 
CPU:
             Score  pct_change   p-value
Hardware                                
amd64     0.802138    0.000000  1.000000
arm64     0.802672    0.066512  0.101211




tfdiffeq
CPU:
             Score  pct_change   p-value
Hardware                                
amd64     0.979244    0.000000  1.000000
arm64     0.976896   -0.239777  0.929154




pescador
CPU:
             Score  pct_change   p-value
Hardware                                
amd64     0.854950    0.000000  1.000000
arm64     0.853868   -0.126557  0.877417




entropy
CPU:
             Score  pct_change   p-value
Hardware                      

  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue


CPU:
             Score  pct_change  p-value
Hardware                               
amd64     0.109994    0.000000  1.00000
arm64     0.109945   -0.044584  0.82433




sherpa
CPU:
             Score  pct_change   p-value
Hardware                                
amd64     0.994759    0.000000  1.000000
arm64     0.994918    0.016022  0.195063




pyalcs
CPU:
             Score  pct_change  p-value
Hardware                               
amd64     0.097848    0.000000  1.00000
arm64     0.097036   -0.830268  0.72038




scitime 
CPU:
             Score  pct_change       p-value
Hardware                                    
amd64     0.283333    0.000000  1.000000e+00
arm64     0.200000  -29.411682  1.541215e-82




doc2vec
CPU:
             Score  pct_change   p-value
Hardware                                
amd64     0.849329    0.000000  1.000000
arm64     0.850225    0.105552  0.142942




ivis
CPU:
           Score  pct_change       p-value
Hardware                                  


  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, ctrl).pvalue
  p_value = stats.ttest_ind(trt, c

In [6]:
pd.DataFrame(data_list, columns=['project', 'pct_change', 'CPU Architecture']).to_csv('pct_change_hw.csv')