In [18]:
import sys, os, time
import numpy as np
from scipy import stats
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utility import *

In [19]:
df_list = []
data_folder_path = '../output'
for file in list_files(data_folder_path, all=False, extension = 'csv'):
    tmp_df = pd.read_csv(f'{data_folder_path}/{file}')
    df_list.append(tmp_df)
df = pd.concat(df_list, ignore_index = True)

df['Python'] = df['Python'].astype(str)
df.round({'Score': 2})

Unnamed: 0,OS,Python,Hardware,Run,Score,Processing_Time,Project
0,Linux-Xenial,3.8,amd64,1.0,1.00,49.0,seglearn
1,Linux-Xenial,3.6,amd64,1.0,1.00,53.0,seglearn
2,Linux-Xenial,3.7,amd64,1.0,1.00,44.0,seglearn
3,Linux-Bionic,3.7,amd64,1.0,1.00,42.0,seglearn
4,Linux-Focal,3.7,amd64,1.0,1.00,44.0,seglearn
...,...,...,...,...,...,...,...
5995,Linux-Bionic,3.7,amd64,50.0,5.15,1188.0,delfi
5996,Linux-Focal,3.7,amd64,50.0,5.21,1240.0,delfi
5997,Linux-Xenial,3.7,arm64,50.0,5.14,1534.0,delfi
5998,MacOS,3.7,amd64,50.0,5.15,2978.0,delfi


In [20]:
def add_money_col(df):
    credit_list = []
    for idx, row in df.iterrows():
        if row['OS'].startswith("Linux"):
            credit_list.append(10*(row['Processing_Time']/60))
        elif row['OS'] == 'MacOS':
            credit_list.append(50*(row['Processing_Time']/60))
        else:
            credit_list.append(20*(row['Processing_Time']/60))

    df['Credits'] = credit_list
    return df

In [21]:
df = add_money_col(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   OS               6000 non-null   object 
 1   Python           6000 non-null   object 
 2   Hardware         6000 non-null   object 
 3   Run              6000 non-null   float64
 4   Score            6000 non-null   float64
 5   Processing_Time  6000 non-null   float64
 6   Project          6000 non-null   object 
 7   Credits          6000 non-null   float64
dtypes: float64(4), object(4)
memory usage: 375.1+ KB


In [22]:
df.head(20)

Unnamed: 0,OS,Python,Hardware,Run,Score,Processing_Time,Project,Credits
0,Linux-Xenial,3.8,amd64,1.0,0.996967,49.0,seglearn,8.166667
1,Linux-Xenial,3.6,amd64,1.0,0.996967,53.0,seglearn,8.833333
2,Linux-Xenial,3.7,amd64,1.0,0.996967,44.0,seglearn,7.333333
3,Linux-Bionic,3.7,amd64,1.0,0.996967,42.0,seglearn,7.0
4,Linux-Focal,3.7,amd64,1.0,0.996967,44.0,seglearn,7.333333
5,Linux-Xenial,3.7,arm64,1.0,0.996967,91.0,seglearn,15.166667
6,MacOS,3.7,amd64,1.0,0.996967,93.0,seglearn,77.5
7,Windows,3.7,amd64,1.0,0.996967,106.0,seglearn,35.333333
8,Linux-Xenial,3.8,amd64,2.0,0.996967,45.0,seglearn,7.5
9,Linux-Xenial,3.6,amd64,2.0,0.996967,52.0,seglearn,8.666667


In [23]:
df['Expense'] = [100 * 0.0006 * c for c in df['Credits'].tolist()]

In [24]:
df.head(20)

Unnamed: 0,OS,Python,Hardware,Run,Score,Processing_Time,Project,Credits,Expense
0,Linux-Xenial,3.8,amd64,1.0,0.996967,49.0,seglearn,8.166667,0.49
1,Linux-Xenial,3.6,amd64,1.0,0.996967,53.0,seglearn,8.833333,0.53
2,Linux-Xenial,3.7,amd64,1.0,0.996967,44.0,seglearn,7.333333,0.44
3,Linux-Bionic,3.7,amd64,1.0,0.996967,42.0,seglearn,7.0,0.42
4,Linux-Focal,3.7,amd64,1.0,0.996967,44.0,seglearn,7.333333,0.44
5,Linux-Xenial,3.7,arm64,1.0,0.996967,91.0,seglearn,15.166667,0.91
6,MacOS,3.7,amd64,1.0,0.996967,93.0,seglearn,77.5,4.65
7,Windows,3.7,amd64,1.0,0.996967,106.0,seglearn,35.333333,2.12
8,Linux-Xenial,3.8,amd64,2.0,0.996967,45.0,seglearn,7.5,0.45
9,Linux-Xenial,3.6,amd64,2.0,0.996967,52.0,seglearn,8.666667,0.52


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   OS               6000 non-null   object 
 1   Python           6000 non-null   object 
 2   Hardware         6000 non-null   object 
 3   Run              6000 non-null   float64
 4   Score            6000 non-null   float64
 5   Processing_Time  6000 non-null   float64
 6   Project          6000 non-null   object 
 7   Credits          6000 non-null   float64
 8   Expense          6000 non-null   float64
dtypes: float64(5), object(4)
memory usage: 422.0+ KB


In [26]:
def project_wise_analysis(proj_df):

    ### OS
    sub_df_os = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                             (proj_df['Python'] == '3.7') &
                             (proj_df['Hardware'] == 'amd64')) |
                            (proj_df['OS'] == 'MacOS') |
                            (proj_df['OS'] == 'Windows')]
    
    comp_df_OS = pd.DataFrame(sub_df_os.groupby('OS')['Expense'].mean())
    comp_df_OS['pct_change'] = [
        calculate_pct_diff(comp_df_OS.loc['Linux-Xenial'][0], comp_df_OS.loc['Linux-Xenial'][0]),
        calculate_pct_diff(comp_df_OS.loc['Linux-Xenial'][0], comp_df_OS.loc['MacOS'][0]),
        calculate_pct_diff(comp_df_OS.loc['Linux-Xenial'][0], comp_df_OS.loc['Windows'][0])
    ]

    comp_df_OS['p-value'] = [1.0,
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Expense'], proj_df.loc[proj_df['OS'] == 'MacOS']['Expense']),
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Expense'], proj_df.loc[proj_df['OS'] == 'Windows']['Expense'])
                         ]
    
    ### Dist

    sub_df_dist = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                               (proj_df['Python'] == '3.7') &
                               (proj_df['Hardware'] == 'amd64')) |
                              (proj_df['OS'] == 'Linux-Bionic') |
                              (proj_df['OS'] == 'Linux-Focal')]
    
    comp_df_dist = pd.DataFrame(sub_df_dist.groupby('OS')['Expense'].mean())
    
    comp_df_dist['pct_change'] = [
        calculate_pct_diff(comp_df_dist.loc['Linux-Xenial'][0], comp_df_dist.loc['Linux-Bionic'][0]),
        calculate_pct_diff(comp_df_dist.loc['Linux-Xenial'][0], comp_df_dist.loc['Linux-Focal'][0]),
        calculate_pct_diff(comp_df_dist.loc['Linux-Xenial'][0], comp_df_dist.loc['Linux-Xenial'][0])
    ]

    comp_df_dist['p-value'] = [
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Expense'], proj_df.loc[proj_df['OS'] == 'Linux-Bionic']['Expense']),
                          extract_p_value(proj_df.loc[proj_df['OS'] == 'Linux-Xenial']['Expense'], proj_df.loc[proj_df['OS'] == 'Linux-Focal']['Expense']), 1.0
                         ]
    
    
    
    
    
    ### Hardware

    sub_df_hw = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                             (proj_df['Python'] == '3.7') &
                             (proj_df['Hardware'] == 'amd64')) |
                            (proj_df['Hardware'] == 'arm64')]
    
    
    comp_df_hw = pd.DataFrame(sub_df_hw.groupby('Hardware')['Expense'].mean())

    #print(comp_df_hw)
    
    comp_df_hw['pct_change'] = [
        calculate_pct_diff(comp_df_hw.loc['amd64'][0], comp_df_hw.loc['amd64'][0]),
        calculate_pct_diff(comp_df_hw.loc['amd64'][0], comp_df_hw.loc['arm64'][0])
    ]

    comp_df_hw['p-value'] = [1.0,
                          extract_p_value(proj_df.loc[proj_df['Hardware'] == 'amd64']['Expense'], proj_df.loc[proj_df['Hardware'] == 'arm64']['Expense'])
                         ]

    ### Python Versions

    sub_df_py = proj_df.loc[((proj_df['OS'] == 'Linux-Xenial') &
                             (proj_df['Python'] == '3.7') &
                             (proj_df['Hardware'] == 'amd64')) |
                            (proj_df['Python'] == '3.8') |
                            (proj_df['Python'] == '3.6')]

    comp_df_py = pd.DataFrame(sub_df_py.groupby('Python')['Expense'].mean())

    #print(comp_df_py)
    
    comp_df_py['pct_change'] = [
        calculate_pct_diff(comp_df_py.loc['3.7'][0], comp_df_py.loc['3.6'][0]),
        calculate_pct_diff(comp_df_py.loc['3.7'][0], comp_df_py.loc['3.7'][0]),
        calculate_pct_diff(comp_df_py.loc['3.7'][0], comp_df_py.loc['3.8'][0])
    ]

    #print(comp_df_py)

    comp_df_py['p-value'] = [
        extract_p_value(proj_df.loc[proj_df['Python'] == '3.7']['Expense'], proj_df.loc[proj_df['Python'] == '3.6']['Expense']),
        1.0,
                          extract_p_value(proj_df.loc[proj_df['Python'] == '3.7']['Expense'], proj_df.loc[proj_df['Python'] == '3.8']['Expense'])
                         ]
    
    
    
    return comp_df_OS, comp_df_dist, comp_df_hw, comp_df_py

In [37]:
data_list_os = []
data_list_hw = []
data_list_py = []
data_list_dist = []
for proj in df['Project'].unique().tolist():
    print(proj)
    comp_df_OS, comp_df_dist, comp_df_hw, comp_df_py = project_wise_analysis(df.loc[df['Project'] == proj])
    
    # print("OS:")
    # print(comp_df_OS)
    # data_list_os.append([proj, comp_df_OS.loc['MacOS']['pct_change'], 'MacOS'])
    # data_list_os.append([proj, comp_df_OS.loc['Windows']['pct_change'], 'Windows'])
    # print()
    
    # print("CPU:")
    # print(comp_df_hw)
    # data_list_hw.append([proj, comp_df_hw.loc['amd64']['pct_change'], 'AMD64'])
    # data_list_hw.append([proj, comp_df_hw.loc['arm64']['pct_change'], 'ARM64'])
    # print()
    
    print("Distributions:")
    print(comp_df_dist)
    data_list_dist.append([proj, comp_df_dist.loc['Linux-Bionic']['pct_change'], 'Linux-Bionic'])
    data_list_dist.append([proj, comp_df_dist.loc['Linux-Focal']['pct_change'], 'Linux-Focal'])
    print()
    
    # print("Python:")
    # print(comp_df_py)
    # data_list_py.append([proj, comp_df_py.loc['3.6']['pct_change'], '3.6'])
    # data_list_py.append([proj, comp_df_py.loc['3.8']['pct_change'], '3.8'])
    print("\n\n\n")

seglearn
Distributions:
              Expense  pct_change   p-value
OS                                         
Linux-Bionic   0.4468    0.903342  0.000058
Linux-Focal    0.4422   -0.135501  0.000035
Linux-Xenial   0.4428    0.000000  1.000000





tfdiffeq
Distributions:
              Expense  pct_change       p-value
OS                                             
Linux-Bionic   2.2596   -4.018350  2.173269e-20
Linux-Focal    2.2962   -2.463682  2.552405e-17
Linux-Xenial   2.3542    0.000000  1.000000e+00





pescador
Distributions:
              Expense  pct_change       p-value
OS                                             
Linux-Bionic   9.9192    1.296951  1.147890e-09
Linux-Focal   10.0532    2.665387  4.889664e-09
Linux-Xenial   9.7922    0.000000  1.000000e+00





entropy
Distributions:
              Expense  pct_change       p-value
OS                                             
Linux-Bionic   0.5576  -13.790971  3.037847e-46
Linux-Focal    0.5668  -12.368584  8.013583e-4

In [32]:
pd.DataFrame(data_list_hw, columns=['project', 'pct_change', 'CPU Architecture']).describe() #.to_csv('pct_change_exp_hw.csv')

Unnamed: 0,pct_change
count,30.0
mean,20.689254
std,44.543815
min,-11.000774
25%,0.0
50%,0.0
75%,16.602117
max,190.758874


In [33]:
pd.DataFrame(data_list_os, columns=['project', 'pct_change', 'Operating System']).describe() #.to_csv('pct_change_exp_os.csv')

Unnamed: 0,pct_change
count,30.0
mean,725.434531
std,1033.615462
min,104.049523
25%,219.872482
50%,423.42121
75%,937.615453
max,5827.570232


In [34]:
pd.DataFrame(data_list_dist, columns=['project', 'pct_change', 'Linux Distribution']).describe() #.to_csv('pct_change_exp_dist.csv')

Unnamed: 0,pct_change
count,30.0
mean,-1.191605
std,4.85115
min,-13.81323
25%,-2.182593
50%,-0.265874
75%,1.220679
max,7.604627


In [35]:
pd.DataFrame(data_list_py, columns=['project', 'pct_change', 'Python Version']).describe() #.to_csv('pct_change_exp_py.csv')

Unnamed: 0,pct_change
count,30.0
mean,27.248448
std,90.380799
min,-13.580088
25%,0.737232
50%,5.435665
75%,16.106421
max,495.379224
