In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
# NOTE: "__file__" is passed as a plain string (not the __file__ variable), so
# realpath() resolves it relative to the current working directory; currDir is
# therefore the directory that path lives in, i.e. where the kernel is running.
currDir = os.path.split(os.path.realpath("__file__"))[0]

# The project root is taken to be one level above currDir.
rootDir = os.path.abspath(
    os.path.join(currDir, '..')
)

# Insert at position 1 (not 0) so the existing first sys.path entry keeps
# priority while the project root becomes importable.
sys.path.insert(1, rootDir)

In [3]:
import numpy as np
import pandas as pd
from src.utils.file_utils import load_df, save_df, load_model

In [4]:
# Load both raw datasets with the project loader; the file names are resolved
# by load_df itself.
ds1, ds2 = (load_df(csvName) for csvName in ("dataset1.csv", "dataset2.csv"))

In [5]:
# Summary statistics for dataset 1; the 25% / 50% / 75% rows are the source of
# the bucket thresholds used further down.
ds1.describe()

Unnamed: 0,mode,ei,to,td,tf,vers
count,2600000.0,2600000.0,2600000.0,2600000.0,2600000.0,2600000.0
mean,11.6,1.854442,267.7224,71.555,12.16666,4.5
std,8.002501,3.186181,1555.065,47.02838,11.25711,2.872282
min,5.0,0.347982,1.0,0.1,1.0,0.0
25%,5.0,0.933,3.0,34.3,3.968246,2.0
50%,6.5,1.354955,5.0,68.5,9.0,4.5
75%,21.0,1.354955,55.0,102.7,17.0,7.0
max,26.0,23.17735,14400.0,260.0,60.0,9.0


In [6]:
# Summary statistics for dataset 2; the 25% / 50% / 75% rows are the source of
# the bucket thresholds used further down.
ds2.describe()

Unnamed: 0,mode,speed,fe,tt,vers
count,504000.0,504000.0,504000.0,504000.0,504000.0
mean,2.5,65.260247,0.952462,729.525361,3.0
std,1.707827,192.80363,0.308689,2322.622952,2.000002
min,0.0,2.764055,0.347982,0.0,0.0
25%,1.0,7.64,0.7872,17.1675,1.0
50%,2.5,14.335,0.887243,58.465,3.0
75%,4.0,34.13,1.21,99.9925,5.0
max,5.0,1164.8569,1.8898,14397.6001,6.0


In [7]:
def probGroundTruth(ds, feats, featsQuarts, numVers):
    """Tabulate how often each (feature bucket, version) pair occurs in ``ds``.

    For every feature in ``feats`` each row is assigned to one of four buckets
    using that feature's three thresholds ``t0, t1, t2``:

        Q1: value <= t0
        Q2: value <= t1 (and not Q1)
        Q3: value <= t2 (and not Q1/Q2)
        Q4: everything else (NaN lands here because every comparison is False)

    The row is then counted under its ``"vers"`` value, and all counts are
    divided by the total number of rows in ``ds``.

    Parameters
    ----------
    ds : pandas.DataFrame
        Must contain every column named in ``feats`` and a ``"vers"`` column
        whose values are convertible to integer column positions.
    feats : sequence of str
        Feature column names to bucket.
    featsQuarts : sequence of sequences
        ``featsQuarts[i]`` holds the three thresholds for ``feats[i]``.
    numVers : int
        Number of version columns in the result.

    Returns
    -------
    pandas.DataFrame
        One row per bucket, labelled ``"<feat>-Q1"`` .. ``"<feat>-Q4"`` in the
        order of ``feats``; columns ``0 .. numVers - 1``. Each cell is the
        fraction of all rows of ``ds`` that fall in that bucket with that
        version.

    Raises
    ------
    IndexError
        If a ``"vers"`` value is outside the range of valid column positions.
    """
    total = len(ds.index)
    namesCol = [feat + "-Q" + str(q) for feat in feats for q in range(1, 5)]

    # Four consecutive rows per feature, one column per version.
    counts = np.zeros((4 * len(feats), numVers))

    # Truncate to integers once instead of once per row and feature.
    versions = ds["vers"].astype(int).to_numpy()

    for i, feat in enumerate(feats):
        values = ds[feat].to_numpy()
        t0, t1, t2 = featsQuarts[i][0], featsQuarts[i][1], featsQuarts[i][2]

        # Nested where() mirrors an if/elif chain: the first threshold that is
        # >= the value decides the bucket, otherwise the value goes to Q4.
        bucket = np.where(
            values <= t0, 0,
            np.where(values <= t1, 1, np.where(values <= t2, 2, 3)),
        )

        # add.at accumulates correctly when the same (row, column) pair occurs
        # many times, which plain fancy-index "+=" would not.
        np.add.at(counts, (4 * i + bucket, versions), 1)

    return pd.DataFrame(counts / total, columns=range(numVers), index=namesCol)
        

In [8]:
# Bucket thresholds passed to probGroundTruth: one [25%, 50%, 75%] triple per
# feature, in the same order as the feature names. Values are copied from the
# describe() output of the matching dataset unless a comment says otherwise.
feats1 = ["mode", "ei", "to", "td", "tf"]
feats1Quarts = [
    # NOTE(review): ds1.describe() reports 21.0 as the 75% value for mode, not
    # 22 -- confirm that 22 is intentional.
    [5, 6.5, 22],             # mode
    # ei has only 3 meaningful buckets because the 50% quartile equals the 75%
    # quartile; 26 is above the column max (23.17735), so the 4th stays empty.
    [0.933, 1.354955, 26],    # ei
    [3, 5, 55],               # to
    [34.3, 68.5, 102.7],      # td
    [3.968246, 9, 17],        # tf
]

feats2 = ["mode", "speed", "fe", "tt"]
feats2Quarts = [
    [1, 2.5, 4],                  # mode
    [7.64, 14.335, 34.13],        # speed
    [0.7872, 0.887243, 1.21],     # fe
    # 25% value is 17.1675 in ds2.describe(); the earlier 17.17675 was a typo.
    [17.1675, 58.465, 99.9925],   # tt
]




In [9]:
# "vers" in ds1 ranges from 0 to 9 (see the describe() output), hence 10
# version columns.
probs1 = probGroundTruth(
    ds=ds1,
    feats=feats1,
    featsQuarts=feats1Quarts,
    numVers=10,
)

In [11]:
# Plain-text dump of the dataset-1 bucket/version frequency table.
print(probs1)

                0         1         2         3         4         5         6  \
mode-Q1  0.050000  0.050000  0.050000  0.050000  0.050000  0.050000  0.050000   
mode-Q2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
mode-Q3  0.040000  0.040000  0.040000  0.040000  0.040000  0.040000  0.040000   
mode-Q4  0.010000  0.010000  0.010000  0.010000  0.010000  0.010000  0.010000   
ei-Q1    0.030000  0.030000  0.020000  0.030000  0.030000  0.030000  0.030000   
ei-Q2    0.060000  0.060000  0.070000  0.060000  0.000000  0.060000  0.060000   
ei-Q3    0.010000  0.010000  0.010000  0.010000  0.070000  0.010000  0.010000   
ei-Q4    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
to-Q1    0.040000  0.010000  0.040000  0.040000  0.040000  0.040000  0.010000   
to-Q2    0.020000  0.000000  0.020000  0.020000  0.020000  0.020000  0.010000   
to-Q3    0.020000  0.050000  0.020000  0.020000  0.020000  0.020000  0.040000   
to-Q4    0.020000  0.040000 

In [10]:
# "vers" in ds2 ranges from 0 to 6 (see the describe() output), hence 7
# version columns.
probs2 = probGroundTruth(
    ds=ds2,
    feats=feats2,
    featsQuarts=feats2Quarts,
    numVers=7,
)

In [12]:
# Plain-text dump of the dataset-2 bucket/version frequency table.
print(probs2)

                 0         1         2         3         4         5         6
mode-Q1   0.047619  0.047619  0.047619  0.047619  0.047619  0.047619  0.047619
mode-Q2   0.023810  0.023810  0.023810  0.023810  0.023810  0.023810  0.023810
mode-Q3   0.047619  0.047619  0.047619  0.047619  0.047619  0.047619  0.047619
mode-Q4   0.023810  0.023810  0.023810  0.023810  0.023810  0.023810  0.023810
speed-Q1  0.047619  0.047619  0.000000  0.047619  0.047619  0.142857  0.047619
speed-Q2  0.023810  0.023810  0.000000  0.023810  0.023810  0.000000  0.023810
speed-Q3  0.071429  0.071429  0.000000  0.071429  0.071429  0.000000  0.071429
speed-Q4  0.000000  0.000000  0.142857  0.000000  0.000000  0.000000  0.000000
fe-Q1     0.071429  0.071429  0.071429  0.071429  0.071429  0.071429  0.023810
fe-Q2     0.000000  0.000000  0.000000  0.023810  0.000000  0.000000  0.047619
fe-Q3     0.047619  0.047619  0.047619  0.000000  0.047619  0.047619  0.071429
fe-Q4     0.023810  0.023810  0.023810  0.047619  0.

In [13]:
# Persist both frequency tables through the project writer, dataset 1 first.
for probsFrame, csvName in (
    (probs1, "ground-truth-lime-1.csv"),
    (probs2, "ground-truth-lime-2.csv"),
):
    save_df(probsFrame, csvName)

df successfully saved | filename: ground-truth-lime-1.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data
df successfully saved | filename: ground-truth-lime-2.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data
