In [34]:
import os
import pandas as pd
import git
import matplotlib.pyplot as plt
import folderstats
import numpy as np
import math

# Location of the corpus checkout; used both as the git working directory
# and as the root that folderstats scans in later cells.
folder = "../../data/format-corpus"
# Same location as `folder`; not referenced elsewhere in the visible cells.
relative_path = '../../data/format-corpus'
# GitPython command wrapper bound to the corpus directory.
repo = git.Git(folder)

# NOTE(review): not referenced in the visible cells — presumably a reference
# corpus size; confirm before relying on it.
N_individual = 2044
# Value written into the "p" column of every regular file row.
p=0.005
# Value written into the "p" column of every synthetic ".meta" row.
p_meta = 0.8

In [35]:
def expected_operations(N,k,p):
    """Expected number of operations for N items grouped into pools of size k.

    The count is one operation per pool plus k operations for every pool that
    is expected to be positive, rounded up.

    Parameters
    ----------
    N : number of items.
    k : pool size; a value of 0 means "one pool holding all N items".
    p : per-item probability used in the positive-pool term.

    Returns
    -------
    numpy float: ceil(pools + positive_pools * k).
    """
    pool_size = N if k == 0.0 else k
    pool_count = np.ceil(N / pool_size)
    # 1 - (1 - p)**k: chance that at least one of k independent items,
    # each positive with probability p, is positive.
    positive_fraction = 1 - np.power(1 - p, pool_size)
    positive_pools = positive_fraction * pool_count
    return np.ceil(pool_count + positive_pools * pool_size)

def expected_writes(N,k,p):
    """Number of pools needed to hold N items at pool size k (rounded up).

    Parameters
    ----------
    N : number of items.
    k : pool size; a value of 0 means "one pool holding all N items".
    p : accepted for signature parity with the sibling functions; unused here.

    Returns
    -------
    numpy float: ceil(N / k).
    """
    pool_size = k if k != 0.0 else N
    return np.ceil(N / pool_size)

def expected_scrubbing(N,k,p):
    """Expected number of per-item operations caused by positive pools.

    This is the second term of ``expected_operations``: k operations for each
    pool expected to be positive, rounded up.

    Parameters
    ----------
    N : number of items.
    k : pool size; a value of 0 means "one pool holding all N items".
    p : per-item probability used in the positive-pool term.

    Returns
    -------
    numpy float: ceil(positive_pools * k).
    """
    pool_size = N if k == 0.0 else k
    pool_count = np.ceil(N / pool_size)
    # Expected count of pools containing at least one positive item.
    positive_pools = (1 - np.power(1 - p, pool_size)) * pool_count
    return np.ceil(positive_pools * pool_size)

def optimal_pool_size(N,p):
    """Pool size chosen for probability p via the power law 1.24 * p**-0.466.

    Parameters
    ----------
    N : number of items; returned unchanged when p is not positive.
    p : per-item probability.

    Returns
    -------
    N when p <= 0, otherwise ceil(1.24 * p**-0.466) as a numpy float.
    The result is not capped at N.
    """
    # NOTE(review): the coefficient and exponent look like an empirical fit;
    # their source is not shown in this notebook.
    coefficient, exponent = 1.24, -0.466
    return N if p <= 0.0 else np.ceil(coefficient * np.power(p, exponent))

In [36]:
# Re-declares the same values already set in the configuration cell, so this
# cell can be edited and re-run on its own before the cells below.
p=0.005        # written into the "p" column of regular file rows
p_meta = 0.8   # written into the "p" column of the synthetic ".meta" rows

In [37]:
# Scan the corpus directory; the result is used as a table with one row per
# entry and (at least) the columns "folder", "name" and "extension".
df = folderstats.folderstats(folder, ignore_hidden=True,hash_name="sha256")
# Keep only non-folder rows. The explicit .copy() makes the filtered frame
# independent of the scan result, so the column assignment below cannot
# trigger SettingWithCopyWarning or write into a temporary slice.
df = df.loc[df["folder"].eq(False)].copy()
print(df.shape[0])
df["p"]=p
# Build one synthetic "<name>.meta" row per file, carrying p_meta instead of p
# and grouped under the pseudo-extension "metadata".
metadata = df.copy()
metadata["p"]=p_meta
metadata["name"]=metadata["name"]+".meta"
metadata["extension"]= "metadata"
df = pd.concat([df,metadata],ignore_index=True)
# Number of rows per extension (files plus the synthetic metadata rows).
df["extension"].value_counts()


1560


metadata    1560
xml          986
pdf          106
md            74
mov           61
            ... 
sha1           1
qpw            1
123            1
opml           1
cdd            1
Name: extension, Length: 91, dtype: int64

In [38]:
# Per-extension summary: row count (column "N") next to the mean of "p".
s1 = df.extension.value_counts()
s1 = s1.rename("N")
s2 = df.groupby("extension")["p"].mean()
df_ = pd.concat((s1, s2), axis="columns")
df_.head()

Unnamed: 0,N,p
metadata,1560,0.8
xml,986,0.005
pdf,106,0.005
md,74,0.005
mov,61,0.005


In [39]:
def h(p,p_meta):
    """Expected operations when the whole corpus is pooled as one population.

    Rescans ``folder``, adds one synthetic ".meta" row per file, and then
    evaluates ``expected_operations`` on the total row count, using the mean
    of the per-extension probabilities and the pool size that
    ``optimal_pool_size`` picks for them. This is the same derivation
    ``efficiency`` uses for its S_h figures.

    NOTE(review): the original body returned ``expected_operations(N,k,p)``
    with ``N`` and ``k`` never defined (NameError on a fresh kernel) and
    discarded a per-extension "T" column. The derivation below is assumed
    from ``efficiency``; confirm it matches the intended definition.

    Parameters
    ----------
    p : probability assigned to every regular file row.
    p_meta : probability assigned to every synthetic metadata row.

    Returns
    -------
    numpy float: expected number of operations for the pooled corpus.
    """
    df = folderstats.folderstats(folder, ignore_hidden=True,hash_name="sha256")
    # .copy() so the column assignment below works on an independent frame.
    df = df[df.folder==False].copy()
    df["p"]=p
    metadata = df.copy()
    metadata["p"]=p_meta
    metadata["name"]=metadata["name"]+".meta"
    metadata["extension"]= "metadata"
    df = pd.concat([df,metadata],ignore_index=True)

    s1 = df["extension"].value_counts().rename("N")
    s2 = df.groupby(["extension"])["p"].mean()
    df_ = pd.concat([s1, s2], axis=1)

    # Treat all rows as a single population with one shared probability.
    N = df_["N"].sum()
    p_mean = df_["p"].mean()
    k = optimal_pool_size(N, p_mean)
    return expected_operations(N, k, p_mean)

In [40]:
def efficiency(p,p_meta):
    """Compare per-extension pooling (S_cs) and whole-corpus pooling (S_h) with pool size 1 (S_i).

    Rescans ``folder``, adds one synthetic ".meta" row per file, summarises the
    rows per extension and evaluates the cost functions for each strategy.

    Parameters
    ----------
    p : probability assigned to every regular file row.
    p_meta : probability assigned to every synthetic metadata row.

    Returns
    -------
    list of four ratios, each "S_i total / strategy total":
    [operations ratio for S_cs, writes ratio for S_cs,
     operations ratio for S_h,  writes ratio for S_h].
    """
    df = folderstats.folderstats(folder, ignore_hidden=True,hash_name="sha256")
    # .copy() so the column assignment below works on an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning).
    df = df[df.folder==False].copy()
    df["p"]=p
    metadata = df.copy()
    metadata["p"]=p_meta
    metadata["name"]=metadata["name"]+".meta"
    metadata["extension"]= "metadata"
    df = pd.concat([df,metadata],ignore_index=True)

    # One row per extension: row count N and mean probability p.
    s1 = df["extension"].value_counts().rename("N")
    s2 = df.groupby(["extension"])["p"].mean()
    df_ = pd.concat([s1, s2], axis=1)
    df_["k"] = df_.apply(lambda row: optimal_pool_size(row.N,row.p),axis=1)
    # S_i: pool size 1 within each extension.
    df_["T(S_i)"] = df_.apply(lambda row:  expected_operations(row.N,1,row.p),axis=1)
    df_["TC(S_i)"] = df_.apply(lambda row:  expected_writes(row.N,1,row.p),axis=1)
    # S_cs: each extension pooled with its own pool size k.
    df_["T(S_cs)"] = df_.apply(lambda row:  expected_operations(row.N,row.k,row.p),axis=1)
    df_["TC(S_cs)"] = df_.apply(lambda row:  expected_writes(row.N,row.k,row.p),axis=1)

    individual_operations = df_["T(S_i)"].sum()
    individual_writes = df_["TC(S_i)"].sum()

    efficiency_cs = individual_operations/df_["T(S_cs)"].sum()
    cost_efficiency_cs = individual_writes/df_["TC(S_cs)"].sum()

    # S_h: all rows as one population. The shared probability is the plain
    # mean over extensions; it is kept in its own local rather than rebinding
    # the parameter `p`.
    total_n = df_["N"].sum()
    mean_p = df_["p"].mean()
    pooled_k = optimal_pool_size(total_n,mean_p)
    efficiency_h = individual_operations/expected_operations(total_n,pooled_k,mean_p)
    cost_efficiency_h = individual_writes/expected_writes(total_n,pooled_k,mean_p)

    return[efficiency_cs,cost_efficiency_cs,efficiency_h,cost_efficiency_h]

# Show the first 15 rows of the notebook-level per-extension summary.
df_.head(15)

Unnamed: 0,N,p
metadata,1560,0.8
xml,986,0.005
pdf,106,0.005
md,74,0.005
mov,61,0.005
java,47,0.005
zip,17,0.005
txt,14,0.005
doc,13,0.005
jp2,12,0.005


In [41]:
# Sweep 40 paired settings: p rises from 0.001 to 0.05 while p_meta falls
# from 0.99 to 0.8.
df = pd.DataFrame({"p":np.linspace(0.001,0.05,40),"p_meta":np.linspace(0.99,0.8,40)})
# efficiency() rescans and hashes the corpus on every call, so call it once
# per row and expand its four return values into columns, instead of calling
# it four times per row and keeping a single element each time.
metric_columns = ["E(S_cs)", "C(S_cs)", "E(S_h)", "C(S_h)"]
metrics = df.apply(lambda row: efficiency(row.p,row.p_meta), axis=1, result_type="expand")
metrics.columns = metric_columns
df = pd.concat([df, metrics], axis=1)
df.head()

Unnamed: 0,p,p_meta,E(S_cs),C(S_cs),E(S_h),C(S_h)
0,0.001,0.99,1.761923,3.411246,7.176024,9.980645
1,0.002256,0.985128,1.735121,3.344865,6.825145,9.980645
2,0.003513,0.980256,1.714649,3.298507,6.551389,8.994186
3,0.004769,0.975385,1.694245,3.243187,6.288385,8.994186
4,0.006026,0.970513,1.680243,3.222917,6.044987,8.994186
