# IMPORTS 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from matplotlib.colors import LogNorm
import datetime
import time

# DATA 

## DATAFRAMES 

In [3]:
# Directory that holds the pre-computed histogram statistics CSV files.
df_base = '/eos/user/p/plawski/SWAN_projects/RzymskiKociolek/Histograms/analysis_multi_single/code/single/plots/statistics/'

# Each CSV is read in the order listed and bound to the matching name.
(
    CHI2N_DF,
    DIST_AVG_DF,
    EXEC_TIME_DF,
    TOTAL_STRIP_20_DF,
    TOTAL_STRIP_22_DF,
    X_STRIP_20_DF,
    X_STRIP_22_DF,
    Y_STRIP_20_DF,
    Y_STRIP_22_DF,
) = (
    pd.read_csv(df_base + csv_name)
    for csv_name in (
        'DF_NAME_CHI2N.csv',
        'DF_NAME_DIST_AVG.csv',
        'DF_NAME_EXEC_TIME.csv',
        'DF_NAME_TOTAL_STRIP_20.csv',
        'DF_NAME_TOTAL_STRIP_22.csv',
        'DF_NAME_X_STRIP_20.csv',
        'DF_NAME_X_STRIP_22.csv',
        'DF_NAME_Y_STRIP_20.csv',
        'DF_NAME_Y_STRIP_22.csv',
    )
)

## COLUMNS 

In [4]:
# Column names found in the statistics DataFrames (see the
# TOTAL_STRIP_22_DF preview further down for an example table).
(
    BIN_PERCENT_COL,
    BIN_STACK_PERCENT_COL,
    BIN_STARTS_COL,
    BIN_VALUES_COL,
) = ('bin_percent', 'bin_stack_percent', 'bin_starts', 'bin_values')

# STATS 

## CHI2 stats 

In [5]:
def get_number_of_bins_for_percent(df, bin_size, n, use_bin_size=True):
    """Locate the first bin whose cumulative percentage exceeds ``n``.

    The first row (in DataFrame order) with ``bin_stack_percent > n`` is
    used; the comparison is strict.

    Args:
        df: DataFrame with 'bin_stack_percent' and 'bin_starts' columns.
        bin_size: Width of one bin, in the same units as 'bin_starts'.
        n: Cumulative-percent threshold.
        use_bin_size: When True, return the matching bin's start expressed
            as a bin count (``start / bin_size``, truncated, plus one).
            When False, return ``start + bin_size`` instead.

    Returns:
        None when no row exceeds ``n``; otherwise an int bin count
        (``use_bin_size=True``) or the value ``start + bin_size``.
    """
    above = df.loc[df['bin_stack_percent'] > n]
    if above.empty:
        return None

    first_start = above.iloc[0]['bin_starts']
    if not use_bin_size:
        return first_start + bin_size

    # Plain int() truncation turns e.g. 0.7 / 0.1 == 6.999999999999999 into 6.
    # Rounding to 9 decimals first absorbs that float noise while keeping
    # truncation for starts that really fall between bin boundaries.
    bin_index = int(round(float(first_start / bin_size), 9))
    return bin_index + 1

In [11]:
def get_stack_percent_first_n_bins(df, n):
    """Return the 'bin_stack_percent' entry stored under index ``n`` of ``df``."""
    cumulative_percent = df['bin_stack_percent']
    return cumulative_percent[n]

In [49]:
def get_useful_data(df, df_name, use_bin_size=True, rpID=20):
    """Print one tab-separated summary row for a cumulative histogram.

    The row contains: ``df_name``, the bin size (difference between the
    first two 'bin_starts' entries), the cumulative percentages stored at
    indices 1, 2, 5 and 10, and the result of
    ``get_number_of_bins_for_percent`` for four percent thresholds chosen
    by ``rpID``.

    Args:
        df: DataFrame with 'bin_starts' and 'bin_stack_percent' columns.
        df_name: Label printed in the first column.
        use_bin_size: Forwarded to ``get_number_of_bins_for_percent``.
        rpID: Selects the thresholds: 20 -> 50/99/99.9/99.99 %,
            22 or -1 -> 50/90/95/99 %. Any other value prints nothing.

    Returns:
        None. The row is written to stdout.
    """
    # rpID 22 and -1 deliberately share the same threshold layout.
    thresholds_by_rp = {
        20: (50.0, 99.0, 99.9, 99.99),
        22: (50.0, 90.0, 95.0, 99.0),
        -1: (50.0, 90.0, 95.0, 99.0),
    }
    thresholds = thresholds_by_rp.get(rpID)
    if thresholds is None:
        return

    bin_size = df['bin_starts'][1] - df['bin_starts'][0]
    stack_percents = [
        get_stack_percent_first_n_bins(df, index) for index in (1, 2, 5, 10)
    ]
    percent_bins = [
        get_number_of_bins_for_percent(
            df, bin_size, threshold, use_bin_size=use_bin_size
        )
        for threshold in thresholds
    ]

    print("{}\t{}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{}\t{}\t{}\t{}".format(
        df_name, bin_size, *stack_percents, *percent_bins))

In [35]:
# Summary rows for the rpID 20 strip histograms.
print("df_name\t\tbin_size\t1_bin\t2_bin\t5_bin\t10_bin\t50%\t99%\t99.9%\t99.99%")
# rpID is passed by keyword: the third positional parameter of
# get_useful_data is use_bin_size, not rpID.
get_useful_data(TOTAL_STRIP_20_DF, "TOTAL_STRIP_20_DF", rpID=20)
get_useful_data(X_STRIP_20_DF, "X_STRIP_20_DF\t", rpID=20)
get_useful_data(Y_STRIP_20_DF, "Y_STRIP_20_DF\t", rpID=20)

df_name		bin_size	1_bin	2_bin	5_bin	10_bin	50%	99%	99.9%	99.99%
TOTAL_STRIP_20_DF	1.0	3.06	98.18	99.94	99.95	3	4	5	None
X_STRIP_20_DF		1.0	95.72	99.90	99.95	99.96	2	3	3	97
Y_STRIP_20_DF		1.0	6.79	99.45	99.95	99.96	3	3	4	None


In [36]:
# Summary rows for the rpID 22 strip histograms.
print("df_name\t\tbin_size\t1_bin\t2_bin\t5_bin\t10_bin\t50%\t90%\t95%\t99%")
for _frame, _label in (
    (TOTAL_STRIP_22_DF, "TOTAL_STRIP_22_DF"),
    (X_STRIP_22_DF, "X_STRIP_22_DF\t"),
    (Y_STRIP_22_DF, "Y_STRIP_22_DF\t"),
):
    get_useful_data(_frame, _label, rpID=22)

df_name		bin_size	1_bin	2_bin	5_bin	10_bin	50%	90%	95%	99%
TOTAL_STRIP_22_DF	1.0	12.72	84.60	85.17	85.66	3	25	30	None
X_STRIP_22_DF		1.0	84.61	85.22	85.67	86.38	2	18	21	None
Y_STRIP_22_DF		1.0	17.42	85.72	86.53	87.83	3	16	20	None


In [50]:
# Summary rows for the chi2/n, average-distance and execution-time histograms.
# The third tuple element is the use_bin_size flag for that row.
print("df_name\t\tbin_size\t1_bin\t2_bin\t5_bin\t10_bin\t50%\t90%\t95%\t99%")
for _frame, _label, _as_bin_count in (
    (CHI2N_DF, "CHI2N_DF", True),
    (DIST_AVG_DF, "DIST_AVG_DF", False),
    (EXEC_TIME_DF, "EXEC_TIME_DF", False),
):
    get_useful_data(_frame, _label, rpID=-1, use_bin_size=_as_bin_count)

df_name		bin_size	1_bin	2_bin	5_bin	10_bin	50%	90%	95%	99%
CHI2N_DF	1.0	74.48	83.86	93.71	96.61	1	5	8	None
DIST_AVG_DF	0.1	97.65	97.96	98.61	99.44	0.1	0.1	0.1	0.9
EXEC_TIME_DF	0.01	0.69	41.26	78.82	99.90	0.04	0.07	0.08	0.08


In [15]:
# Show the table loaded from DF_NAME_TOTAL_STRIP_22.csv as the cell output.
TOTAL_STRIP_22_DF

Unnamed: 0.1,Unnamed: 0,bin_percent,bin_stack_percent,bin_starts,bin_values
0,0,0.000000,0.000000,0.0,0.0
1,1,12.719558,12.719558,1.0,58997.0
2,2,71.881879,84.601437,2.0,333409.0
3,3,0.305501,84.906938,3.0,1417.0
4,4,0.134317,85.041254,4.0,623.0
5,5,0.130436,85.171690,5.0,605.0
6,6,0.093785,85.265475,6.0,435.0
7,7,0.103486,85.368961,7.0,480.0
8,8,0.111679,85.480640,8.0,518.0
9,9,0.097665,85.578306,9.0,453.0
