---
# Visualize the Splits of the Classes
- Decrease the variance in the number of datapoints per class
    - Do not overpower the real data with generated data
    - $\frac{1}{3}$ of each undersized class will be generated data
        - 1 part original data to 0.5 part generated data
- Depends on pandas, numpy, matplotlib, and scipy

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from random import shuffle

import matplotlib.pyplot as plt
import scipy.stats as stats

---
## Read CSV to DF
- Apply z-score normalization to the columns that need it

In [None]:
# Load the merged dataset and discard every row that has a missing value.
df = pd.read_csv("./clean_data/fully_merged_data.csv").dropna()

# Remove rows whose class label is blank: either a plain space or a
# non-breaking space ('\xa0').
for blank_label in (' ', '\xa0'):
    arr = df.index[df["fel_misd"] == blank_label]
    df = df.drop(arr, axis=0)

from scipy.stats import zscore

# Z-score each listed column independently (apply works column by column).
# NOTE(review): the gen_rand_df docstring later in this file expects
# time_arr to be normalized as well, but it is not in this list -- confirm.
need_norm = ["age","MEDHINC_CY", "WLTHINDXCY", "TOTHH_CY"]
norm = df.loc[:, need_norm].apply(zscore)
df[need_norm] = norm

df

---
## Get the Class Counts: Show Stats and Boxplot 

In [None]:
# Tally how many rows fall into each fel_misd class.
df["fel_misd"].value_counts()

In [None]:
# Summarize the distribution of class sizes: quartiles, IQR and the
# 1.5 * IQR fences beyond which a class size counts as an outlier.
data = df["fel_misd"].value_counts()
q1, median, q3 = np.percentile(data, [25, 50, 75])
iqr = stats.iqr(data)
print(
    f"{median=}\n{q1=}\n{q3=}\n{iqr=}",
    f"Lower outliers: {q1-1.5*iqr}",
    f"Upper outliers: {q3+1.5*iqr}",
    sep="\n",
)

In [None]:
# Box plot of the values currently held in `data` (the per-class counts).
# The trailing semicolon keeps the notebook from echoing the return value.
plt.boxplot(data);

---
## Randomly Decrease the Oversize Classes
- Oversized classes end up the same size as the median class
- M: 46803 $\to$ 2194
- F: 16407 $\to$ 2194
- C: 2194  $\to$ 2194
- S: 240   $\to$  240
- P: 50    $\to$   50
- Visualize

In [None]:
# Collect the row labels of the two oversized classes (M and F) and put
# each list in random order, so that a later slice of either list is a
# random subset of that class.
# NOTE: shuffle is unseeded here, so the selection differs between runs.
m_arr = df[df["fel_misd"] == 'M'].index.tolist()
f_arr = df[df["fel_misd"] == 'F'].index.tolist()

for class_labels in (m_arr, f_arr):
    shuffle(class_labels)

In [None]:
# Downsample the oversized classes (M and F) to the size of the median
# class. The target is derived from the data instead of being hard-coded,
# so the cell stays correct if the input file changes.
target_size = int(df["fel_misd"].value_counts().median())

# m_arr / f_arr hold row labels in shuffled order, so dropping a prefix
# removes a random subset. max(..., 0) keeps the slice end from going
# negative (which would drop almost the whole class) when a class is
# already at or below the target.
m_excess = m_arr[:max(len(m_arr) - target_size, 0)]
f_excess = f_arr[:max(len(f_arr) - target_size, 0)]

temp = df.drop(m_excess, axis=0)
temp = temp.drop(f_excess, axis=0)
temp['fel_misd'].value_counts()

In [None]:
# Recompute the class-size summary on the downsampled frame `temp`.
data = temp["fel_misd"].value_counts()
quartiles = np.percentile(data, [25, 50, 75])
q1, median, q3 = quartiles
iqr = stats.iqr(data)

# Quartile summary first, then the 1.5 * IQR outlier fences.
print(f"{median=}\n{q1=}\n{q3=}\n{iqr=}")
for fence_line in (
    f"Lower outliers: {q1-1.5*iqr}",
    f"Upper outliers: {q3+1.5*iqr}",
):
    print(fence_line)

In [None]:
# Box plot of the values currently held in `data` (the per-class counts).
# The trailing semicolon keeps the notebook from echoing the return value.
plt.boxplot(data);

---
## Randomly Generate Data For Undersized Class
- M: 2194  $\to$ 2194
- F: 2194  $\to$ 2194
- C: 2194  $\to$ 2194
- S: 240   $\to$  360
- P: 50    $\to$   75

In [None]:
def gen_rand_df(temp_df: pd.DataFrame, samples: int = 1) -> pd.DataFrame:
    '''
    Generate synthetic rows that mimic a single fel_misd class.

    Assumes that temp_df is only populated with one fel_misd class and has
    no one-hot encoding. age, MEDHINC_CY, WLTHINDXCY, time_arr and TOTHH_CY
    should be normalized prior to calling this function, because their
    generated values are drawn from a standard normal N(0, 1) and are not
    fitted to temp_df.

    Parameters
    ----------
    temp_df : rows of one class; must contain the columns sex, day, month
        and fel_misd.
    samples : number of synthetic rows to generate.

    Returns
    -------
    DataFrame with `samples` rows and the columns sex, day, month, age,
    MEDHINC_CY, WLTHINDXCY, time_arr, TOTHH_CY, fel_misd (in that order).
    The index is a fresh 0..samples-1 range.

    Raises
    ------
    ValueError : if temp_df has no rows (there is nothing to sample from).
    '''
    if temp_df.empty:
        raise ValueError("temp_df must contain at least one row")

    d = {}

    # Categorical columns: draw values with the same relative frequencies
    # that are observed in temp_df.
    for column in ('sex', 'day', 'month'):
        freq = temp_df[column].value_counts(normalize=True)
        d[column] = np.random.choice(
            freq.index.to_list(), p=freq.to_numpy(), size=samples
        )

    # Numeric columns: one row of independent N(0, 1) draws per column.
    numeric_columns = ('age', 'MEDHINC_CY', 'WLTHINDXCY', 'time_arr', 'TOTHH_CY')
    draws = np.random.normal(0, 1, size=(len(numeric_columns), samples))
    for column, values in zip(numeric_columns, draws):
        d[column] = values

    # Every generated row carries the class label of the first input row.
    d['fel_misd'] = [temp_df['fel_misd'].iloc[0]] * samples

    return pd.DataFrame.from_dict(d)
    
    

            
# Generate 0.5 synthetic rows per real row for each undersized class, so
# that generated data makes up one third of the class afterwards. The
# amounts are derived from the actual class sizes in `temp` rather than
# from hard-coded counts.
GENERATED_RATIO = 0.5

inp = temp[temp['fel_misd'] == 'S']
s_amt = int(GENERATED_RATIO * len(inp))
s_temp = gen_rand_df(inp, s_amt)

inp = temp[temp['fel_misd'] == 'P']
p_amt = int(GENERATED_RATIO * len(inp))
p_temp = gen_rand_df(inp, p_amt)

---
## Show New Boxplot of Increased Classes

In [None]:
# Append the generated S and P rows to the downsampled data.
# s_temp and p_temp each carry their own 0..n-1 index, so a plain concat
# would produce duplicate row labels; ignore_index=True renumbers the
# result so label-based drop/loc operations stay unambiguous.
df = pd.concat([temp, s_temp, p_temp], ignore_index=True)
df['fel_misd'].value_counts()

In [None]:
# Box plot of the per-class row counts of the current df.
# The trailing semicolon keeps the notebook from echoing the return value.
plt.boxplot(df['fel_misd'].value_counts());