In [1]:
import warnings
# Suppress every warning category for the rest of the kernel session; this
# applies to all later cells, so deprecation/convergence warnings are hidden too.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import random
from random import shuffle

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

In [3]:
# Read the merged dataset (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv("./clean_data/fully_merged_data.csv")
df.head()

Unnamed: 0,sex,age,day,month,MEDHINC_CY,WLTHINDXCY,time_arr,TOTHH_CY,fel_misd
0,M,21,4,0,25336.0,27.0,2310,1309.0,M
1,M,21,4,0,25336.0,27.0,2310,1309.0,M
2,M,24,5,0,25427.0,30.0,16,1359.0,M
3,M,24,5,0,25427.0,30.0,16,1359.0,M
4,M,21,4,0,25336.0,27.0,2310,1309.0,M


In [4]:
# Continuous columns that get standardised: subtract the column mean and divide
# by the column standard deviation.
need_norm = ["age","MEDHINC_CY", "WLTHINDXCY", "TOTHH_CY", "time_arr"]
numeric_part = df[need_norm]
norm = numeric_part.sub(numeric_part.mean()).div(numeric_part.std())
# Sanity check: per-column mean, per-column std, and a preview of the result.
norm.mean(), norm.std(), norm.head()

(age           1.486255e-16
 MEDHINC_CY    1.008381e-16
 WLTHINDXCY   -1.062775e-16
 TOTHH_CY     -5.334798e-17
 time_arr     -4.237497e-17
 dtype: float64,
 age           1.0
 MEDHINC_CY    1.0
 WLTHINDXCY    1.0
 TOTHH_CY      1.0
 time_arr      1.0
 dtype: float64,
         age  MEDHINC_CY  WLTHINDXCY  TOTHH_CY  time_arr
 0 -1.008905   -0.796520   -0.501925 -0.448262  1.432337
 1 -1.008905   -0.796520   -0.501925 -0.448262  1.432337
 2 -0.760608   -0.789672   -0.372174 -0.411262 -1.776615
 3 -0.760608   -0.789672   -0.372174 -0.411262 -1.776615
 4 -1.008905   -0.796520   -0.501925 -0.448262  1.432337)

In [5]:
# Overwrite the columns listed in `need_norm` with the values held in `norm`
# (this mutates `df`; the original unscaled values are no longer available).
df[need_norm] = norm
df.head()

Unnamed: 0,sex,age,day,month,MEDHINC_CY,WLTHINDXCY,time_arr,TOTHH_CY,fel_misd
0,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M
1,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M
2,M,-0.760608,5,0,-0.789672,-0.372174,-1.776615,-0.411262,M
3,M,-0.760608,5,0,-0.789672,-0.372174,-1.776615,-0.411262,M
4,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M


In [6]:
# Discard every row that has a missing value in any column, then preview.
df = df.dropna()
df.head()

Unnamed: 0,sex,age,day,month,MEDHINC_CY,WLTHINDXCY,time_arr,TOTHH_CY,fel_misd
0,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M
1,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M
2,M,-0.760608,5,0,-0.789672,-0.372174,-1.776615,-0.411262,M
3,M,-0.760608,5,0,-0.789672,-0.372174,-1.776615,-0.411262,M
4,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M


In [7]:
# Remove rows whose fel_misd entry is a blank placeholder: a single space or a
# non-breaking space ('\xa0'). These survive dropna() because they are strings.
for placeholder in (' ', '\xa0'):
    arr = df.index[df["fel_misd"] == placeholder]
    df = df.drop(arr, axis=0)

In [8]:
# Count how many rows remain for each distinct fel_misd value.
df["fel_misd"].value_counts()

M    46803
F    16407
C     2194
S      240
P       50
Name: fel_misd, dtype: int64

In [57]:
def _sample_by_frequency(column, samples, rng):
    # Draw `samples` values from `column`, weighting each distinct value by its
    # observed share of the (non-null) entries.
    counts = column.value_counts()
    weights = counts.to_numpy() / counts.sum()
    return rng.choice(counts.index.to_list(), p=weights, size=samples)


def gen_rand_df(temp_df, samples=1, rng=None):
    """Generate `samples` synthetic rows that mimic the rows of `temp_df`.

    Assumptions carried over from the caller's side:
      * `temp_df` holds rows of a single fel_misd class and is not one-hot
        encoded.
      * age, MEDHINC_CY, WLTHINDXCY, time_arr and TOTHH_CY were normalized
        before calling this function, which is why they are drawn from a
        standard normal distribution here.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Source rows; must contain the columns 'sex', 'day', 'month' and
        'fel_misd'.
    samples : int, default 1
        Number of synthetic rows to generate.
    rng : optional
        Object exposing ``choice`` and ``normal`` (e.g. the result of
        ``np.random.default_rng(seed)``). Defaults to the global ``np.random``
        state, which matches the previous behavior.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: sex, day, month, age, MEDHINC_CY, WLTHINDXCY,
        time_arr, TOTHH_CY, fel_misd.

    Raises
    ------
    ValueError
        If `temp_df` has no rows (there is nothing to sample from).
    """
    if len(temp_df) == 0:
        raise ValueError("temp_df must contain at least one row")

    rng = np.random if rng is None else rng

    d = {}

    # Categorical features: resample according to their empirical frequencies.
    for col in ('sex', 'day', 'month'):
        d[col] = _sample_by_frequency(temp_df[col], samples, rng)

    # Continuous features: independent N(0, 1) draws, one row of `x` per column.
    x = rng.normal(0, 1, size=(5, samples))
    continuous = ('age', 'MEDHINC_CY', 'WLTHINDXCY', 'time_arr', 'TOTHH_CY')
    for col, draws in zip(continuous, x):
        d[col] = draws

    # Label every synthetic row with the class of the input frame. This reads
    # from the `temp_df` argument (the earlier version read a global `temp`).
    d['fel_misd'] = [temp_df['fel_misd'].iloc[0]] * samples

    return pd.DataFrame.from_dict(d)
    
    

            
    

# NOTE(review): `temp` is not assigned in any cell visible above — presumably
# it is a single-class slice of `df` (the displayed output shows fel_misd 'P').
# Confirm it is defined before this cell so the notebook survives
# Restart Kernel -> Run All.
gen_rand_df(temp,10)

Unnamed: 0,sex,day,month,age,MEDHINC_CY,WLTHINDXCY,time_arr,TOTHH_CY,fel_misd
0,F,4,6,0.87332,-1.308341,0.271505,-1.327692,0.283206,P
1,M,0,6,-0.2096,-0.476479,-0.182288,0.827606,-0.906256,P
2,M,2,0,-2.031418,-0.74441,-0.671339,-0.211341,0.844594,P
3,M,3,8,-0.11148,0.381095,0.897973,0.477631,0.039671,P
4,M,2,1,0.643421,-0.936237,0.979847,0.664988,-0.362366,P
5,F,3,9,-0.023214,-0.153399,-1.849027,0.373373,-0.511948,P
6,F,0,2,-0.627902,-1.423969,0.503124,-0.765506,-0.574967,P
7,M,0,9,1.282151,-1.118574,-1.044571,0.46237,-0.858681,P
8,F,4,0,0.743993,-0.651239,-0.493504,0.182836,-0.20254,P
9,M,0,0,0.650971,-0.604406,-0.216758,-0.15992,1.741904,P
