In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.svm import LinearSVC
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
from tqdm import tqdm

In [3]:
# Number of synthetic observations to draw; also used to size the 30/70 test/train split.
size = 100

In [4]:
def sigmoid(x, alpha):
    """Logistic curve shifted along the x-axis by ``alpha``.

    Evaluates ``1 / (1 + exp(alpha - x))``, so the result is exactly 0.5 where
    ``x == alpha``, below 0.5 for smaller ``x`` and above 0.5 for larger ``x``.
    ``x`` goes through ``np.exp``, so a NumPy array is processed elementwise.
    """
    return 1 / (1 + np.exp(alpha - x))

In [5]:
# Synthetic data set with one continuous (x_1) and one binary categorical (x_2) predictor.
# Seed the global NumPy RNG so that a fresh "Restart & Run All" reproduces the same
# draws (and therefore the same split and downstream numbers).
np.random.seed(42)
x_1 = np.random.normal(40, 10, size = size)
x_2 = np.random.binomial(1,0.65, size = size)
# y is the sigmoid of x_1 + 20*x_2 (midpoint at 50) rounded to a 0/1 label.
y = np.around(sigmoid(x_1+x_2*20, alpha = 50)).astype(int)
synth_cat = pd.DataFrame({"y": y, "x_1":x_1, "x_2":x_2})
# First 30% of the rows form the test set, the remaining 70% the training set.
n_test = round(0.3*size)
synth_cat_test = synth_cat.iloc[:n_test,:]
synth_cat_train = synth_cat.iloc[n_test:,:]

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
def _mask_mar(data, missing_col, missing_pct):
    """Return `missing_col` with NaN where a model-predicted value lies in the outer tails.

    The column is modelled from all other columns (logistic regression when it has
    exactly two distinct values, linear regression otherwise). Rows whose prediction is
    at or below the `missing_pct / 2` percentile, or at or above the
    `100 - missing_pct / 2` percentile, are blanked.
    """
    if missing_pct == 0:
        # Percentiles 0 and 100 are the min/max prediction; the inclusive comparisons
        # below would still blank those rows, so nothing is removed for 0%.
        return data[missing_col]

    predictors = data.drop(missing_col, axis = 1)
    if data[missing_col].nunique() == 2:
        clf = LogisticRegression(random_state=0).fit(predictors, data[missing_col])
        preds = clf.predict_proba(predictors)[:, 1]
    else:
        clf = LinearRegression().fit(predictors, data[missing_col])
        preds = clf.predict(predictors)

    # True division: floor division would silently lose half a percent for odd values.
    half_pct = missing_pct / 2
    lower_percentile = np.percentile(preds, half_pct)
    upper_percentile = np.percentile(preds, 100 - half_pct)
    in_tail = (preds <= lower_percentile) | (preds >= upper_percentile)
    return data[missing_col].mask(in_tail, other = np.nan)


def _mask_mcar(data, missing_col, missing_pct):
    """Return `missing_col` with each value independently blanked with prob. `missing_pct/100`."""
    drop_flags = np.random.binomial(n=1, p=missing_pct/100, size = len(data))
    # Compare the 0/1 flags directly: a test of the form `value == np.nan` is always
    # False, so it can never select any row.
    return data[missing_col].mask(drop_flags == 1, other = np.nan)


def _iterative_impute(data, estimator = None, skip_complete = False):
    """Fit an IterativeImputer on `data` and return the imputed frame.

    The result keeps the column names and the row index of `data`.
    """
    imputer_kwargs = {"random_state": 0}
    if estimator is not None:
        imputer_kwargs["estimator"] = estimator
    if skip_complete:
        imputer_kwargs["skip_complete"] = True
    imputer = IterativeImputer(**imputer_kwargs)
    imputer.fit(data)
    # Pass the index explicitly; otherwise the rows are relabelled 0..n-1.
    return pd.DataFrame(imputer.transform(data), columns = data.columns, index = data.index)


def data_remover_cat(full_data, missing_col, missing_pct, missing = "mar", impute = "cca"):
    """Remove values from one column of a copy of `full_data`, then impute them.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Complete data; it is copied and never modified.
    missing_col : str
        Column in which values are removed.
    missing_pct : number
        Target share of removed values, in the range 0 to 100.
    missing : str
        "mar" removes the rows whose model-predicted `missing_col` value lies in the
        outer tails (missingness depends on the other columns). Any other value
        removes each row independently with probability `missing_pct / 100`.
    impute : str
        "cca"      - drop every row that contains a NaN.
        "mean"     - fill with the mode (two-valued column) or the mean.
        "reg"      - not implemented; the data is returned with its NaNs.
        "mice_def" - IterativeImputer with its default estimator; a two-valued
                     column is rounded back to its observed range.
        "mice_reg" - IterativeImputer with LogisticRegression (two-valued column)
                     or LinearRegression.
        Any other value returns the data with its NaNs.

    Returns
    -------
    pandas.DataFrame
        The processed copy. The MICE options return all columns as floats.
    """
    data = full_data.copy()

    if missing == "mar":
        data[missing_col] = _mask_mar(data, missing_col, missing_pct)
    else:
        data[missing_col] = _mask_mcar(data, missing_col, missing_pct)

    # Decide this BEFORE imputing: nunique() ignores NaN, but once continuous values
    # have been imputed the column no longer has exactly two distinct values.
    is_binary = data[missing_col].nunique() == 2

    if impute == "cca":
        data = data.dropna(axis = 0)
    elif impute == "mean":
        if is_binary:
            fill_value = data[missing_col].mode(dropna = True)[0]
        else:
            fill_value = data[missing_col].mean(skipna = True)
        data[missing_col] = data[missing_col].fillna(fill_value)
    elif impute == "reg":
        # Placeholder: single regression imputation is not implemented yet.
        pass
    elif impute == "mice_def":
        observed_min = data[missing_col].min()
        observed_max = data[missing_col].max()
        data = _iterative_impute(data)
        if is_binary:
            # Round the continuous imputations to a label and keep them inside the
            # observed range so no third value (e.g. -1 or 2) can appear.
            data[missing_col] = data[missing_col].round().clip(lower = observed_min,
                                                                upper = observed_max)
    elif impute == "mice_reg":
        if is_binary:
            model = LogisticRegression(random_state=0, max_iter=300)
            # A classifier cannot be fit on continuous targets, so only columns that
            # actually contain missing values get an estimator.
            data = _iterative_impute(data, estimator = model, skip_complete = True)
        else:
            data = _iterative_impute(data, estimator = LinearRegression())
    return data




In [7]:
# Share of each x_2 level in the training split before any values are removed.
n_train = len(synth_cat_train)
share_one_before = len(synth_cat_train[synth_cat_train["x_2"]==1])/n_train
share_zero_before = len(synth_cat_train[synth_cat_train["x_2"]==0])/n_train
print("Class 1: ", share_one_before, "\nClass 2: ", share_zero_before)

# Same shares after removing 30% of x_2 with the function defaults
# (missing="mar", impute="cca").
temp = data_remover_cat(synth_cat_train.copy(), "x_2", 30)
n_kept = len(temp)
share_one_after = len(temp[temp["x_2"]==1])/n_kept
share_zero_after = len(temp[temp["x_2"]==0])/n_kept
print("Class 1: ", share_one_after, "\nClass 2: ", share_zero_after)

Class 1:  0.6571428571428571 
Class 2:  0.34285714285714286
Class 1:  0.7291666666666666 
Class 2:  0.2708333333333333
