# CorrelatedFeatures Example

In [5]:
import pandas as pd
import numpy as np
from raimitigations.dataprocessing import CorrelatedFeatures

# 1 - HR Dataset

In [6]:
# Folder holding the HR promotion dataset, relative to this notebook.
data_dir = '../../../datasets/hr_promotion'

# Read the training CSV and remove the 'employee_id' column in a single
# chained expression, so the frame is never mutated in place after creation.
df = pd.read_csv(data_dir + '/train.csv').drop(columns=['employee_id'])

# Last expression of the cell: rendered by the notebook as a table.
df

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49,0
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,50,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,73,0
...,...,...,...,...,...,...,...,...,...,...,...,...
54803,Technology,region_14,Bachelor's,m,sourcing,1,48,3.0,17,0,78,0
54804,Operations,region_27,Master's & above,f,other,1,37,2.0,6,0,56,0
54805,Analytics,region_1,Bachelor's,m,other,1,27,5.0,3,0,79,0
54806,Sales & Marketing,region_9,,m,sourcing,1,29,1.0,2,0,45,0


In [7]:
# Name of the target column; handed to fit() through label_col.
label_col = "is_promoted"

# Configure the correlation search for the HR data:
#   * method_num_num lists the three coefficients requested for the
#     num x num check,
#   * method_num_cat="model" selects the model-based check (the report
#     prints "Model metrics" for pairs such as department x age),
#   * save_json=False is passed as-is.
# NOTE(review): presumably save_json=False disables writing the JSON reports — confirm.
cor_feat = CorrelatedFeatures(
    method_num_num=["spearman", "pearson", "kendall"],
    method_num_cat="model",
    save_json=False,
)

# Run the analysis over the full frame.
cor_feat.fit(df=df, label_col=label_col)

# Bind the return value so the notebook does not echo it after the printed report.
_ = cor_feat.get_summary()

No correlations detected. Nothing to be done here.

CORRELATION SUMMARY


NOT CORRELATED VARIABLES SUMMARY

department x region:
	* Cramer's V = 0.1342 with a p-value of 0.0
department x education:
	* Cramer's V = 0.1243 with a p-value of 0.0
department x gender:
	* Cramer's V = 0.2866 with a p-value of 0.0
department x recruitment_channel:
	* Cramer's V = 0.0634 with a p-value of 0.0
department x no_of_trainings:
	Model metrics:
	precision = 0.12673642797753037
	recall = 0.11134860176670285
	f1 = 0.052750960416829404
	auc = 0.500139246590814
	accuracy = 0.30748646840600863
department x age:
	Model metrics:
	precision = 0.034138133754991994
	recall = 0.1111111111111111
	f1 = 0.052229200589284326
	auc = 0.5
	accuracy = 0.3072432037949279
department x previous_year_rating:
	Model metrics:
	precision = 0.0340435792888772
	recall = 0.1111111111111111
	f1 = 0.052118466314288105
	auc = 0.5
	accuracy = 0.30639221359989477
department x length_of_service:
	Model metrics:
	precision = 0.07955244

# 2 - Toy Dataset

In [8]:
from raimitigations.dataprocessing import create_dummy_dataset

# Build the synthetic frame used by every remaining example in this notebook.
# The displayed result has six num_* columns, a 'label' column, and two
# columns for each of the num_c*, CN_* and CC_* prefixes.
# NOTE(review): no seed is passed here, so the generated values may differ
# between runs — confirm whether create_dummy_dataset seeds internally.
df = create_dummy_dataset(
    samples=3000,
    n_features=6,
    n_num_num=2,
    n_cat_num=2,
    n_cat_cat=2,
    num_num_noise=[0.01, 0.05],
    pct_change=[0.05, 0.1],
)

# Target column of the synthetic frame; reused by the cells below.
label_col = "label"

# Last expression of the cell: rendered by the notebook as a table.
df

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,label,num_c0_num_0,num_c1_num_1,CN_0_num_0,CN_1_num_1,CC_0_num_0,CC_1_num_1
0,-1.525512,2.875543,2.257682,2.229220,1.594384,2.885251,1,-1.532230,2.862426,val0_1,val1_3,val0_2,val1_2
1,-2.569836,1.278600,1.688270,3.411097,-0.641013,3.952127,1,-2.580926,1.287197,val0_2,val1_2,val0_1,val1_1
2,-0.368307,2.262619,0.323905,4.295313,2.963440,2.924983,1,-0.352219,2.264692,val0_2,val1_2,val0_2,val1_2
3,-3.478053,3.089150,3.671529,1.992963,1.161017,3.281400,1,-3.462437,3.066536,val0_1,val1_3,val0_1,val1_2
4,1.100563,-0.712498,-3.326899,1.284548,1.673979,-5.369617,0,1.096297,-0.700658,val0_2,val1_1,val0_0,val1_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.028262,4.294016,1.284022,3.606359,4.288832,1.517827,1,0.096331,4.269560,val0_2,val1_3,val0_2,val1_3
2996,-1.736805,1.233513,1.480658,2.275031,0.723627,2.402492,1,-1.746102,1.226363,val0_1,val1_2,val0_2,val1_1
2997,-2.524129,0.929163,2.538396,0.126336,2.265812,1.555137,1,-2.523848,0.935757,val0_1,val1_2,val0_1,val1_1
2998,-1.763996,0.636608,2.183026,0.499517,2.560602,1.656918,1,-1.822879,0.637048,val0_1,val1_1,val0_2,val1_1


## Categorical x Numerical Correlation using the Model approach

In [9]:
# Model-based check for numerical x categorical pairs.
# NOTE(review): the exact meaning of each threshold is defined by
# CorrelatedFeatures — confirm against its documentation.
cor_feat = CorrelatedFeatures(
    # num x num settings
    method_num_num=["spearman", "pearson", "kendall"],
    num_corr_th=0.8,
    num_pvalue_th=0.01,
    # num x cat settings: "model" method, with f1 as the only listed metric
    method_num_cat="model",
    model_metrics=["f1"],
    metric_th=0.9,
    # cat x cat settings
    cat_corr_th=0.8,
    cat_pvalue_th=0.01,
    # tie handling option, passed as-is
    tie_method="missing",
    # JSON report paths for this run (prefix "1_")
    json_summary="./corr_json_examples/1_summary_model.json",
    json_corr="./corr_json_examples/1_corr_model.json",
    json_uncorr="./corr_json_examples/1_uncorr_model.json",
)

cor_feat.fit(df=df, label_col=label_col)

# get_summary() prints the report; its return value is bound to "_" and not used.
_ = cor_feat.get_summary()

# Last expression of the cell: displayed as a list of column names.
cor_feat.get_selected_features()


CORRELATION SUMMARY

1 - num_0 x num_c0_num_0:
	* spearman correlation = 0.9996753901861544 with a p-value of 0.0
	* pearson correlation = 0.9997597612105206 with a p-value of 0.0
	* kendall correlation = 0.9851981771701679 with a p-value of 0.0
2 - num_0 x CN_0_num_0:
	Model metrics:
	precision = 0.9390431872868645
	recall = 0.9166065542867073
	f1 = 0.9273329952270064
	auc = 0.9492795862856611
	accuracy = 0.9577777777777777
3 - num_1 x num_c1_num_1:
	* spearman correlation = 0.999818912868768 with a p-value of 0.0
	* pearson correlation = 0.9998418004390127 with a p-value of 0.0
	* kendall correlation = 0.9888496165388463 with a p-value of 0.0
4 - num_1 x CN_1_num_1:
	Model metrics:
	precision = 0.8901472472787961
	recall = 0.9182315949520383
	f1 = 0.9030630246432361
	auc = 0.9553143997815823
	accuracy = 0.9711111111111111
5 - num_1 x CC_1_num_1:
	Model metrics:
	precision = 0.9389101613965745
	recall = 0.9372035692679975
	f1 = 0.9377419833148879
	auc = 0.9649211710562328
	accuracy =

['num_2',
 'num_3',
 'num_4',
 'num_5',
 'num_c0_num_0',
 'CN_0_num_0',
 'CN_1_num_1',
 'CC_0_num_0',
 'CC_1_num_1']

## Categorical x Numerical Correlation using the Jensen approach

In [10]:
# Jensen-Shannon check for numerical x categorical pairs; the report lists
# one "jensen <value> x <value>" line per pair of category values.
# NOTE(review): the exact meaning of jensen_n_bins / jensen_th is defined by
# CorrelatedFeatures — confirm against its documentation.
cor_feat = CorrelatedFeatures(
    # num x num settings: only kendall is requested here
    method_num_num=["kendall"],
    num_corr_th=0.8,
    num_pvalue_th=0.01,
    # num x cat settings: "jensen" method
    method_num_cat="jensen",
    jensen_n_bins=200,
    jensen_th=0.9,
    # tie handling option, passed as-is
    tie_method="var",
    # JSON report paths for this run (prefix "2_")
    json_summary="./corr_json_examples/2_summary_jensen.json",
    json_corr="./corr_json_examples/2_corr_jensen.json",
    json_uncorr="./corr_json_examples/2_uncorr_jensen.json",
)

cor_feat.fit(df=df, label_col=label_col)

# get_summary() prints the report; its return value is bound to "_" and not used.
_ = cor_feat.get_summary()

# Last expression of the cell: displayed as a list of column names.
cor_feat.get_selected_features()


CORRELATION SUMMARY

1 - num_0 x num_c0_num_0:
	* kendall correlation = 0.9851981771701679 with a p-value of 0.0
2 - num_0 x CN_0_num_0:
	Jensen-Shannon results:
	jensen val0_1 x val0_2 = 0.9752599874257479
	jensen val0_1 x val0_3 = 0.9308744294656628
	jensen val0_1 x val0_0 = 0.9094893510593968
	jensen val0_2 x val0_3 = 0.9655643222237329
	jensen val0_2 x val0_0 = 0.9581523012900924
	jensen val0_3 x val0_0 = 0.9663876399222759
3 - num_1 x num_c1_num_1:
	* kendall correlation = 0.9888496165388463 with a p-value of 0.0
4 - num_1 x CC_1_num_1:
	Jensen-Shannon results:
	jensen val1_2 x val1_1 = 0.9929982619772626
	jensen val1_2 x val1_0 = 0.9581764256258682
	jensen val1_2 x val1_3 = 0.9805283754190949
	jensen val1_1 x val1_0 = 0.9474890759069279
	jensen val1_1 x val1_3 = 0.9851998562127596
	jensen val1_0 x val1_3 = 0.982841358269058
5 - num_c1_num_1 x CC_1_num_1:
	Jensen-Shannon results:
	jensen val1_2 x val1_1 = 0.983292600101525
	jensen val1_2 x val1_0 = 0.9523441810386128
	jensen val1

['num_2',
 'num_3',
 'num_4',
 'num_5',
 'num_c0_num_0',
 'num_c1_num_1',
 'CN_0_num_0',
 'CN_1_num_1',
 'CC_0_num_0']

## Categorical x Numerical Correlation using the ANOVA approach

In [11]:
# ANOVA check for numerical x categorical pairs; the report prints a Levene
# test p-value and a "Valid Anova" flag for each such pair.
# NOTE(review): the exact meaning of levene_pvalue / anova_pvalue / omega_th
# is defined by CorrelatedFeatures — confirm against its documentation.
cor_feat = CorrelatedFeatures(
    # num x num settings: only kendall is requested here
    method_num_num=["kendall"],
    num_corr_th=0.8,
    num_pvalue_th=0.01,
    # num x cat settings: "anova" method
    method_num_cat="anova",
    levene_pvalue=0.01,
    anova_pvalue=0.05,
    omega_th=0.75,
    # tie handling option, passed as-is
    tie_method="cardinality",
    # JSON report paths for this run (prefix "3_")
    json_summary="./corr_json_examples/3_summary_anova.json",
    json_corr="./corr_json_examples/3_corr_anova.json",
    json_uncorr="./corr_json_examples/3_uncorr_anova.json",
)

cor_feat.fit(df=df, label_col=label_col)

# get_summary() prints the report; its return value is bound to "_" and not used.
_ = cor_feat.get_summary()

# Last expression of the cell: displayed as a list of column names.
cor_feat.get_selected_features()


CORRELATION SUMMARY

1 - num_0 x num_c0_num_0:
	* kendall correlation = 0.9851981771701679 with a p-value of 0.0
2 - num_1 x num_c1_num_1:
	* kendall correlation = 0.9888496165388463 with a p-value of 0.0

NOT CORRELATED VARIABLES SUMMARY

num_0 x num_1:
	* kendall correlation = 0.06940180060020007 with a p-value of 1.2078434311716896e-08
num_0 x num_2:
	* kendall correlation = -0.5817730354562632 with a p-value of 0.0
num_0 x num_3:
	* kendall correlation = 0.09549583194398133 with a p-value of 4.462291324627264e-15
num_0 x num_4:
	* kendall correlation = -0.06240702456374347 with a p-value of 2.9869115112107524e-07
num_0 x num_5:
	* kendall correlation = -0.07602889852172946 with a p-value of 4.298897913602809e-10
num_0 x num_c1_num_1:
	* kendall correlation = 0.06918572857619205 with a p-value of 1.3400845076967727e-08
num_0 x CN_0_num_0:
	ANOVA results:
	P-Value for the Levene's Test of Homoscedasticity = 1.1298738572415836e-08
	Valid Anova (Homoscedasticity data)?: False
	ANOVA F

['num_0',
 'num_1',
 'num_2',
 'num_3',
 'num_4',
 'num_5',
 'CN_0_num_0',
 'CN_1_num_1',
 'CC_0_num_0',
 'CC_1_num_1']

## Check for correlations only between categorical variables

In [12]:
# Both method_num_num and method_num_cat are set to None; the resulting
# report only contains Cramer's V entries for categorical x categorical pairs.
cor_feat = CorrelatedFeatures(
    method_num_num=None,
    method_num_cat=None,
    # JSON report paths for this run (prefix "4_")
    json_summary="./corr_json_examples/4_summary_cat.json",
    json_corr="./corr_json_examples/4_corr_cat.json",
    json_uncorr="./corr_json_examples/4_uncorr_cat.json",
)

cor_feat.fit(df=df, label_col=label_col)

# get_summary() prints the report; its return value is bound to "_" and not used.
_ = cor_feat.get_summary()

# Last expression of the cell: displayed as a list of column names.
cor_feat.get_selected_features()

No correlations detected. Nothing to be done here.

CORRELATION SUMMARY


NOT CORRELATED VARIABLES SUMMARY

CN_0_num_0 x CN_1_num_1:
	* Cramer's V = 0.0873 with a p-value of 0.0
CN_0_num_0 x CC_0_num_0:
	* Cramer's V = 0.6074 with a p-value of 0.0
CN_0_num_0 x CC_1_num_1:
	* Cramer's V = 0.0853 with a p-value of 0.0
CN_1_num_1 x CC_0_num_0:
	* Cramer's V = 0.0862 with a p-value of 0.0
CN_1_num_1 x CC_1_num_1:
	* Cramer's V = 0.6001 with a p-value of 0.0
CC_0_num_0 x CC_1_num_1:
	* Cramer's V = 0.0928 with a p-value of 0.0


['num_0',
 'num_1',
 'num_2',
 'num_3',
 'num_4',
 'num_5',
 'num_c0_num_0',
 'num_c1_num_1',
 'CN_0_num_0',
 'CN_1_num_1',
 'CC_0_num_0',
 'CC_1_num_1']