# Demonstration of auto feature selection based on correlation and collinearity

## 1. Setup Environment

In [1]:
# Convert jupyter notebook into full screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Reset all variables and objects in notebook
%reset -f 

  from IPython.core.display import display, HTML


## 2. Import libraries

In [2]:
import pandas as pd
import copy
import json

## 3. Load the dataset into a DataFrame

In [3]:
# Load the raw dataset; "dataset.csv" is resolved relative to the current working directory
all_df = pd.read_csv("dataset.csv")

In [4]:
# Preview the first rows to sanity-check the loaded columns and values
all_df.head()

Unnamed: 0,Cost,Weight,Weight1,Length,Height,Width,Color
0,242.0,23.2,25.4,30.0,11.52,4.02,Green
1,290.0,24.0,26.3,31.2,12.48,4.3056,Red
2,340.0,23.9,26.5,31.1,12.3778,4.6961,Yellow
3,363.0,26.3,29.0,33.5,12.73,4.4555,Green
4,430.0,26.5,29.0,34.25,12.444,5.134,Red


In [5]:
# Encode the colour labels as floats so the column can take part in the
# correlation matrix. The result is assigned back to the frame: calling
# replace(..., inplace=True) on a selected column is a chained in-place
# operation that does not reliably update the parent DataFrame (it is a no-op
# under pandas Copy-on-Write). astype(float) makes the dtype explicit and
# raises if a label outside the mapping is still present.
color_codes = {'Green': 0.0, 'Red': 1.0, 'Yellow': 2.0}
all_df['Color'] = all_df['Color'].replace(color_codes).astype(float)

# Force the measurement columns to numeric; entries that cannot be parsed become NaN
numeric_columns = ["Weight", "Weight1", "Length", "Width"]
all_df[numeric_columns] = all_df[numeric_columns].apply(pd.to_numeric, errors='coerce')

In [6]:
# Summarise column dtypes and non-null counts to confirm every column is numeric after the conversion
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Cost     159 non-null    float64
 1   Weight   157 non-null    float64
 2   Weight1  156 non-null    float64
 3   Length   155 non-null    float64
 4   Height   158 non-null    float64
 5   Width    157 non-null    float64
 6   Color    159 non-null    float64
dtypes: float64(7)
memory usage: 8.8 KB


## 4. Find correlation between different features

In [7]:
# Pairwise correlation matrix of every column in the dataset
corr = all_df.corr()
print("Please find below correlation between different features", corr, sep="\n")

Please find below correlation between different features
             Cost    Weight   Weight1    Length    Height     Width     Color
Cost     1.000000  0.915702  0.920967  0.925433  0.724575  0.268487 -0.036169
Weight   0.915702  1.000000  0.999534  0.992512  0.627608  0.238791 -0.028960
Weight1  0.920967  0.999534  1.000000  0.994593  0.645978  0.237202 -0.033957
Length   0.925433  0.992512  0.994593  1.000000  0.709255  0.226983 -0.032967
Height   0.724575  0.627608  0.645978  0.709255  1.000000  0.012421 -0.039934
Width    0.268487  0.238791  0.237202  0.226983  0.012421  1.000000  0.000622
Color   -0.036169 -0.028960 -0.033957 -0.032967 -0.039934  0.000622  1.000000


## 5. Find best features by using correlation and collinearity

In [8]:
def findBestFeatues(data, target_feature, collinear_threshold=0.80, corr_threshold=None):
    '''
    Select features by their correlation with a target and drop collinear ones.

    The selection runs in two stages:
    1. If corr_threshold is given, every feature whose absolute correlation
       with the target is below it is discarded.
    2. The remaining features are compared pairwise. When the absolute
       correlation of a pair is >= collinear_threshold, the feature with the
       weaker absolute correlation with the target is discarded.

    Input parameter:
    data : DataFrame : Data on which to perform feature selection. All columns must be usable by DataFrame.corr().
    target_feature : String : Name of the target column.
    collinear_threshold : float : Value between 0 and 1. If the absolute correlation between two features is greater than or equal to this value they are considered collinear.
    corr_threshold : float or None : Value between 0 and 1. If the absolute correlation between a feature and the target is smaller than this value the feature is discarded. None disables this filter.

    Return:
    reduced_features : dict : Discarded features. Feature name is the key, the explanation why it was discarded is the value.
    corr_with_target : dict : Selected features. Feature name is the key, its (signed) correlation with the target is the value. The value is None when the correlation is undefined (e.g. a constant column).

    Raises:
    ValueError : if target_feature is not a column of the correlation matrix.
    '''

    # Round-trip through JSON to obtain a plain nested dict {column: {row: value}}.
    # to_json rounds values to 10 decimals and serialises NaN as null (-> None).
    corr_obj = json.loads(data.corr().to_json(orient='columns'))

    if target_feature not in corr_obj:
        raise ValueError("Target feature {0!r} is not a column of the correlation matrix".format(target_feature))

    def strength(value):
        # Absolute correlation; an undefined (None) correlation counts as 0
        # so that constant columns do not crash the comparisons below.
        return 0.0 if value is None else abs(value)

    explaination_collinear = "Correlation between feature {0} and {1} is {2}. As collinear threshold is {3} hence they are collinear. As correlation of feature {0} and target variable {4} is {5} higher than the correlation between feature {1} and target variable {4} is {6}. Hence we can drop feture {1}"
    # 0 : feature1, 1 : feature2 (drop), 2: correlation between feature1 and feature2,  3 : threshold, 4 : target variable, 5 : corr between feature1 and target variable, 6 : corr between feature2 and target variable

    explaination_corr = "Correlation threshold is {0} and correlation between feature {1} and target variable {2} is {3}"
    # 0 : Correlation threshold, 1 : feature, 2 : target feature,  3 : Correlation between feature and target feature

    target_corr = corr_obj[target_feature]
    reduced_features = dict()
    # Candidate features in column order; the target itself is never a candidate
    available_features = [feature for feature in corr_obj if feature != target_feature]

    # Stage 1: remove features correlated with the target less than the threshold
    if corr_threshold is not None:
        for feature in list(available_features):  # iterate over a copy, we remove while looping
            corr_with_tgt = strength(target_corr[feature])
            if corr_with_tgt < corr_threshold:
                reduced_features[feature] = explaination_corr.format(corr_threshold, feature, target_feature, corr_with_tgt)
                available_features.remove(feature)

    # Stage 2: remove collinear features, always keeping the one that is more
    # strongly (in absolute value) correlated with the target.
    i = 0
    while i < len(available_features):  # length is re-read, so removals can never leave a stale bound
        feature1 = available_features[i]
        strength1 = strength(target_corr[feature1])
        weaker = []      # collinear features that feature1 beats: (name, pair corr, strength)
        stronger = None  # last collinear feature that beats feature1, if any

        for feature2 in available_features[i + 1:]:
            corr_of_features = strength(corr_obj[feature1][feature2])
            if corr_of_features < collinear_threshold:
                continue
            strength2 = strength(target_corr[feature2])
            if strength1 >= strength2:
                weaker.append((feature2, corr_of_features, strength2))
            else:
                stronger = (feature2, corr_of_features, strength2)

        if stronger is not None:
            # feature1 loses: drop only feature1. Features it would have beaten
            # stay in the pool, because a discarded feature must not eliminate others.
            feature2, corr_of_features, strength2 = stronger
            reduced_features[feature1] = explaination_collinear.format(feature2, feature1, corr_of_features, collinear_threshold, target_feature, strength2, strength1)
            del available_features[i]
            # do not advance i: the next candidate has moved into position i
        else:
            for feature2, corr_of_features, strength2 in weaker:
                reduced_features[feature2] = explaination_collinear.format(feature1, feature2, corr_of_features, collinear_threshold, target_feature, strength1, strength2)
                available_features.remove(feature2)
            i = i + 1

    # Report the surviving features with their signed correlation to the target
    corr_with_target = dict()
    for feature in available_features:
        corr_with_target[feature] = target_corr[feature]

    return reduced_features, corr_with_target

In [9]:
# Predict "Cost": keep features whose correlation with it is at least 0.70,
# and treat feature pairs correlated at 0.80 or more as collinear.
target_feature = "Cost"
explained_corr, remaing_corr = findBestFeatues(
    data=all_df,
    target_feature=target_feature,
    collinear_threshold=0.80,
    corr_threshold=0.70,
)

In [10]:
# List every discarded feature together with the reason it was discarded
print("Please find below feature which we can drop with explanation\n")
for dropped_name, reason in explained_corr.items():
    report_line = "{0}  :  {1} \n".format(dropped_name, reason)
    print(report_line)

Please find below feature which we can drop with explanation

Width  :  Correlation threshold is 0.7 and correlation between feature Width and target variable Cost is 0.2684874245 

Color  :  Correlation threshold is 0.7 and correlation between feature Color and target variable Cost is 0.0361686379 

Weight  :  Correlation between feature Length and Weight is 0.9925119742. As collinear threshold is 0.8 hence they are collinear. As correlation of feature Length and target variable Cost is 0.9254328957 higher than the correlation between feature Weight and target variable Cost is 0.9157022523. Hence we can drop feture Weight 

Weight1  :  Correlation between feature Length and Weight1 is 0.9945926311. As collinear threshold is 0.8 hence they are collinear. As correlation of feature Length and target variable Cost is 0.9254328957 higher than the correlation between feature Weight1 and target variable Cost is 0.9209665712. Hence we can drop feture Weight1 



In [11]:
# Names only: iterating the dict yields its keys, i.e. the discarded features
print("Please find below feature which we can drop : ")
for dropped_name in explained_corr:
    print(dropped_name)

Please find below feature which we can drop : 
Width
Color
Weight
Weight1


In [12]:
# Show the surviving features with their correlation to the target
headline = "Below features have the best correlation with target variable {0} and selected after keeping correlation threshold 0.70 and collinear threshold 0.80".format(target_feature)
print(headline)
for kept_name, kept_corr in remaing_corr.items():
    print("{0}  :  {1} ".format(kept_name, kept_corr))

Below features have the best correlation with target variable Cost and selected after keeping correlation threshold 0.70 and collinear threshold 0.80
Length  :  0.9254328957 
Height  :  0.7245751661 


## 6. Cleanup

In [13]:
# Reset all variables and objects in the notebook namespace now that the demo is done;
# the -f flag skips the interactive confirmation prompt.
%reset -f 