### Duplicate Features

In [5]:
import pandas as pd
import random

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from feature_engine.selection import DropDuplicateFeatures

In [2]:
# Build a toy binary-classification dataset: 1000 rows and 10 columns, of
# which 6 are repeats of other columns (n_repeated=6). The fixed random_state
# makes the generated values reproducible.
feature_matrix, target = make_classification(
    n_samples=1000, n_features=10, n_repeated=6, n_classes=2, random_state=42,
)

# wrap the raw arrays in pandas containers so columns can be addressed by label
X = pd.DataFrame(feature_matrix)
y = pd.Series(target)

# preview the first rows; the repeated columns are visible by eye
X.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.114931,-0.426246,0.114931,0.124174,0.021285,0.021285,0.124174,0.124174,0.114931,0.114931
1,1.568755,1.574024,1.568755,-1.331536,-1.145407,-1.145407,-1.331536,-1.331536,1.568755,1.568755
2,0.351335,0.152907,0.351335,-0.216485,-0.217748,-0.217748,-0.216485,-0.216485,0.351335,0.351335
3,-1.42262,-1.23405,-1.42262,1.128338,1.001149,1.001149,1.128338,1.128338,-1.42262,-1.42262
4,-1.21712,1.80347,-1.21712,-0.205288,0.30111,0.30111,-0.205288,-0.205288,-1.21712,-1.21712


In [3]:
# hold out 30% of the rows as a test set; random_state fixes which rows are picked
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# (rows, columns) of each partition
X_train.shape, X_test.shape

((700, 10), (300, 10))

### DropDuplicateFeatures using Feature-engine

In [6]:
# selector built with its default settings
selector = DropDuplicateFeatures()

# fit() identifies the duplicated features, using the training set only
selector.fit(X_train)

In [7]:
# the features flagged for removal by the fitted selector
# (every member of a duplicate group except the one that is kept)
selector.features_to_drop_

{2, 5, 6, 7, 8, 9}

In [12]:
# groups of mutually identical features found during fit, one set per group
selector.duplicated_feature_sets_

[{0, 2, 8, 9}, {3, 6, 7}, {4, 5}]

In [13]:
# apply the fitted selector to both partitions so they lose the same columns
X_train_t, X_test_t = (selector.transform(part) for part in (X_train, X_test))

# (rows, columns) of each partition after the duplicates are removed
X_train_t.shape, X_test_t.shape

((700, 4), (300, 4))

### Pandas implementation of duplicate-feature removal

In [17]:
# maps each retained feature to the list of features that are identical to it
duplicated_groups = {}

# flat, ordered record of every feature already found to duplicate an earlier one
_duplicated_lst = []

# walk the columns left to right, treating each one in turn as the reference
for position, reference in enumerate(X_train.columns):

    # a feature already flagged as a duplicate never starts a group of its own
    if reference in _duplicated_lst:
        continue

    # only columns to the right of the reference still need to be compared;
    # Series.equals checks that the two columns hold the same values
    matches = [
        candidate
        for candidate in X_train.columns[position + 1:]
        if X_train[reference].equals(X_train[candidate])
    ]

    # register the group (possibly empty) under the reference feature ...
    duplicated_groups[reference] = matches

    # ... and remember its members so they are skipped as future references
    _duplicated_lst.extend(matches)
                

In [18]:
# how many features were found to be copies of an earlier feature
len(_duplicated_lst)

6

In [19]:
# the duplicated features, in the order the search loop discovered them
_duplicated_lst

[2, 8, 9, 6, 7, 5]

In [21]:
# retained feature -> features identical to it (an empty list means no duplicates were found)
duplicated_groups

{0: [2, 8, 9], 1: [], 3: [6, 7], 4: [5]}

In [22]:
# the dictionary keys are the features kept as group representatives, so
# selecting exactly those columns discards every duplicate
kept_features = duplicated_groups.keys()
X_train_t, X_test_t = X_train[kept_features], X_test[kept_features]

# (rows, columns) of each partition after the duplicates are removed
X_train_t.shape, X_test_t.shape

((700, 4), (300, 4))