In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset from a CSV file; the path is relative to the notebook's working directory.
df = pd.read_csv('../datasets/dataset_1.csv')

In [3]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['target']),  # features: every column except the label
    df['target'],                 # label column
    test_size=0.3,
    random_state=0,
)

# Show the shapes of both feature splits as the cell output.
X_train.shape, X_test.shape

((35000, 300), (15000, 300))

# Remove constant and quasi-constant features first 

In [4]:
# A feature is treated as (quasi-)constant when its single most frequent value
# covers more than this share of the training rows.
QUASI_CONSTANT_THRESHOLD = 0.998

# Names of the columns flagged as constant or quasi-constant.
const_features = []

for col in X_train.columns:
    # value_counts() ignores NaN and returns counts in descending order,
    # so no extra sort is needed to find the dominant value.
    counts = X_train[col].value_counts()

    if counts.empty:
        # The column holds no non-null value at all (e.g. it is entirely NaN).
        # It carries no information, so flag it instead of failing with an
        # IndexError when looking up the dominant value.
        const_features.append(col)
        continue

    # Share of all training rows (NaN rows included in the denominator)
    # taken by the most frequent non-null value.
    dominant_share = counts.max() / len(X_train)
    if dominant_share > QUASI_CONSTANT_THRESHOLD:
        const_features.append(col)

print(len(const_features))

142


In [5]:
# Remove the columns listed in const_features from both splits, in place,
# so train and test keep the same feature set.
for split in (X_train, X_test):
    split.drop(columns=const_features, inplace=True)

In [6]:
# Display the shapes of both splits after dropping the constant/quasi-constant columns.
X_train.shape, X_test.shape

((35000, 158), (15000, 158))

# Find duplicated features

In [7]:
# Maps each retained feature to the later columns that are exact copies of it
# (an empty list means no copy was found).
duplicated_feature_pair = {}
# Flat list of every column identified as a copy, in discovery order.
duplicated_feat = []

column_names = list(X_train.columns)

for position, feat_1 in enumerate(column_names):
    # A column already recorded as a copy of an earlier one is not re-examined.
    if feat_1 in duplicated_feat:
        continue

    reference = X_train[feat_1]
    # Only columns to the right are compared against the reference column.
    copies = [
        feat_2
        for feat_2 in column_names[position + 1:]
        if reference.equals(X_train[feat_2])
    ]

    duplicated_feature_pair[feat_1] = copies
    duplicated_feat.extend(copies)

In [8]:
# Display the mapping: each examined feature -> list of later columns identical to it
# (an empty list means no duplicate was found for that feature).
duplicated_feature_pair

{'var_4': [],
 'var_5': [],
 'var_8': [],
 'var_13': [],
 'var_15': [],
 'var_17': [],
 'var_18': [],
 'var_19': [],
 'var_21': [],
 'var_22': [],
 'var_25': [],
 'var_26': [],
 'var_27': [],
 'var_29': [],
 'var_30': [],
 'var_31': [],
 'var_35': [],
 'var_37': ['var_148'],
 'var_38': [],
 'var_41': [],
 'var_46': [],
 'var_47': [],
 'var_49': [],
 'var_50': [],
 'var_51': [],
 'var_52': [],
 'var_54': [],
 'var_55': [],
 'var_57': [],
 'var_58': [],
 'var_62': [],
 'var_63': [],
 'var_64': [],
 'var_68': [],
 'var_70': [],
 'var_74': [],
 'var_75': [],
 'var_76': [],
 'var_79': [],
 'var_82': [],
 'var_83': [],
 'var_84': ['var_199'],
 'var_85': [],
 'var_86': [],
 'var_88': [],
 'var_91': [],
 'var_93': [],
 'var_94': [],
 'var_96': [],
 'var_100': [],
 'var_101': [],
 'var_103': [],
 'var_105': [],
 'var_107': [],
 'var_108': [],
 'var_109': [],
 'var_110': [],
 'var_114': [],
 'var_117': [],
 'var_118': [],
 'var_119': [],
 'var_121': [],
 'var_123': [],
 'var_128': [],
 'var_131'

In [9]:
# Display the names of the columns identified as duplicates, as a NumPy array.
np.array(duplicated_feat)

array(['var_148', 'var_199', 'var_296', 'var_250', 'var_232', 'var_269'],
      dtype='<U7')

In [10]:
# Number of columns identified as duplicates.
len(duplicated_feat)

6

In [11]:
# Remove the columns listed in duplicated_feat from both splits, in place,
# so train and test keep the same feature set.
for split in (X_train, X_test):
    split.drop(columns=duplicated_feat, inplace=True)

In [12]:
# Display the shapes of both splits after dropping the duplicated columns.
X_train.shape, X_test.shape

((35000, 152), (15000, 152))