# Section 2.4 Filter

In [1]:
import time
import numpy as np
import pandas as pd
from typing import *
import json

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
def get_root_dir():
    """Return the directory the notebook should run from.

    If the Google Drive mount point exists, the project's notebooks folder
    on Drive is returned; otherwise the current directory is used.
    """
    drive_mount = '/content/drive/MyDrive/'
    if not os.path.exists(drive_mount):
        return './'  # running locally
    return '/content/drive/MyDrive/Colab/4-AMEX/AMEX Project/notebooks'  # running in Colab

# Change the working directory from Python (the equivalent of `cd`); running `!cd` directly does not work
os.chdir(get_root_dir())

当特征太多时，模型效率会大幅度降低，因此面对树模型以及集成模型的时候，我们仍然需要特征筛选，优先带入有效特征进行训练。

本节使用相关系数进行特征筛选。

## Try with train_interact1

In [None]:
# Load the first interaction-feature file, then show its shape and first rows.
# NOTE(review): this path has no `train/` sub-folder, unlike the paths read by
# get_all_pearson() and filter_feature() — confirm this is the intended file.
train_interact1 = pd.read_parquet("../data/6-interaction/train_interact1.parquet", engine='pyarrow')
print(train_interact1.shape)
train_interact1.head()

(458913, 781)


Unnamed: 0,customer_ID,B_30&0&S_9,B_30&0&R_2,B_30&0&S_23,B_30&0&D_129,B_30&0&S_13,B_30&0&R_4,B_30&0&D_111,B_30&0&D_79,B_30&0&D_91,...,B_38&-1&D_72,B_38&-1&R_10,B_38&-1&D_107,B_38&-1&D_54,B_38&-1&D_96,B_38&-1&S_25,B_38&-1&D_145,B_38&-1&D_53,B_38&-1&D_125,B_38&-1&D_52
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.517637,0.0,1.755472,13.0,8738.0,0.0,-13.0,0.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.560871,0.0,1.762985,0.0,1885.0,0.0,-13.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.346566,0.0,1.751728,0.0,0.0,0.0,-13.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.260058,0.0,1.781779,13.0,5655.0,0.0,-13.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.559535,0.0,1.761758,13.0,0.0,0.0,-13.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Load the training labels; they are joined to the features on customer_ID in the next cell.
train_target = pd.read_csv("../data/1-original-data/train_labels.csv")
print(train_target.shape)
train_target.head(2)

In [None]:
# Attach the labels to the features. The guard makes the cell safe to re-run:
# merging a second time would produce target_x / target_y columns instead of target.
if 'target' not in train_interact1.columns:
    train_interact1 = train_interact1.merge(train_target, on='customer_ID')
print(train_interact1.shape)
train_interact1.head(2)

In [None]:
# Release the labels frame now that it has been merged into train_interact1.
del train_target

In [None]:
# Per-column count of missing values, showing only columns that have any.
# The counts are computed once and reused for the filter.
null_counts = train_interact1.isnull().sum()
null_counts[null_counts != 0]

压缩后的数据集有45万多行，说明训练集中有45万多个不同的客户。

由于我们现在使用的只是基础版的压缩后的数据，因此很多列会存在缺失值。不过终版压缩后的数据我们不允许出现缺失值。

In [None]:
# Sparsity of the feature matrix: the share of entries that are exactly zero.
# Only feature columns are counted; the ID and label columns are not features,
# and the string IDs would force a slow object-dtype conversion of the whole frame.
# Note that NaN entries count as non-zero.
feature_values = train_interact1.drop(columns=['customer_ID', 'target'], errors='ignore').to_numpy()
1 - np.count_nonzero(feature_values) / feature_values.size

## Loop through all train_interact dfs to get correlations with target for all interaction features

In [None]:
def get_pearson(df: pd.DataFrame, targets: pd.DataFrame) -> Tuple[List[str], List[float]]:
    '''
    Return the list of feature names and their absolute Pearson correlations with target.

    df: feature frame containing a `customer_ID` column; every other column is
        treated as a feature
    targets: label frame containing `customer_ID` and `target` columns

    Returns (features, corr): the feature names in column order and, for each
    one, the absolute value of its Pearson correlation with `target`. Missing
    values are replaced by 0 before the correlation is computed. A correlation
    can be NaN (for example when a column is constant).
    '''
    # join with targets (inner join on customer_ID)
    df = df.merge(targets, on='customer_ID')
    print(df.shape)
    # every column except the key and the label is a feature
    features = [col for col in df.columns if col not in ('customer_ID', 'target')]
    # fill the label once instead of once per feature
    target = df['target'].fillna(0)
    # one Series-to-Series correlation per feature; no 2x2 matrix is built
    corr = [abs(df[fea].fillna(0).corr(target)) for fea in features]
    return features, corr

In [None]:
def get_all_pearson(
    n_files: int = 16,
    data_dir: str = "../data/6-interaction/train",
    labels_path: str = "../data/1-original-data/train_labels.csv",
) -> pd.DataFrame:
    '''
    Get pearson for all interaction features

    n_files: number of train_interact{i}.parquet files to process (i = 1..n_files)
    data_dir: folder containing the train_interact{i}.parquet files
    labels_path: CSV with the training labels passed to get_pearson

    Returns a DataFrame with one row per feature and two columns:
    `feature` (name) and `correlation` (value returned by get_pearson).
    '''
    print("Calculating person correlation")
    all_features = []
    all_corrs = []
    # load targets
    targets = pd.read_csv(labels_path)
    # loop through train_interact dfs
    for i in range(1, n_files + 1):
        # load data
        path = f"{data_dir}/train_interact{i}.parquet"
        print(f"Processing {path}")
        df = pd.read_parquet(path, engine='pyarrow')
        # get corrs
        features, corrs = get_pearson(df, targets)
        all_features.extend(features)
        all_corrs.extend(corrs)
        # drop the frame before the next file is read so two are never held at once
        del df

    return pd.DataFrame({'feature': all_features, 'correlation': all_corrs})

In [None]:
# Compute the correlations and save them; the feature-selection section reloads this CSV.
result = get_all_pearson()
result.to_csv("../data/7-FeatureSelection/Part1/correlations.csv", index=False)

Calculating person correlation
Processing ../data/6-interaction/train/train_interact1.parquet
(458913, 392)
Processing ../data/6-interaction/train/train_interact2.parquet
(458913, 392)
Processing ../data/6-interaction/train/train_interact3.parquet
(458913, 392)
Processing ../data/6-interaction/train/train_interact4.parquet
(458913, 392)
Processing ../data/6-interaction/train/train_interact5.parquet
(458913, 392)
Processing ../data/6-interaction/train/train_interact6.parquet
(458913, 392)
Processing ../data/6-interaction/train/train_interact7.parquet
(458913, 392)
Processing ../data/6-interaction/train/train_interact8.parquet
(458913, 470)
Processing ../data/6-interaction/train/train_interact9.parquet
(458913, 392)
Processing ../data/6-interaction/train/train_interact10.parquet
(458913, 392)
Processing ../data/6-interaction/train/train_interact11.parquet
(458913, 392)
Processing ../data/6-interaction/train/train_interact12.parquet
(458913, 392)
Processing ../data/6-interaction/train/tra

Check correlation statistics

In [None]:
# Size and first rows of the correlation table (one row per feature).
print(result.shape)
result.head()

(6396, 2)


Unnamed: 0,feature,correlation
0,B_30&0&S_9,0.00524
1,B_30&0&R_2,0.298032
2,B_30&0&S_23,0.016373
3,B_30&0&D_129,0.241736
4,B_30&0&S_13,0.181882


In [None]:
# Count only the correlation column; formatting the whole isna().sum() Series
# would print a per-column table in the middle of the sentence.
n_missing = int(result['correlation'].isna().sum())
print(f"{n_missing} features have NA for correlation")

feature         0
correlation    57
dtype: int64 features have NA for correlation


In [None]:
# Summary statistics of the correlation column.
result.describe()

Unnamed: 0,correlation
count,6339.0
mean,0.097508
std,0.087867
min,3e-06
25%,0.024762
50%,0.074288
75%,0.148116
max,0.593863


In [None]:
# Keep the features whose correlation reaches the cut-off. The value is hard-coded;
# it equals the 75% quantile shown by corr_df.describe() further down in this notebook.
CORR_THRESHOLD = 0.144583
selected = result.loc[result['correlation'] >= CORR_THRESHOLD]
print(selected.shape)

(1633, 2)


In [None]:
# Summary statistics of the correlations that passed the threshold.
selected.describe()

Unnamed: 0,correlation
count,1589.0
mean,0.220429
std,0.067821
min,0.144583
25%,0.168977
50%,0.19838
75%,0.257598
max,0.593863


## Select top k interaction features

In [None]:
# Reload the saved correlation table so this section can run without recomputing it.
corr_df = pd.read_csv("../data/7-FeatureSelection/Part1/correlations.csv")
print(corr_df.shape)
corr_df.head(2)

(6396, 2)


Unnamed: 0,feature,correlation
0,B_30&0&S_9,0.00524
1,B_30&0&R_2,0.298032


In [None]:
# Summary statistics of the reloaded correlations.
corr_df.describe()

Unnamed: 0,correlation
count,6343.0
mean,0.09504
std,0.087567
min,3e-06
25%,0.022481
50%,0.070691
75%,0.144583
max,0.593863


In [None]:
# Rank the feature names from highest to lowest correlation. Rows with a missing
# correlation sort to the end (pandas' default, spelled out here).
ranked = corr_df.sort_values(by='correlation', ascending=False, na_position='last')
sorted_features = ranked['feature'].tolist()
print(len(sorted_features))
print(sorted_features[:5])

6396
['B_30&0&P_2', 'B_30&0&B_2', 'B_30&0&B_18', 'B_30&0&B_33', 'D_120&0&B_2']


In [None]:
# Persist the names of the top features
def save_top(k: int, sorted_features: List[str]):
    """Write the first `k` entries of `sorted_features` to top_{k}.json as a JSON list."""
    out_path = f'../data/7-FeatureSelection/Part1/top_{k}.json'
    top_names = sorted_features[:k]
    with open(out_path, 'w') as out_file:
        json.dump(top_names, out_file)

In [None]:
# Save the top-1500, top-1000 and top-500 feature-name lists.
for top_k in (1500, 1000, 500):
    save_top(top_k, sorted_features)

In [None]:
def filter_feature(k: int, data_type: str, sorted_features: List[str]):
    '''
    Keep only selected features in the data

    k: how many leading entries of `sorted_features` count as selected
    data_type: train or test
    sorted_features: feature names; the first k are kept

    Reads  ../data/6-interaction/{data_type}/{data_type}_interact{i}.parquet for i = 1..16
    and writes each filtered frame (customer_ID plus the selected columns found in that file) to
    ../data/7-FeatureSelection/Part1/{data_type}/{k}/{data_type}_interact{i}.parquet
    '''
    print("Performing feature selection")
    selected = set(sorted_features[:k])
    # loop through dfs with interaction features
    for i in range(1, 17):
        # load data
        read_path = f"../data/6-interaction/{data_type}/{data_type}_interact{str(i)}.parquet"
        print(f"Processing {read_path}")
        df = pd.read_parquet(read_path, engine='pyarrow')
        # Filter for selected features, keeping the file's own column order.
        # Iterating a set of strings gives an order that is not stable across
        # Python sessions, so the saved column order could differ between runs.
        to_keep = ['customer_ID'] + [
            col for col in df.columns if col in selected and col != 'customer_ID'
        ]
        df = df[to_keep]
        print(f"Shape after filtering: {df.shape}")
        # save data
        write_path = f"../data/7-FeatureSelection/Part1/{data_type}/{str(k)}/{data_type}_interact{str(i)}.parquet"
        # the per-k output folder may not exist yet
        os.makedirs(os.path.dirname(write_path), exist_ok=True)
        df.to_parquet(write_path, index=False)


In [None]:
# Write the train files reduced to the top 1500 features.
filter_feature(1500, 'train', sorted_features)

Performing feature selection
Processing ../data/6-interaction/train/train_interact1.parquet
Shape after filtering: (458913, 117)
Processing ../data/6-interaction/train/train_interact2.parquet
Shape after filtering: (458913, 99)
Processing ../data/6-interaction/train/train_interact3.parquet
Shape after filtering: (458913, 55)
Processing ../data/6-interaction/train/train_interact4.parquet
Shape after filtering: (458913, 130)
Processing ../data/6-interaction/train/train_interact5.parquet
Shape after filtering: (458913, 62)
Processing ../data/6-interaction/train/train_interact6.parquet
Shape after filtering: (458913, 75)
Processing ../data/6-interaction/train/train_interact7.parquet
Shape after filtering: (458913, 77)
Processing ../data/6-interaction/train/train_interact8.parquet
Shape after filtering: (458913, 96)
Processing ../data/6-interaction/train/train_interact9.parquet
Shape after filtering: (458913, 72)
Processing ../data/6-interaction/train/train_interact10.parquet
Shape after fi

In [None]:
# Write the test files reduced to the same top 1500 features.
filter_feature(1500, 'test', sorted_features)

Performing feature selection
Processing ../data/6-interaction/test/test_interact1.parquet
Shape after filtering: (924621, 117)
Processing ../data/6-interaction/test/test_interact2.parquet
Shape after filtering: (924621, 99)
Processing ../data/6-interaction/test/test_interact3.parquet
Shape after filtering: (924621, 55)
Processing ../data/6-interaction/test/test_interact4.parquet
Shape after filtering: (924621, 130)
Processing ../data/6-interaction/test/test_interact5.parquet
Shape after filtering: (924621, 62)
Processing ../data/6-interaction/test/test_interact6.parquet
Shape after filtering: (924621, 75)
Processing ../data/6-interaction/test/test_interact7.parquet
Shape after filtering: (924621, 77)
Processing ../data/6-interaction/test/test_interact8.parquet
Shape after filtering: (924621, 96)
Processing ../data/6-interaction/test/test_interact9.parquet
Shape after filtering: (924621, 72)
Processing ../data/6-interaction/test/test_interact10.parquet
Shape after filtering: (924621, 11

In [4]:
# combine the train dataframes
# Read the per-file outputs from the folder that filter_feature(TOP_K, 'train', ...)
# writes to: ../data/7-FeatureSelection/Part1/train/{k}/
TOP_K = 1500
train_dir = f"../data/7-FeatureSelection/Part1/train/{TOP_K}"
train_combined = pd.read_parquet(f"{train_dir}/train_interact1.parquet", engine='pyarrow')
for i in range(2, 17):
    print(i)
    df = pd.read_parquet(f"{train_dir}/train_interact{i}.parquet", engine='pyarrow')
    # join on the customer key so rows line up regardless of file order
    train_combined = train_combined.merge(df, on='customer_ID')
print(train_combined.shape)
# the combined file keeps its original location
train_combined.to_parquet("../data/7-FeatureSelection/Part1/train/train_interact_all.parquet", index=False)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
(458913, 1501)


In [5]:
# combine the test dataframes
# Read the per-file outputs from the folder that filter_feature(TOP_K, 'test', ...)
# writes to: ../data/7-FeatureSelection/Part1/test/{k}/
TOP_K = 1500
test_dir = f"../data/7-FeatureSelection/Part1/test/{TOP_K}"
test_combined = pd.read_parquet(f"{test_dir}/test_interact1.parquet", engine='pyarrow')
for i in range(2, 17):
    print(i)
    df = pd.read_parquet(f"{test_dir}/test_interact{i}.parquet", engine='pyarrow')
    # join on the customer key so rows line up regardless of file order
    test_combined = test_combined.merge(df, on='customer_ID')
print(test_combined.shape)
# the combined file keeps its original location
test_combined.to_parquet("../data/7-FeatureSelection/Part1/test/test_interact_all.parquet", index=False)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
(924621, 1501)
