In [15]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.decomposition import PCA

## Load datasets

##### artificial_train_x

In [19]:
# Space-separated file with no header row; only the first 500 columns are kept.
# NOTE(review): presumably a trailing separator yields an extra empty column - confirm.
artificial_train_x = pd.read_csv(
    "datasets/artificial_train.data",
    sep=" ",
    header=None,
).iloc[:, :500]
artificial_train_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,485,477,537,479,452,471,491,476,475,473,...,477,481,477,485,511,485,481,479,475,496
1,483,458,460,487,587,475,526,479,485,469,...,463,478,487,338,513,486,483,492,510,517
2,487,542,499,468,448,471,442,478,480,477,...,487,481,492,650,506,501,480,489,499,498
3,480,491,510,485,495,472,417,474,502,476,...,491,480,474,572,454,469,475,482,494,461
4,484,502,528,489,466,481,402,478,487,468,...,488,479,452,435,486,508,481,504,495,511


##### artificial_train_y

In [20]:
# The labels are taken from the first column of the space-separated,
# header-less labels file.
artificial_train_y = pd.read_csv(
    "datasets/artificial_train.labels",
    sep=" ",
    header=None,
).iloc[:, 0]
artificial_train_y.head()

0   -1
1   -1
2   -1
3    1
4    1
Name: 0, dtype: int64

##### artificial_valid_x

In [21]:
# Validation features: same layout as the training file (space-separated,
# no header row), truncated to the first 500 columns.
artificial_valid_x = pd.read_csv(
    "datasets/artificial_valid.data",
    sep=" ",
    header=None,
).iloc[:, :500]
artificial_valid_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,483,454,513,495,523,469,453,477,506,479,...,455,480,543,259,413,520,485,498,523,510
1,485,508,493,487,478,472,504,476,479,475,...,486,480,535,534,514,452,484,495,548,477
2,483,521,507,475,493,486,421,475,496,483,...,491,476,498,495,508,528,486,465,508,503
3,474,504,576,480,553,483,524,478,483,483,...,521,475,470,463,509,525,479,467,552,517
4,495,474,523,479,495,488,485,476,497,478,...,510,471,522,343,509,520,475,493,506,491


##### sms_train

In [22]:
# df_tmp holds the whole training table; the following cell reads its
# "label" column, so this cell has to run first.
df_tmp = pd.read_csv("datasets/sms_train.csv")
sms_train_x = df_tmp["message"]
sms_train_x.head()

0      I dont. Can you send it to me. Plus how's mode.
1    Or i go home first lar ü wait 4 me lor.. I put...
2                             Me, i dont know again oh
3                            I'll see, but prolly yeah
4    Night has ended for another day, morning has c...
Name: message, dtype: object

In [23]:
# Read the labels straight from the training file instead of from the shared
# `df_tmp` scratch frame: `df_tmp` is rebound to the test table in a later
# cell, so re-running this cell afterwards would silently pick up the wrong
# rows. Reading the file here keeps the cell idempotent.
sms_train_y = pd.read_csv("datasets/sms_train.csv").loc[:, "label"]
sms_train_y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

##### sms_test

In [24]:
# NOTE: this rebinds df_tmp, which held the training table until now; any
# cell that reads df_tmp after this point sees the test table.
df_tmp = pd.read_csv("datasets/sms_test.csv")
sms_test_x = df_tmp["message"]
sms_test_x.head()

0                Yo, you at jp and hungry like a mofo?
1    It's é only $140 ard...É rest all ard $180 at ...
2     &lt;#&gt; , that's all? Guess that's easy enough
3    Y?WHERE U AT DOGBREATH? ITS JUST SOUNDING LIKE...
4    Good afternoon sexy buns! How goes the job sea...
Name: message, dtype: object

## Feature selection - artificial

In [28]:
# Feature-selection helpers copied from the DEV project.

def variance_threshold(X_train, t):
    """Keep the columns selected by a ``VarianceThreshold`` fitted on ``X_train``.

    Args:
        X_train: pandas DataFrame of training features.
        t: Value passed as ``threshold`` to ``VarianceThreshold``.

    Returns:
        A DataFrame containing only the columns flagged by the fitted
        selector's support mask; the original column labels are kept.
    """
    print("variance_threshold")

    # Only the fitted support mask is needed, so fit() is enough; the previous
    # fit_transform() call built a transformed array that was thrown away.
    sel = VarianceThreshold(threshold=t)
    sel.fit(X_train)

    return X_train.loc[:, sel.get_support()]


def mean_absolute_deviance(X_train, t):
    """Keep the columns whose mean absolute deviation is strictly above ``t``.

    The mean absolute deviation of a column is the sum of the absolute
    differences between its values and the column mean, divided by the
    number of rows.

    Args:
        X_train: pandas DataFrame of training features.
        t: Cut-off; columns with a deviation of ``t`` or less are dropped.

    Returns:
        A DataFrame restricted to the surviving columns.
    """
    print("mean_absolute_deviance")

    column_means = X_train.mean(axis=0)
    abs_deviation = (X_train - column_means).abs()
    mad = abs_deviation.sum(axis=0) / X_train.shape[0]
    keep = mad > t
    return X_train.loc[:, keep]


def high_correlation(X_train, t):
    """Drop columns that are highly correlated with an earlier column.

    For every pair of columns the absolute correlation from ``DataFrame.corr``
    is inspected; when it is strictly greater than ``t``, the column that
    appears later in ``X_train`` is removed.

    Args:
        X_train: pandas DataFrame of training features. It is not modified.
        t: Absolute-correlation cut-off.

    Returns:
        A new DataFrame without the dropped columns.
    """
    print("high_correlation")

    corr_matrix = X_train.corr().abs()
    # Keep only the strict upper triangle so each pair is looked at once and
    # the diagonal (self-correlation) is ignored. The builtin ``bool`` is used
    # because the ``np.bool`` alias was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    to_drop = [column for column in upper.columns if (upper[column] > t).any()]
    # Return a new frame instead of dropping in place, so the caller's
    # DataFrame is left untouched.
    return X_train.drop(columns=to_drop)


def information_gain(X_train, y_train, t, random_state=None):
    """Keep the columns whose mutual information with the target is high.

    Args:
        X_train: pandas DataFrame of training features.
        y_train: Target labels aligned with ``X_train``.
        t: Quantile in ``[0, 1]``; columns whose estimated mutual information
            is strictly above the ``t``-quantile of all scores are kept.
        random_state: Forwarded to ``mutual_info_classif``. The estimator is
            randomised, so pass a fixed seed to get the same selection on
            every run. The default ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        A DataFrame restricted to the surviving columns.
    """
    print("information_gain")

    importance = mutual_info_classif(X_train, y_train, random_state=random_state)
    return X_train.loc[:, importance > np.quantile(importance, t)]


def fisher_score(X_train, y_train, t):
    """Keep the ``t`` best columns as ranked by ``SelectKBest``.

    Note: despite the function name, the scoring function handed to
    ``SelectKBest`` is ``chi2``, not a Fisher score.

    Args:
        X_train: pandas DataFrame of training features.
        y_train: Target labels aligned with ``X_train``.
        t: Value passed as ``k`` to ``SelectKBest``.

    Returns:
        A DataFrame restricted to the selected columns.
    """
    print("fisher_score")

    selector = SelectKBest(score_func=chi2, k=t).fit(X_train, y_train)
    keep_mask = selector.get_support()
    return X_train.loc[:, keep_mask]


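# NOTE: forward_feature_selection is disabled (commented out) below, yet
# feature_selection_funnel still refers to it by name. Re-enabling it also
# requires XGBClassifier to be imported.
# def forward_feature_selection(X_train, y_train, t):
#     print("forward_features_selection")
    
#     model = XGBClassifier()
#     sfs = SequentialFeatureSelector(model, n_features_to_select=t)
#     sfs.fit(X_train, y_train)
#     return X_train.loc[:, sfs.get_support()]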

def pca(X_train, y_train, X_test, y_test, t):
    """Project the train and test features onto principal components.

    The PCA is fitted on ``X_train`` only and then applied to both sets.

    Args:
        X_train: pandas DataFrame of training features.
        y_train: Forwarded to ``PCA.fit`` alongside ``X_train``.
        X_test: pandas DataFrame of test features; it is first restricted to
            the columns of ``X_train``, in the same order.
        y_test: Not used by this function.
        t: Value passed as ``n_components`` to ``PCA``.

    Returns:
        A ``(train, test)`` tuple of new DataFrames built from the transformed
        data; the input column labels are not carried over.
    """
    print("pca")

    # Feed the test set through the same columns the model was fitted on.
    test_aligned = X_test.loc[:, X_train.columns]

    reducer = PCA(n_components=t)
    reducer.fit(X_train, y_train)

    train_components = reducer.transform(X_train)
    test_components = reducer.transform(test_aligned)
    return pd.DataFrame(train_components), pd.DataFrame(test_components)

In [29]:
def feature_selection_funnel(settings, X_train, X_test, y_train, y_test):
    """Run the feature-selection steps in sequence, then fit and score a model.

    The training features pass through the variance, mean-absolute-deviation,
    correlation, information-gain and chi-squared filters, then optionally
    through forward selection, and finally through PCA. An ``XGBClassifier``
    is fitted on the result and its train and test accuracy are printed.

    Args:
        settings: Mapping from step name to that step's parameter. Keys read:
            ``"variance_threshold"``, ``"mean_absolute_deviance"``,
            ``"high_correlation"``, ``"information_gain"``,
            ``"fisher_score"``, ``"pca"`` and, optionally,
            ``"forward_feature_selection"``.
        X_train: pandas DataFrame of training features.
        X_test: pandas DataFrame of test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A ``(X_train_t, X_test_t)`` tuple with the PCA-transformed train and
        test features.

    NOTE(review): ``XGBClassifier`` and ``accuracy_score`` are not imported in
    the notebook's visible import cell - confirm they are in scope before
    calling this function.
    """
    # Feature selection
    print(f"Features initial: {X_train.shape[1]}\n")

    ## Methods to remove unnecessary features
    X_train_t = variance_threshold(X_train, settings["variance_threshold"])
    print(f"Features left: {X_train_t.shape[1]}\n")

    X_train_t = mean_absolute_deviance(X_train_t, settings["mean_absolute_deviance"])
    print(f"Features left: {X_train_t.shape[1]}\n")

    X_train_t = high_correlation(X_train_t, settings["high_correlation"])
    print(f"Features left: {X_train_t.shape[1]}\n")

    ## Methods to take important features
    X_train_t = information_gain(X_train_t, y_train, settings["information_gain"])
    print(f"Features left: {X_train_t.shape[1]}\n")

    X_train_t = fisher_score(X_train_t, y_train, settings["fisher_score"])
    print(f"Features left: {X_train_t.shape[1]}\n")

    # The forward-selection helper is commented out in this notebook, so an
    # unconditional call raises NameError. Run the step only when the helper
    # is actually defined and a setting for it was supplied.
    forward_select = globals().get("forward_feature_selection")
    forward_setting = settings.get("forward_feature_selection")
    if forward_select is not None and forward_setting is not None:
        X_train_t = forward_select(X_train_t, y_train, forward_setting)
        print(f"Features left: {X_train_t.shape[1]}\n")
    else:
        print("forward_feature_selection skipped\n")

    ## Methods to create new features
    X_train_t, X_test_t = pca(X_train_t, y_train, X_test, y_test, settings["pca"])
    print(f"Features left: {X_train_t.shape[1]}\n")

    # Evaluation
    model = XGBClassifier()
    model.fit(X_train_t, y_train)

    y_train_pred = model.predict(X_train_t)
    print(f"Accuracy train: {accuracy_score(y_train, y_train_pred)}")

    y_test_pred = model.predict(X_test_t)
    print(f"Accuracy test: {accuracy_score(y_test, y_test_pred)}")
    return X_train_t, X_test_t