In [1]:
import pandas as pd

# Load the engineered feature matrix and the training targets.
# NOTE(review): y_train uses the 'id' column as its index, but X_train is read
# without index_col, so any id column present in the features file would stay
# in X_train as a regular feature column — confirm the file has no id column.
X_train = pd.read_csv('features/X_train_features.csv')
y_train = pd.read_csv('public/y_train.csv', index_col='id')

In [4]:
import numpy as np
from sklearn.impute import KNNImputer

# Fill missing values with a distance-weighted k-nearest-neighbours imputer
# (10 neighbours). Only the training matrix is handled in this cell.
imputer = KNNImputer(n_neighbors=10, weights='distance')

# Fit on X_train and impute it in a single call; the result replaces X_train.
X_train = imputer.fit_transform(X_train)

# Sanity check: number of remaining NaNs, then the feature/target shapes.
print(np.isnan(X_train).sum())
print(X_train.shape, y_train.shape)

0
(5117, 2247) (5117, 2)


In [7]:
# Remove features with low variance
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0.1)  # keep only features whose training-set variance exceeds 0.1 (absolute, unscaled variance)
# y_train is passed along here; presumably it is ignored by this unsupervised selector — see the scikit-learn docs.
X_train = sel.fit_transform(X_train, y_train)
print(X_train.shape, y_train.shape)

(5117, 1813) (5117, 2)


In [8]:
### Drop highly correlated features
import pandas as pd
import numpy as np

# X_train is expected to be a 2-D ndarray of feature columns at this point
# (it is sliced positionally below).
df = pd.DataFrame(X_train)
correlation_matrix = df.corr()

# Keep only the strictly-lower triangle of |corr| so that entry (i, j) is
# considered only for j < i, i.e. each column is compared against the columns
# that precede it. This replaces a Python double loop over .iloc, which needs
# one scalar lookup per column pair and is very slow for ~2000 columns.
abs_corr_below_diag = np.tril(correlation_matrix.abs().to_numpy(), k=-1)

# A column is redundant when its absolute correlation with at least one
# earlier column is above 0.9. NaN correlations compare False, so they never
# mark a column as redundant.
is_redundant = (abs_corr_below_diag > 0.9).any(axis=1)

# Labels of the columns that get dropped
corr_columns = set(correlation_matrix.columns[is_redundant].tolist())

# Positional indices of the columns that are kept, in ascending order
relevant_features = np.flatnonzero(~is_redundant).tolist()

X_train = X_train[:, relevant_features]
# Report how many features remain and which positional indices were kept
print("Number of relevant features : ", X_train.shape[1])
print(relevant_features)


Number of relevant features :  685
[0, 1, 8, 9, 15, 17, 19, 20, 22, 23, 24, 25, 26, 28, 29, 30, 37, 40, 41, 44, 45, 47, 54, 57, 58, 59, 62, 64, 65, 72, 75, 76, 80, 82, 84, 86, 88, 89, 92, 93, 94, 97, 118, 120, 122, 123, 126, 130, 132, 135, 153, 160, 164, 167, 169, 175, 186, 187, 188, 191, 200, 203, 204, 206, 210, 213, 217, 223, 227, 234, 240, 244, 247, 257, 261, 268, 273, 274, 276, 278, 281, 285, 291, 294, 296, 298, 303, 304, 307, 309, 311, 315, 316, 317, 321, 322, 323, 326, 329, 330, 331, 332, 333, 334, 340, 341, 344, 345, 346, 347, 349, 354, 356, 357, 358, 359, 360, 361, 362, 364, 365, 366, 367, 368, 374, 375, 377, 378, 381, 387, 389, 390, 391, 392, 393, 396, 402, 404, 405, 410, 416, 424, 430, 432, 440, 444, 452, 456, 460, 468, 469, 470, 471, 472, 474, 475, 476, 478, 489, 490, 491, 492, 493, 495, 496, 510, 511, 512, 514, 515, 516, 518, 519, 520, 522, 523, 529, 530, 531, 533, 534, 537, 538, 541, 542, 549, 550, 551, 553, 559, 562, 570, 576, 577, 588, 590, 597, 598, 600, 609, 610, 612, 

In [15]:
# Reload the targets (indexed by 'id') and flatten them into a 1-D NumPy array
y_train = pd.read_csv('public/y_train.csv', index_col='id').to_numpy().ravel()

In [17]:
# Select the most relevant features
from sklearn.feature_selection import SelectKBest, f_regression

# Create the SelectKBest with the mutual info strategy
selector = SelectKBest(f_regression, k=300)

# Fit the object to the training data
selector.fit(X_train, y_train)

# Transform the data
X_train = selector.transform(X_train)
print(X_train.shape)

(5117, 300)


In [18]:
# Reload the targets as a DataFrame so that the 'id' index is available again
# (y_train was flattened to a plain array in an earlier cell).
y_df = pd.read_csv('public/y_train.csv', index_col='id')

In [20]:
# Re-attach the ids from y_df as the row index (this assumes the rows of
# X_train are still in the same order as public/y_train.csv) and write the
# selected features to disk.
# NOTE(review): this writes to the same path that the first cell reads from, so
# the input file is overwritten and a later Restart & Run All would start from
# the already-reduced features. The index is also written as a column, while
# the first cell reads the file without index_col, so on re-read it would come
# back as an ordinary column — confirm this is intended or use a separate path.
X_train_df = pd.DataFrame(X_train, index=y_df.index)
X_train_df.to_csv('features/X_train_features.csv')