In [None]:
import pandas as pd

# Targets: one row per sample, indexed by the CSV's 'id' column.
y_train = pd.read_csv(filepath_or_buffer='y_train.csv', index_col='id')

# Features: every column of the file is loaded as a feature (no index column is set).
X_train = pd.read_csv(filepath_or_buffer='features/X_train_features.csv')

In [None]:
import numpy as np
from sklearn.impute import KNNImputer

# Fill missing feature values with a k-nearest-neighbours imputer: each gap is
# estimated from the 10 nearest rows, weighted by inverse distance.
# (A plain per-column mean would be the basic alternative; it is not used here.)
imputer = KNNImputer(n_neighbors=10, weights='distance')

# Fit on the training features and impute them in one step. The fitted
# `imputer` stays available for transforming other data later on.
X_train = imputer.fit_transform(X_train)

# Sanity check: the number of remaining NaNs should be 0.
n_missing = np.isnan(X_train).sum()
print(n_missing)
print(X_train.shape, y_train.shape)

In [None]:
# Drop near-constant features.
from sklearn.feature_selection import VarianceThreshold

# Keep only the columns whose training-set variance exceeds 0.1.
# Variance depends on each feature's scale, so this is an absolute cutoff.
sel = VarianceThreshold(threshold=0.1)

# The selector is unsupervised (the target is ignored), so only X is passed.
X_train = sel.fit_transform(X_train)
print(X_train.shape, y_train.shape)

In [None]:
### Drop highly correlated features
import pandas as pd
import numpy as np

# Absolute Pearson correlation above which a feature is considered redundant.
CORRELATION_THRESHOLD = 0.9

# X_train is expected to be a 2-D ndarray holding only feature columns, so the
# DataFrame's column labels (0..n-1) coincide with the column positions.
df = pd.DataFrame(X_train)
correlation_matrix = df.corr()

# Only the strict lower triangle is inspected (pairs with j < i). A column is
# flagged when it is highly correlated with ANY earlier column, including
# earlier columns that are themselves flagged. NaN correlations compare as
# False and therefore never flag a column.
abs_corr = correlation_matrix.abs().to_numpy()
is_redundant = (np.tril(abs_corr, k=-1) > CORRELATION_THRESHOLD).any(axis=1)

# Labels of the dropped columns, and positional indices of the kept ones.
corr_columns = set(correlation_matrix.columns[is_redundant])
relevant_features = np.flatnonzero(~is_redundant).tolist()

X_train = X_train[:, relevant_features]

# Report how many features survive and which column positions they had.
print("Number of relevant features : ", X_train.shape[1])
print(relevant_features)


In [None]:
# Flatten the target into a 1-D NumPy array.
# np.asarray accepts both the DataFrame loaded from CSV and an array that has
# already been converted, so re-running this cell is harmless
# (DataFrame.to_numpy() would raise AttributeError on an ndarray).
y_train = np.asarray(y_train).ravel()

In [None]:
# Select the most relevant features
from sklearn.feature_selection import SelectKBest, f_regression

# Upper bound on the number of features to keep. The earlier filters may leave
# fewer columns than this, so clamp k to what is actually available; asking
# for more than n_features raises ValueError on older scikit-learn releases
# and emits a warning on newer ones.
MAX_SELECTED_FEATURES = 300
n_selected = min(MAX_SELECTED_FEATURES, X_train.shape[1])

# Rank features with the univariate F-test for regression (f_regression),
# not mutual information, and keep the top-scoring n_selected of them.
selector = SelectKBest(f_regression, k=n_selected)

# Fit the scorer on the training data and keep only the selected columns.
X_train = selector.fit_transform(X_train, y_train)
print(X_train.shape)

In [None]:
# Wrap the 1-D target array in a single-column frame named 'target' and save it.
# NOTE(review): the frame gets a fresh 0..n-1 index, and that is what is
# written as the 'id' column; the ids loaded from the original y_train.csv are
# not carried over. Confirm this is intended.
y_df = pd.DataFrame({'target': y_train})
y_df.to_csv('public/y_train.csv', index_label='id')

In [None]:
# Persist the selected features, reusing y_df's index so rows line up with the saved targets.
# NOTE(review): this writes to the same path the notebook loads its features
# from, so the input is overwritten and a second run would start from the
# already-processed file. The index is also written as an unnamed first
# column, which would presumably be read back as an extra feature because the
# load step sets no index_col. Confirm whether a separate output path is wanted.
features_out = 'features/X_train_features.csv'
X_train_df = pd.DataFrame(data=X_train, index=y_df.index)
X_train_df.to_csv(features_out)