In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [4]:
# Load the raw CSV files.
# The first column of both training files is discarded (presumably a row id —
# confirm against the CSV headers); the test file keeps it so that it can be
# written back out with the predictions.
df_X_train, df_y_train = (
    pd.read_csv(path).iloc[:, 1:] for path in ('X_train.csv', 'y_train.csv')
)
df_X_test = pd.read_csv('X_test.csv')

# Work on plain ndarrays from here on.
X_train, y_train = df_X_train.to_numpy(), df_y_train.to_numpy()

# Split the test matrix into its first column (ids) and the feature columns.
test_matrix = df_X_test.to_numpy()
id_test, X_test = test_matrix[:, 0], test_matrix[:, 1:]

print(X_train.shape, y_train.shape, X_test.shape)

(1212, 832) (1212, 1) (776, 832)


In [5]:

# Impute missing values with a k-nearest-neighbours imputer
# (a per-column mean would be the simpler baseline).
from sklearn.impute import KNNImputer

# 10 neighbours, each weighted by the inverse of its distance.
# The imputer is fitted on the training rows only and then reused for the test rows.
imputer = KNNImputer(n_neighbors=10, weights='distance').fit(X_train)

# Fill the gaps in both matrices with the train-fitted imputer
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

# Sanity check: both NaN counts should be 0 and the shapes unchanged
print(np.isnan(X_train).sum(), np.isnan(X_test).sum())
print(X_train.shape, y_train.shape, X_test.shape)

0 0
(1212, 832) (1212, 1) (776, 832)


In [6]:
# Remove low-variance (near-constant) features
from sklearn.feature_selection import VarianceThreshold

# Features whose training-set variance is not above 0.1 are dropped.
# NOTE(review): this runs before any scaling, so the cut-off is in raw feature units.
sel = VarianceThreshold(threshold=0.1)
sel.fit(X_train)  # unsupervised: the selector only looks at X

# Apply the same column mask to train and test
X_train, X_test = sel.transform(X_train), sel.transform(X_test)
print(X_train.shape, y_train.shape, X_test.shape)

(1212, 666) (1212, 1) (776, 666)


In [7]:
### Drop highly correlated features
# pandas (pd) and numpy (np) are already imported in the notebook's first cell.

# Absolute Pearson correlation above which a column is considered redundant
CORR_THRESHOLD = 0.9

# X_train is expected to be a 2-D ndarray that only contains feature columns
df = pd.DataFrame(X_train)
correlation_matrix = df.corr()

# Column i is redundant when it correlates strongly with any EARLIER column j < i.
# Those pairs are exactly the strict lower triangle of the matrix, so one
# vectorised comparison replaces the nested per-element .iloc loop.
# (NaN correlations compare as False and therefore never flag a column.)
abs_corr = correlation_matrix.abs().to_numpy()
redundant = (np.tril(abs_corr, k=-1) > CORR_THRESHOLD).any(axis=1)

# Labels of the columns that get dropped
corr_columns = set(correlation_matrix.columns[redundant])

# Positional indices of the columns that are kept
relevant_features = np.flatnonzero(~redundant).tolist()

X_train = X_train[:, relevant_features]
X_test = X_test[:, relevant_features]
# Print the relevant feature indices
print("Number of relevant features : ", X_train.shape[1])
print(relevant_features)


Number of relevant features :  631
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 185, 186, 187, 188, 189, 190, 192, 193, 194, 195, 196, 197, 198, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222

In [8]:
# Select the most relevant features
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate selection: score every feature with f_regression and keep the 90 best
selector = SelectKBest(f_regression, k=90)

# Fit the selector on the training data.
# y_train is an (n, 1) column; scikit-learn expects a 1-D target here and emits a
# DataConversionWarning otherwise, so flatten it explicitly.
selector.fit(X_train, y_train.ravel())

# Keep the selected columns in both matrices
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)
print(X_train.shape, X_test.shape)

(1212, 90) (776, 90)


  y = column_or_1d(y, warn=True)


In [9]:
from sklearn.ensemble import IsolationForest

# Isolation forest with 500 trees and a fixed seed
clf = IsolationForest(random_state=42, n_estimators=500)

# Fit on the training rows, then label those same rows; -1 marks an outlier
outliers = clf.fit(X_train).predict(X_train)

# Keep only the rows that were not flagged, in X and y alike.
# NOTE: this overwrites X_train / y_train, so re-running the cell filters again.
mask = outliers != -1
X_train = X_train[mask, :]
y_train = y_train[mask]

print('Number of outliers removed: ', np.count_nonzero(outliers == -1))

Number of outliers removed:  65


In [10]:
# Min-max scale the features
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# The per-feature range is learned from the training rows only ...
scaler = MinMaxScaler().fit(X_train)

# ... and the same mapping is then applied to both train and test
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [11]:
# Hold out 20% of the training rows as a validation set.
# Note: despite its name, y_test holds the validation targets that pair with X_val.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Shapes of the train / validation features and targets after the split
tuple(part.shape for part in (X_train, y_train, X_val, y_test))

((917, 90), (917, 1), (230, 90), (230, 1))

In [13]:
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor

# Define the base models
# NOTE(review): 'et'/'et3', 'rf2'/'rf3' and 'dt'/'dt2' are configured identically
# (same hyper-parameters and random_state). Duplicates multiply the fit time
# without adding diversity to the stack — consider dropping them.
estimators = [
    ('rf', RandomForestRegressor(max_depth=None, min_samples_split=2, min_samples_leaf=1, n_estimators=500, n_jobs=-1, random_state=0, criterion='absolute_error')),
    ('et', ExtraTreesRegressor(max_depth=None, min_samples_split=2, min_samples_leaf=1, n_estimators=500, n_jobs=-1, random_state=42)),
    ('dt', DecisionTreeRegressor(max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=0)),
    ('et2', ExtraTreesRegressor(max_depth=None, min_samples_split=10, min_samples_leaf=9, n_estimators=300, n_jobs=-1, random_state=42)),
    ('rf2', RandomForestRegressor(max_depth=20, min_samples_split=2, min_samples_leaf=1, n_estimators=200, n_jobs=-1, random_state=0, criterion='absolute_error')),
    ('et3', ExtraTreesRegressor(max_depth=None, min_samples_split=2, min_samples_leaf=1, n_estimators=500, n_jobs=-1, random_state=42)),
    ('rf3', RandomForestRegressor(max_depth=20, min_samples_split=2, min_samples_leaf=1, n_estimators=200, n_jobs=-1, random_state=0, criterion='absolute_error')),
    ('dt2', DecisionTreeRegressor(max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=0)),
]

# Meta model: no final_estimator is passed, so StackingRegressor falls back to its
# library default (RidgeCV according to the scikit-learn documentation).

# Define the stacking regressor
reg = StackingRegressor(estimators=estimators, verbose=3, n_jobs=-1)

# Fit the model to the training data.
# y_train is an (n, 1) column; flatten it so scikit-learn receives the 1-D target
# it expects instead of emitting a DataConversionWarning.
reg.fit(X_train, y_train.ravel())

# Predict the test data
y_pred = reg.predict(X_test)

# Predictions on the validation split and on the training split
pred_test = reg.predict(X_val)
training_test = reg.predict(X_train)

# R2 on the training rows and on the held-out validation rows
# (y_test holds the validation targets that pair with X_val)
train_sc = r2_score(y_train, training_test)
val_sc = r2_score(y_test, pred_test)

# Show both scores; the validation score was previously computed but never displayed
train_sc, val_sc

  y = column_or_1d(y, warn=True)


KeyboardInterrupt: 

In [None]:
# from sklearn.linear_model import ElasticNet
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
# from sklearn.model_selection import GridSearchCV

# # Create the ElasticNet model
# # model = ElasticNet(alpha=5, l1_ratio=1, random_state=0)
# # model = DecisionTreeRegressor(max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=0)
# model = RandomForestRegressor(max_depth=15, 
#                               min_samples_split=2, 
#                               min_samples_leaf=1, 
#                               n_estimators=200, 
#                               n_jobs=-1, 
#                               random_state=0,
#                               criterion='absolute_error',
#                               )
# # model = ExtraTreesRegressor(max_depth=None, 
# #                             min_samples_split=10, 
# #                             min_samples_leaf=9, 
# #                             n_estimators=3000, 
# #                             n_jobs=-1, 
# #                             random_state=42)
# # Fit the model to the training data

# model.fit(X_train, y_train)

# # Predict the test data
# y_pred = model.predict(X_test)

# pred_test = model.predict(X_val)
# training_test = model.predict(X_train)

# train_sc = r2_score(y_train, training_test)
# val_sc = r2_score(y_test, pred_test)
# train_sc, val_sc

In [None]:
# Predict the test set with the fitted stack and write the result to "out_rg.csv"
y_out = reg.predict(X_test)

# Two columns side by side: the ids taken from the test file, then the predictions
output = np.column_stack((id_test, y_out.ravel()))
df_out = pd.DataFrame(output, columns=["id", "y"])

df_out.to_csv("out_rg.csv", index=False)