In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
import os
import torch
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from keras.optimizers import SGD, Adam, Adadelta, RMSprop, Adagrad, Nadam, Ftrl
from keras.regularizers import l2
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
import statsmodels.api as sm

In [50]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [51]:
# Load the training data
train_data = pd.read_csv('/content/gdrive/MyDrive/train.csv')
test_data = pd.read_csv('/content/gdrive/MyDrive/test.csv')

used = []

# Extract features and target variable
X = train_data.drop('price_doc', axis=1)
y = train_data['price_doc']
X_test = test_data.drop(['row ID'], axis=1)

In [52]:
X = X.drop('sub_area', axis=1)
X_test = X_test.drop('sub_area', axis=1)
used.append('Removed sub_area')

In [53]:
X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)
used.append('OneHot Encoding')

In [54]:
# from sklearn.calibration import LabelEncoder

# categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
# print("Train: Categorical columns:", categorical_columns)

# label_encoder = LabelEncoder()

# for column in categorical_columns:
#     X[column] = label_encoder.fit_transform(X[column])

# categorical_columns_test = X_test.select_dtypes(include=['object']).columns.tolist()
# print("Test: Categorical columns:", categorical_columns_test)

# label_encoder = LabelEncoder()

# for column in categorical_columns_test:
#     X_test[column] = label_encoder.fit_transform(X_test[column])

# used.append('Label Encoding')

In [55]:
# # drop all columns in X_train with dtypes object
# for col in X.columns:
#     if X[col].dtype == 'object':
#         X.drop(col, axis=1, inplace=True)

# # drop all columns in X_test with dtypes object
# for col in X_test.columns:
#     if X_test[col].dtype == 'object':
#         X_test.drop(col, axis=1, inplace=True)

# used.append("Removed Object Dtypes")

In [56]:
X = X.astype('float32')
X_test = X_test.astype('float32')
used.append("Converted All Columns To float32")


# Add a constant term to the feature matrix
X_with_const = sm.add_constant(X)

# Fit a linear regression model
model = sm.OLS(y, X_with_const).fit()

# Get p-values for each feature
p_values = model.pvalues[1:]  # Exclude the constant term

# Set your desired threshold for p-value
threshold = 0.00001

# Filter features based on p-value
selected_features = p_values[p_values < threshold].index

# Display selected features
print("Selected Features:")
print(selected_features)
print(len(selected_features))

# Select columns in the DataFrame
X = X[selected_features]
X_test = X_test[selected_features]

Selected Features:
Index(['full_sq', 'life_sq', 'floor', 'children_preschool',
       'preschool_education_centers_raion',
       'school_education_centers_top_20_raion', 'healthcare_centers_raion',
       'university_top_20_raion', 'male_f', '0_6_female',
       'build_count_monolith', 'raion_build_count_with_builddate_info',
       'build_count_1971-1995', 'build_count_after_1995', 'kindergarten_km',
       'green_zone_km', 'industrial_km', 'water_treatment_km', 'water_km',
       'mkad_km', 'sadovoe_km', 'big_road2_km', 'nuclear_reactor_km',
       'swim_pool_km', 'basketball_km', 'church_synagogue_km', 'catering_km',
       'green_part_500', 'prom_part_500', 'trc_sqm_500',
       'cafe_count_500_price_1000', 'cafe_count_500_price_1500',
       'cafe_count_500_price_4000', 'cafe_count_500_price_high',
       'mosque_count_500', 'leisure_count_500', 'market_count_500',
       'green_part_1000', 'prom_part_1000', 'office_count_1000',
       'office_sqm_1000', 'trc_count_1000', 'trc_sq

In [57]:
X = np.log1p(X)
X_test = np.log1p(X_test)
used.append('log Normalization')

In [58]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [59]:
# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
used.append("StandardScaler")

In [60]:
X_train.shape

(127054, 87)

In [61]:
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(2)#, interaction_only=True)
# X_train = poly.fit_transform(X_train)
# X_test = poly.fit_transform(X_test)
# used.append('PolynomialFeatures W/O Interaction')
# # used.append('PolynomialFeatures With Interaction')

In [62]:
# # save X_train to csv
# X_train.to_csv('train with poly w/o int.2.csv', index=False)
# X_test.to_csv('test with poly w/o int.csv', index=False)

In [63]:
# pca = PCA(n_components=200)
# principalComponents = pca.fit_transform(X_train)
# X_train = pd.DataFrame(data = principalComponents)

# pca2 = PCA(n_components=200)
# principalComponents = pca2.fit_transform(X_test)
# X_test = pd.DataFrame(data = principalComponents)

# used.append('PCA (n=200)')

<h1>With Keras<h1>

In [64]:
# features_used = X_train.shape[1]

# # Build the neural network
# os.environ["KERAS_BACKEND"] = "torch"
# used.append("Keras With Torch Backend")
# model = Sequential()
# model.add(Dense(360, input_dim=X_train.shape[1], activation='selu', kernel_regularizer=l2(0.01)))
# model.add(Dropout(0.3))  # You can use either dropout or early stopping
# model.add(Dense(280, activation='selu', kernel_regularizer=l2(0.01)))
# model.add(Dropout(0.3))
# model.add(Dense(190, activation='selu', kernel_regularizer=l2(0.01)))
# model.add(Dropout(0.3))
# model.add(Dense(130, activation='selu', kernel_regularizer=l2(0.01)))
# model.add(Dropout(0.3))
# model.add(Dense(1, activation='linear'))  # Output layer with linear activation for regression

# used.append("4 Hidden Layers: Hidden Layer 1 with 360 neurons selu activation L2 regularization, \n" +
#             "Hidden Layer 2 with 280 neurons selu activation L2 regularization, \n" +
#             "Hidden Layer 3 with 190 neurons selu activation L2 regularization, \n" +
#             "Hidden Layer 4 with 130 neurons selu activation L2 regularization")
# used.append("Output linear")

# # optimizerUsing = Adam(lr=0.05)#, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False)
# # used.append("Optimizer: Adam(lr=0.05)")
# # optimizerUsing = RMSprop(lr=0.001)
# # used.append("Optimizer: RMSprop(lr=0.001)")
# # optimizerUsing = Adagrad(lr=0.01)
# # used.append("Optimizer: Adagrad(lr=0.01)")
# # optimizerUsing = Adadelta(lr=1.0, rho=0.95)
# # used.append("Optimizer: Adadelta(lr=1.0, rho=0.95)")
# # optimizerUsing = Nadam(lr=0.002, beta_1=0.9, beta_2=0.999)
# # used.append("Optimizer: Nadam(lr=0.002, beta_1=0.9, beta_2=0.999)")
# optimizerUsing = Ftrl(learning_rate=0.05)#, learning_rate_power=-0.5, initial_accumulator_value=0.1, l1_regularization_strength=0.01, l2_regularization_strength=0.01)
# used.append("Optimizer: Ftrl(learning_rate=0.05)")

# # Compile the model
# model.compile(loss='mean_squared_error', optimizer=optimizerUsing)
# used.append("Loss Calculation: Mean Squared Error")

# # Define early stopping
# early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# used.append("EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)")

# # Train the model
# model.fit(X_train, y_train, epochs=100, batch_size=1800, validation_data=(X_val, y_val), callbacks=[early_stopping])
# used.append("batch_size=1800, epochs=100, early_stopping (patience 10), dropout=0.3, L2 Regularization")

# # Train the model
# # model.fit(X_train, y_train, epochs=100, batch_size=1800, validation_data=(X_val, y_val))#, callbacks=[early_stopping])
# # used.append("batch_size=1800, epochs=100, dropout=0.3, L2 Regularization")

# # used.append("batch_size=1800, epochs=100, early_stopping")

# # Make predictions
# predictions = model.predict(X_test)

<h1>With Skilearn<h1>

In [65]:
features_used = X_train.shape[1]

# # optimizerUsing = SGD(learning_rate=0.01, momentum=0.9)

# # optimizerUsing = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False)
# optimizerUsing = Adam(lr=0.001)#, epsilon=1e-07)

l2Regu = 0.001

# model = MLPRegressor(hidden_layer_sizes=(300,210,150,60), activation='relu', solver='adam', alpha = l2Regu, shuffle=False, max_iter=100,
#                      batch_size=1800, early_stopping=True, verbose=True, random_state=42)

# {'logistic', 'tanh', 'identity', 'relu'}. Got 'selu' instead.

model = MLPRegressor(hidden_layer_sizes=(250, 170, 100, 50), activation='relu', solver='adam', alpha = l2Regu, shuffle=False, max_iter=150,
                     batch_size=1500, early_stopping=True, verbose=True, random_state=42)

used.append("MLPRegressor")
used.append("4 Hidden Layers: Hidden Layer 1 with 300 neurons logistic activation L2 regularization, \n" +
            "Hidden Layer 2 with 210 neurons logistic activation L2 regularization, \n" +
            "Hidden Layer 3 with 150 neurons logistic activation L2 regularization, \n" +
            "Hidden Layer 4 with 60 neurons logistic activation L2 regularization")
used.append("Optimizer: adam")
used.append("batch_size=1800, epochs=100, early_stopping")

# Train the model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

Iteration 1, loss = 347248330119862.68750000
Validation score: -0.454508
Iteration 2, loss = 319485246335986.00000000
Validation score: -0.038569
Iteration 3, loss = 139884036273433.17187500
Validation score: 0.596278
Iteration 4, loss = 95204165461363.85937500
Validation score: 0.607305
Iteration 5, loss = 93559002815999.10937500
Validation score: 0.612642
Iteration 6, loss = 92510446486212.89062500
Validation score: 0.616388
Iteration 7, loss = 91728056533462.43750000
Validation score: 0.619351
Iteration 8, loss = 91088403565114.89062500
Validation score: 0.621807
Iteration 9, loss = 90547818285445.21875000
Validation score: 0.623919
Iteration 10, loss = 90077303925825.48437500
Validation score: 0.625747
Iteration 11, loss = 89667098184755.28125000
Validation score: 0.627351
Iteration 12, loss = 89300265150963.35937500
Validation score: 0.628793
Iteration 13, loss = 88968554635117.25000000
Validation score: 0.630087
Iteration 14, loss = 88667571303930.45312500
Validation score: 0.631

In [66]:
# Create a DataFrame with the results
submission_df = pd.DataFrame({'row ID': test_data['row ID'], 'price_doc': predictions.flatten()})

# Save the results to a CSV file
submission_df.to_csv('Day11.4.csv', index=False)

In [67]:
print("Features Used = " + str(features_used) + "\n")
for i in used:
    print(i)

Features Used = 87

Removed sub_area
OneHot Encoding
Converted All Columns To float32
log Normalization
StandardScaler
MLPRegressor
4 Hidden Layers: Hidden Layer 1 with 300 neurons logistic activation L2 regularization, 
Hidden Layer 2 with 210 neurons logistic activation L2 regularization, 
Hidden Layer 3 with 150 neurons logistic activation L2 regularization, 
Hidden Layer 4 with 60 neurons logistic activation L2 regularization
Optimizer: adam
batch_size=1800, epochs=100, early_stopping
