In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
import os
import torch
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from keras.optimizers import SGD, Adam, Adadelta, RMSprop, Adagrad, Nadam, Ftrl
from keras.regularizers import l2
import numpy as np
from sklearn.decomposition import PCA

In [2]:
# Running log of the preprocessing/modelling choices made in this notebook
used = []

# Read the raw train and test tables
train_data = pd.read_csv('../../2nd-Comp-Data/train.csv')
test_data = pd.read_csv('../../2nd-Comp-Data/test.csv')

# Target column, and the feature frame that remains once it is removed
y = train_data['price_doc']
X = train_data.drop(columns='price_doc')

# The test table carries an identifier column instead of a target; drop it from the features
X_test = test_data.drop(columns=['row ID'])

In [3]:
# Remove the 'sub_area' column from both feature frames and note it in the run log
X, X_test = (frame.drop(columns='sub_area') for frame in (X, X_test))
used.append('Removed sub_area')

In [4]:
# X = pd.get_dummies(X)
# X_test = pd.get_dummies(X_test) 
# used.append('OneHot Encoding')

In [5]:
# LabelEncoder's public home is sklearn.preprocessing (sklearn.calibration only exposes it
# because that module happens to import it internally).
from sklearn.preprocessing import LabelEncoder

# Object-dtype columns are the ones that need an integer encoding
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
print("Train: Categorical columns:", categorical_columns)

categorical_columns_test = X_test.select_dtypes(include=['object']).columns.tolist()
print("Test: Categorical columns:", categorical_columns_test)

# Encode train and test with ONE encoder per column, fitted on the values of both frames.
# Fitting a separate encoder on each frame can assign different integers to the same
# category whenever the two frames do not contain exactly the same set of categories,
# which would silently corrupt the test-set features.
for column in categorical_columns:
    label_encoder = LabelEncoder()
    if column in categorical_columns_test:
        label_encoder.fit(pd.concat([X[column], X_test[column]], ignore_index=True))
        X_test[column] = label_encoder.transform(X_test[column])
    else:
        # Column is categorical only in the training frame
        label_encoder.fit(X[column])
    X[column] = label_encoder.transform(X[column])

# Columns that are categorical only in the test frame keep the original per-frame encoding
for column in categorical_columns_test:
    if column not in categorical_columns:
        label_encoder = LabelEncoder()
        X_test[column] = label_encoder.fit_transform(X_test[column])

used.append('Label Encoding')

Train: Categorical columns: ['product_type', 'culture_objects_top_25', 'thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion', 'water_1line', 'big_road1_1line', 'railroad_1line', 'ecology']
Test: Categorical columns: ['product_type', 'culture_objects_top_25', 'thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion', 'water_1line', 'big_road1_1line', 'railroad_1line', 'ecology']


In [6]:
# # drop all columns in X_train with dtypes object
# for col in X.columns:
#     if X[col].dtype == 'object':
#         X.drop(col, axis=1, inplace=True)

# # drop all columns in X_test with dtypes object
# for col in X_test.columns:
#     if X_test[col].dtype == 'object':
#         X_test.drop(col, axis=1, inplace=True)

# used.append("Removed Object Dtypes")

In [7]:
from sklearn.feature_selection import VarianceThreshold

# Keep only features whose variance on the training features exceeds this cut-off
threshold_value = 4_000_000

# Learn the surviving columns from the training features, then apply the same column
# mask to both the training and the test features
variance_filter = VarianceThreshold(threshold=threshold_value).fit(X)
X, X_test = (variance_filter.transform(features) for features in (X, X_test))

used.append("Variance Based Feature Selection")

In [8]:
# Cast both feature matrices to 32-bit floats and record the step
X, X_test = X.astype('float32'), X_test.astype('float32')
used.append("Converted All Columns To float32")

In [9]:
# Apply log(1 + x) element-wise to both feature matrices
# NOTE(review): log1p yields NaN for values below -1 -- confirm no feature goes that low
X, X_test = (np.log1p(features) for features in (X, X_test))
used.append('log Normalization')

In [10]:
# Hold out 30% of the rows for validation; the fixed random_state keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Learn the scaling statistics from the training split only, then apply that same
# scaling to the training, validation and test features
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (scaler.transform(split) for split in (X_train, X_val, X_test))
used.append("StandardScaler")

In [12]:
# Display the dimensions of the scaled training matrix
X_train.shape

(127054, 41)

In [13]:
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(2)#, interaction_only=True)
# X_train = poly.fit_transform(X_train)
# X_test = poly.fit_transform(X_test)
# used.append('PolynomialFeatures W/O Interaction')
# # used.append('PolynomialFeatures With Interaction')

In [14]:
# # save X_train to csv
# X_train.to_csv('train with poly w/o int.2.csv', index=False)
# X_test.to_csv('test with poly w/o int.csv', index=False)

In [15]:
# pca = PCA(n_components=200)
# principalComponents = pca.fit_transform(X_train)
# X_train = pd.DataFrame(data = principalComponents)

# pca2 = PCA(n_components=200)
# principalComponents = pca2.fit_transform(X_test)
# X_test = pd.DataFrame(data = principalComponents)

# used.append('PCA (n=200)')

<h1>With Keras</h1>

In [16]:
import keras  # local import: used only to ask Keras which backend it actually loaded

# Number of input features; printed in the summary at the end of the notebook
features_used = X_train.shape[1]

# Record the backend Keras is really running on.
# Keras chooses its backend from KERAS_BACKEND once, when the package is first imported.
# The keras imports at the top of the notebook have already run by the time this cell
# executes, so assigning os.environ["KERAS_BACKEND"] here could not switch the backend and
# the log entry could be wrong. To train on torch, set KERAS_BACKEND=torch before the first
# keras import (for example before starting the kernel).
active_backend = keras.backend.backend()
used.append(f"Keras With {active_backend.capitalize()} Backend")

# Build the neural network: four SELU hidden layers, each L2-regularised and followed by
# dropout, feeding a single linear output unit for regression
model = Sequential()
model.add(Dense(360, input_dim=X_train.shape[1], activation='selu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))  # You can use either dropout or early stopping
model.add(Dense(280, activation='selu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(190, activation='selu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(130, activation='selu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='linear'))  # Output layer with linear activation for regression

used.append("4 Hidden Layers: Hidden Layer 1 with 360 neurons selu activation L2 regularization, \n" +
            "Hidden Layer 2 with 280 neurons selu activation L2 regularization, \n" +
            "Hidden Layer 3 with 190 neurons selu activation L2 regularization, \n" +
            "Hidden Layer 4 with 130 neurons selu activation L2 regularization")
used.append("Output linear")

# Optimizer alternatives tried earlier (kept for reference)
# optimizerUsing = Adam(lr=0.05)#, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False)
# used.append("Optimizer: Adam(lr=0.05)")
# optimizerUsing = RMSprop(lr=0.001)
# used.append("Optimizer: RMSprop(lr=0.001)")
# optimizerUsing = Adagrad(lr=0.01)
# used.append("Optimizer: Adagrad(lr=0.01)")
# optimizerUsing = Adadelta(lr=1.0, rho=0.95)
# used.append("Optimizer: Adadelta(lr=1.0, rho=0.95)")
# optimizerUsing = Nadam(lr=0.002, beta_1=0.9, beta_2=0.999)
# used.append("Optimizer: Nadam(lr=0.002, beta_1=0.9, beta_2=0.999)")
optimizerUsing = Ftrl(learning_rate=0.4)#, learning_rate_power=-0.5, initial_accumulator_value=0.1, l1_regularization_strength=0.01, l2_regularization_strength=0.01)
used.append("Optimizer: Ftrl(learning_rate=0.4)")

# Compile the model
model.compile(loss='mean_squared_error', optimizer=optimizerUsing)
used.append("Loss Calculation: Mean Squared Error")

# Stop once validation loss has not improved for 10 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
used.append("EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)")

# Train the model, validating on the held-out split after every epoch
model.fit(X_train, y_train, epochs=100, batch_size=1800, validation_data=(X_val, y_val), callbacks=[early_stopping])
used.append("batch_size=1800, epochs=100, early_stopping (patience 10), dropout=0.3, L2 Regularization")

# Alternative run without early stopping (kept for reference)
# model.fit(X_train, y_train, epochs=100, batch_size=1800, validation_data=(X_val, y_val))#, callbacks=[early_stopping])
# used.append("batch_size=1800, epochs=100, dropout=0.3, L2 Regularization")

# used.append("batch_size=1800, epochs=100, early_stopping")

# Make predictions for the test features
predictions = model.predict(X_test)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


<h1>With scikit-learn</h1>

In [17]:
# features_used = X_train.shape[1]

# # # optimizerUsing = SGD(learning_rate=0.01, momentum=0.9)

# # # optimizerUsing = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False)
# # optimizerUsing = Adam(lr=0.001)#, epsilon=1e-07)

# l2Regu = 0.001

# # model = MLPRegressor(hidden_layer_sizes=(300,210,150,60), activation='relu', solver='adam', alpha = l2Regu, shuffle=False, max_iter=100,
# #                      batch_size=1800, early_stopping=True, verbose=True, random_state=42)

# # {'logistic', 'tanh', 'identity', 'relu'}. Got 'selu' instead.

# model = MLPRegressor(hidden_layer_sizes=(300,210,150,60), activation='relu', solver='adam', alpha = l2Regu, shuffle=False, max_iter=100,
#                      batch_size=1800, early_stopping=True, verbose=True, random_state=42)

# used.append("MLPRegressor")
# used.append("4 Hidden Layers: Hidden Layer 1 with 300 neurons relu activation L2 regularization, \n" +
#             "Hidden Layer 2 with 210 neurons relu activation L2 regularization, \n" +
#             "Hidden Layer 3 with 150 neurons relu activation L2 regularization, \n" +
#             "Hidden Layer 4 with 60 neurons relu activation L2 regularization")
# used.append("Optimizer: adam")
# used.append("batch_size=1800, epochs=100, early_stopping")

# # Train the model
# model.fit(X_train, y_train)

# # Make predictions
# predictions = model.predict(X_test)

In [18]:
# Pair every test-set row ID with its predicted price
# (presumably predictions holds one value per test row -- flatten() makes it 1-D)
submission_df = test_data[['row ID']].assign(price_doc=predictions.flatten())

# Write the submission file without the DataFrame index
submission_df.to_csv('Day12.2.csv', index=False)

In [19]:
# Report the input feature count, then every step recorded in the run log, one per line
print(f"Features Used = {features_used}\n")
for step in used:
    print(step)

Features Used = 41

Removed sub_area
Label Encoding
Variance Based Feature Selection
Converted All Columns To float32
log Normalization
StandardScaler
Keras With Torch Backend
4 Hidden Layers: Hidden Layer 1 with 360 neurons selu activation L2 regularization, 
Hidden Layer 2 with 280 neurons selu activation L2 regularization, 
Hidden Layer 3 with 190 neurons selu activation L2 regularization, 
Hidden Layer 4 with 130 neurons selu activation L2 regularization
Output linear
Optimizer: Ftrl(learning_rate=0.4)
Loss Calculation: Mean Squared Error
EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
batch_size=1800, epochs=100, early_stopping (patience 10), dropout=0.3, L2 Regularization
