In [86]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.calibration import LabelEncoder


In [87]:
# Load the training and testing datasets.
#
# The feature subset is declared once and applied to both frames so the
# train and test matrices cannot drift apart when the list is edited.
DATA_DIR = '../../2nd-Comp-Data'
TARGET_COLUMN = 'price_doc'
SELECTED_FEATURES = [
    'full_sq',
    'floor',
    'build_count_monolith',
    'industrial_km',
    'trc_sqm_500',
    'mosque_count_500',
    'leisure_count_500',
    'office_sqm_1000',
    'cafe_count_1000_price_high',
    'leisure_count_1000',
    'power_transmission_line_km',
    'big_market_km',
    'public_healthcare_km',
    'workplaces_km',
    'shopping_centers_raion',
    'green_part_500',
]

# 'sub_area' is removed from the training frame, as before, so any other cell
# that inspects train_data sees the same columns it always did.
train_data = pd.read_csv(f'{DATA_DIR}/train.csv').drop(columns='sub_area')

# Parse the test file once. testOriginal keeps every column (including
# 'row ID', which is needed when the submission file is written).
testOriginal = pd.read_csv(f'{DATA_DIR}/test.csv')

# Separate features and target variable in the training data
X = train_data[SELECTED_FEATURES]
y = train_data[TARGET_COLUMN]

# Same columns, same order, for the frame that is fed to predict() later.
# Selecting by name discards 'row ID' and 'sub_area', so no explicit drop is needed.
test_data = testOriginal[SELECTED_FEATURES]

In [88]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [89]:
# NOTE(review): this import belongs in the notebook's first (imports) cell;
# it is kept here so this cell still runs on its own.
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Route columns by dtype: int64/float64 columns are scaled, object columns
# are one-hot encoded.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Create transformers for numerical and categorical features
numerical_transformer = RobustScaler()
# handle_unknown='ignore': a category that appears only in the validation or
# test rows is encoded as all zeros instead of raising during transform/predict
# (the default, 'error', would abort cross-validation or the final prediction).
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine transformers into a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Create a pipeline with preprocessor and Lasso Regression model.
# The step name 'regressor' is what the grid-search key 'regressor__alpha' refers to.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', Lasso(random_state=42)),
    ]
)

In [90]:
# Candidate regularisation strengths for the pipeline's 'regressor' (Lasso) step
alphas = [0.1, 0.3, 0.6, 0.7, 0.9]
param_grid = {'regressor__alpha': alphas}

# 5-fold cross-validated search scored on negative MSE (higher is better)
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# One row per candidate alpha
results_df = pd.DataFrame(grid_search.cv_results_)

# Report the cross-validated RMSE per alpha; the score is a negated MSE,
# so flip the sign before taking the square root.
for alpha_value, neg_mse in zip(
    results_df['param_regressor__alpha'],
    results_df['mean_test_score'],
):
    rmse = sqrt(-neg_mse)
    print(f'Alpha: {alpha_value}, RMSE: {rmse}')

# Winning alpha and the corresponding fitted pipeline
best_alpha = grid_search.best_params_['regressor__alpha']
print(f'Best Alpha Value: {best_alpha}')
best_model = grid_search.best_estimator_

# Score the winner on the held-out validation split.
# `rmse` is deliberately reassigned here: from this point on it holds the validation RMSE.
y_pred = best_model.predict(X_val)
rmse = sqrt(mean_squared_error(y_val, y_pred))
print(f'Root Mean Squared Error on validation set: {rmse}')

Alpha: 0.01, RMSE: 13717065.988925079
Alpha: 0.1, RMSE: 13717065.989138473
Alpha: 1, RMSE: 13717065.991200393
Alpha: 10, RMSE: 13717066.013597135
Alpha: 100, RMSE: 13717066.237570152
Best Alpha Value: 0.01
Root Mean Squared Error on validation set: 13774849.645387534


In [91]:
# Compare this run's validation RMSE with the hard-coded value noted from an
# earlier run. Despite the "Acc" names these are RMSE figures, so lower is better.
prevAcc = 13774849.645387534
currAcc = rmse

# A negative difference means the current RMSE is smaller than the previous one.
x = currAcc - prevAcc
verdict, detail = (
    ("Accuracy has improved", "Current: OneHot, RobustScaler, Alpha = " + str(best_alpha))
    if x < 0
    else ("Accuracy has not improved", "Previous: OneHot, RobustScaler, Alpha = 0.01")
)
print(verdict)
print(detail)

Accuracy has not improved
Previous: OneHot, MinMaxScaler


In [92]:
# Predict on the test features and write 'row ID' / 'price_doc' pairs to Day5.1.csv
test_predictions = best_model.predict(test_data)
result_df = pd.DataFrame(
    {
        'row ID': testOriginal['row ID'],
        'price_doc': test_predictions.flatten(),
    }
)
result_df.to_csv('Day5.1.csv', index=False)

# Short run summary: number of feature columns and the chosen model settings
print(f"Total Features : {X_val.shape[1]}")
print(f"Lasso Regression, Alpha = {best_alpha}, Random State = 42")

Total Features : 16
Lasso Regression, Alpha = 0.01, Random State = 42


Recorded result with OneHot encoding and StandardScaler: Alpha = 0.01, RMSE = 13717065.98890476.