In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew
from sklearn.decomposition import PCA
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from matplotlib import pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PowerTransformer

In [2]:
# Load the competition data; paths are relative to the notebook's directory.
train = pd.read_csv('../train.csv')
test = pd.read_csv('../test.csv')
# Untouched snapshot of the test frame: later cells drop columns (including
# 'row ID') from `test`, while the submission still needs the original IDs.
# Copying the parsed frame avoids reading and parsing the same CSV twice.
testOriginal = test.copy()

In [3]:
# Sanity check: (rows, columns) of the raw training frame as loaded.
train.shape

(181507, 272)

In [4]:
# The row identifier is not a predictor; remove it from the test features.
# (The IDs remain available in `testOriginal`.)
test.drop(columns=['row ID'], inplace=True)

<h3>SubArea Removal</h3>

In [5]:
# Remove the 'sub_area' column from both frames so they keep matching columns.
for frame in (test, train):
    frame.drop(columns=['sub_area'], inplace=True)

<h1>Categorical To Numerical</h1>

<h3>Label Encoding</h3>

In [6]:
# Columns with dtype `object` are treated as categorical and integer-encoded.
categorical_columns = train.select_dtypes(include=['object']).columns.tolist()
print("Train: Categorical columns:", categorical_columns)

categorical_columns_test = test.select_dtypes(include=['object']).columns.tolist()
print("Test: Categorical columns:", categorical_columns_test)

# Fit ONE encoder per column on the values of train and test together.
# Fitting separate encoders on each frame (as was done before) assigns codes
# from each frame's own sorted set of categories, so the same category can get
# different integers in train and test whenever the two sets differ -- the
# model would then see inconsistent features at prediction time.
shared_columns = [
    column for column in categorical_columns if column in categorical_columns_test
]
for column in shared_columns:
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([train[column], test[column]], ignore_index=True))
    train[column] = label_encoder.transform(train[column])
    test[column] = label_encoder.transform(test[column])

# Columns that are categorical in only one of the frames have no counterpart
# to share a mapping with; encode them on their own, as before.
for column in categorical_columns:
    if column not in shared_columns:
        train[column] = LabelEncoder().fit_transform(train[column])

for column in categorical_columns_test:
    if column not in shared_columns:
        test[column] = LabelEncoder().fit_transform(test[column])

Train: Categorical columns: ['product_type', 'culture_objects_top_25', 'thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion', 'water_1line', 'big_road1_1line', 'railroad_1line', 'ecology']
Test: Categorical columns: ['product_type', 'culture_objects_top_25', 'thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion', 'water_1line', 'big_road1_1line', 'railroad_1line', 'ecology']


<h1>Working</h1>

In [7]:
# Separate the target from the predictors.
target_column = 'price_doc'
# Double brackets keep the target as a one-column DataFrame.
y = train[[target_column]]
# Every column except the target is a candidate feature.
X = train.loc[:, train.columns != target_column]

# scaler = RobustScaler()
# X = scaler.fit_transform(X)
# test = scaler.transform(test)  

In [8]:
# (rows, columns) of the feature frame once 'price_doc' has been split off.
X.shape

(181507, 270)

In [9]:
# (rows, columns) of the test frame; compare the column count with X.shape.
test.shape

(77789, 270)

<h1>Forward Feature Selection</h1>

In [10]:
# Feature subset fed to the model.
# NOTE(review): presumably the result of a forward-selection run that is not
# included in this notebook -- confirm where this list came from.
# Defined once so train and test are always sliced with the exact same columns
# in the exact same order (previously the list was duplicated verbatim).
selected_features = [
    'full_sq', 'floor', 'build_count_monolith', 'industrial_km', 'trc_sqm_500',
    'mosque_count_500', 'leisure_count_500', 'office_sqm_1000',
    'cafe_count_1000_price_high', 'leisure_count_1000', 'power_transmission_line_km',
    'big_market_km', 'public_healthcare_km', 'workplaces_km',
]
X2 = X[selected_features]
test2 = test[selected_features]

<h1>Min-Max Scaling</h1>

In [11]:
# Learn the per-feature minimum and maximum from the training features only,
# then apply that same mapping to both the training and the test features.
scaler = MinMaxScaler().fit(X2)
X3 = scaler.transform(X2)
test3 = scaler.transform(test2)

<h1>Applying Model</h1>

In [12]:
# Shape of the scaled training matrix produced by the scaler.
X3.shape

(181507, 14)

In [13]:
# Shape of the scaled test matrix; compare the feature count with X3.shape.
test3.shape

(77789, 14)

In [14]:
# Fit an L1-regularised linear model on the scaled training features.
alpha_value = 0.5  # regularisation strength; adjust as needed
lasso_reg = Lasso(alpha=alpha_value, random_state=42).fit(X3, y)

# Report the fitted parameters.
for label, fitted_value in (
    ("Coefficients:", lasso_reg.coef_),
    ("Intercept:", lasso_reg.intercept_),
):
    print(label, fitted_value)

# Predict the target for the scaled test features.
y_pred_lasso = lasso_reg.predict(test3)

Coefficients: [12389088.23727123 10445971.73793267  6114900.59571621  9223722.98687602
  8738233.76238086  9935820.19913183 11265550.58418491  9872628.85559031
  9957047.39758478  9688872.07124209   233236.83319501 -1699143.27870131
  2318501.48502399  -633107.79863413]
Intercept: [4995388.48983339]


In [15]:
# Pair each original row ID with its prediction and write the submission CSV.
# `testOriginal` is used because 'row ID' was dropped from `test` earlier.
result_df_lasso = testOriginal[['row ID']].assign(price_doc=y_pred_lasso.flatten())
result_df_lasso.to_csv('Day_6_Abdullah_Maqsood_24448_Submission_1.csv', index=False)