In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew
from sklearn.decomposition import PCA
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from matplotlib import pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PowerTransformer

In [2]:
train = pd.read_csv('../train.csv')
test = pd.read_csv('../test.csv')
testOriginal = pd.read_csv('../test.csv')

In [3]:
train.shape

(181507, 272)

In [4]:
test.drop('row ID', axis=1, inplace=True)

<h3>SubArea Removal<h3>

In [5]:
test.drop('sub_area', axis=1, inplace=True)
train.drop('sub_area', axis=1, inplace=True)

<h1>Categorical To Numerical<h1>

<h3>Label<h3>

In [6]:
categorical_columns = train.select_dtypes(include=['object']).columns.tolist()
print("Train: Categorical columns:", categorical_columns)

label_encoder = LabelEncoder()

for column in categorical_columns:
    train[column] = label_encoder.fit_transform(train[column])

categorical_columns_test = test.select_dtypes(include=['object']).columns.tolist()
print("Test: Categorical columns:", categorical_columns_test)

label_encoder = LabelEncoder()

for column in categorical_columns_test:
    test[column] = label_encoder.fit_transform(test[column])

Train: Categorical columns: ['product_type', 'culture_objects_top_25', 'thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion', 'water_1line', 'big_road1_1line', 'railroad_1line', 'ecology']
Test: Categorical columns: ['product_type', 'culture_objects_top_25', 'thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion', 'water_1line', 'big_road1_1line', 'railroad_1line', 'ecology']


<h1>Working<h1>

In [7]:
X = train.loc[:, train.columns != 'price_doc']
y = train[['price_doc']]

# scaler = RobustScaler()
# X = scaler.fit_transform(X)
# test = scaler.transform(test)  

In [8]:
X.shape

(181507, 270)

In [9]:
test.shape

(77789, 270)

<h1>Forward Feature Selection<h1>

In [10]:
# X2 = X[['full_sq', 'floor', 'build_count_monolith', 'industrial_km', 'trc_sqm_500',
#  'mosque_count_500', 'leisure_count_500', 'office_sqm_1000',
#  'cafe_count_1000_price_high', 'leisure_count_1000', 'power_transmission_line_km', 
#  'big_market_km', 'public_healthcare_km', 'workplaces_km']]
# test2 = test[['full_sq', 'floor', 'build_count_monolith', 'industrial_km', 'trc_sqm_500',
#  'mosque_count_500', 'leisure_count_500', 'office_sqm_1000',
#  'cafe_count_1000_price_high', 'leisure_count_1000', 'power_transmission_line_km', 
#  'big_market_km', 'public_healthcare_km', 'workplaces_km']]

<h1>Min_Max Scaling<h1>

In [11]:
scaler = MinMaxScaler()
X3 = scaler.fit_transform(X)
test3 = scaler.transform(test)  

pca = PCA(n_components=10)
principalComponents = pca.fit_transform(X3)
X3 = pd.DataFrame(data = principalComponents)

pca2 = PCA(n_components=10)
principalComponents = pca2.fit_transform(test3)
test3 = pd.DataFrame(data = principalComponents)


poly = PolynomialFeatures(2)
X3 = poly.fit_transform(X3)
test3 = poly.fit_transform(test3)


<h1>Applying Model<h1>

In [12]:
X3.shape

(181507, 66)

In [13]:
test3.shape

(77789, 66)

In [14]:



# Create and train the Lasso regression model
alpha_value = 0.5  # You can adjust the alpha parameter based on your needs
lasso_reg = Lasso(alpha=alpha_value, random_state=42)
lasso_reg.fit(X3, y)

# Print the coefficients and intercept
print("Coefficients:", lasso_reg.coef_)
print("Intercept:", lasso_reg.intercept_)

# Make predictions on the test data
y_pred_lasso = lasso_reg.predict(test3)

Coefficients: [       0.          9455603.63376393   774784.43002902  1141350.18898422
  1422967.40041577   704731.92360835  2621828.44123971  1421845.32718204
  -325721.73241431  1582445.15878778  -479693.37838902  -545089.87491694
   409108.90675451   -77719.3253505    -63183.68245765  -170416.92310457
  -571211.50210851  -248333.19151988    54334.63956282  -293927.49472745
   -59668.50309329 -1805518.3834154   1044598.89161014   222223.81267132
    26050.93694064 -2632738.7311184   -921426.6955648   1099315.8317451
   491472.37865629  -214817.08823468  -823232.94986946    42942.21321355
  -198085.96796254   695189.27904887 -1205685.01151691  -460643.58177586
   559964.21842871  -749047.77207432   282273.29611586 -1180045.03362587
  -826743.08443783 -2128130.79504345  -572741.77276779  1421292.30433163
  1341033.53951585  -956033.23828614  -606068.7550627  -2565240.82110917
   751079.49795276 -1463886.96762499  -648527.26644446  1360967.3180972
  1181528.6985459     78921.97455528  1

  model = cd_fast.enet_coordinate_descent(


In [15]:
# Save the predictions to a CSV file
result_df_lasso = pd.DataFrame({'row ID': testOriginal['row ID'], 'price_doc': y_pred_lasso.flatten()})
result_df_lasso.to_csv('Day_6_Abdullah_Maqsood_24448_Submission_8.csv', index=False)