In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew
from sklearn.decomposition import PCA
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from matplotlib import pyplot as plt
from sklearn.preprocessing import RobustScaler

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
train = pd.read_csv('/content/gdrive/MyDrive/train.csv')
test = pd.read_csv('/content/gdrive/MyDrive/test.csv')
testOriginal = pd.read_csv('/content/gdrive/MyDrive/test.csv')

In [4]:
train.shape

(181507, 272)

In [5]:
test.drop('row ID', axis=1, inplace=True)

<h3>SubArea Removal<h3>

In [6]:
test.drop('sub_area', axis=1, inplace=True)
train.drop('sub_area', axis=1, inplace=True)

<h1>Categorical To Numerical<h1>

<h3>OneHot<h3>

In [7]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

<h1>Working<h1>

In [8]:
X = train.loc[:, train.columns != 'price_doc']
y = train[['price_doc']]

# scaler = RobustScaler()
# X = scaler.fit_transform(X)
# test = scaler.fit_transform(test)

In [9]:
X.shape

(181507, 287)

In [10]:
test.shape

(77789, 287)

<h1>P-Value Selection<h1>

In [11]:
# # Convert scaled_df1 to a DataFrame without specifying columns
# scaled_df1_df = pd.DataFrame(scaled_df1)

X_p = X.astype(float)

# Add a constant term to the feature matrix
X_with_const = sm.add_constant(X_p)

# Fit a linear regression model
model = sm.OLS(y, X_with_const).fit()

# Get p-values for each feature
p_values = model.pvalues[1:]  # Exclude the constant term

# Set your desired threshold for p-value
threshold = 0.00001

# Filter features based on p-value
selected_features = p_values[p_values < threshold].index

# Display selected features
print("Selected Features:")
print(selected_features)
print(len(selected_features))

# Select columns in the DataFrame
X2 = X[selected_features]
test2 = test[selected_features]


pca = PCA(n_components=10)
principalComponents = pca.fit_transform(X2)
X2 = pd.DataFrame(data = principalComponents)

pca2 = PCA(n_components=10)
principalComponents = pca2.fit_transform(test2)
test2 = pd.DataFrame(data = principalComponents)

# X = X.astype(float)
# X = sm.add_constant(X)
# mod = sm.OLS(y, X)
# res = mod.fit()
# pvalues = res.pvalues
# columns = pvalues[pvalues<0.05].index
# print(columns)
# print(len(columns))

# # Select columns in the DataFrame
# X2 = X[columns]
# test2 = test[columns]

Selected Features:
Index(['full_sq', 'life_sq', 'floor', 'children_preschool',
       'preschool_education_centers_raion',
       'school_education_centers_top_20_raion', 'healthcare_centers_raion',
       'university_top_20_raion', 'male_f', '0_6_female',
       'build_count_monolith', 'raion_build_count_with_builddate_info',
       'build_count_1971-1995', 'build_count_after_1995', 'kindergarten_km',
       'green_zone_km', 'industrial_km', 'water_treatment_km', 'water_km',
       'mkad_km', 'sadovoe_km', 'big_road2_km', 'nuclear_reactor_km',
       'swim_pool_km', 'basketball_km', 'church_synagogue_km', 'catering_km',
       'green_part_500', 'prom_part_500', 'trc_sqm_500',
       'cafe_count_500_price_1000', 'cafe_count_500_price_1500',
       'cafe_count_500_price_4000', 'cafe_count_500_price_high',
       'mosque_count_500', 'leisure_count_500', 'market_count_500',
       'green_part_1000', 'prom_part_1000', 'office_count_1000',
       'office_sqm_1000', 'trc_count_1000', 'trc_sq

<h1> Poly Interaction ONNNNNNNNN<h1>

In [12]:
poly = PolynomialFeatures(2, interaction_only=True)
X3 = poly.fit_transform(X2)
test3 = poly.fit_transform(test2)




<h1>Applying Model<h1>

In [13]:
X3.shape

(181507, 56)

In [14]:
test3.shape

(77789, 56)

In [15]:
# Create and train the Lasso regression model
alpha_value = 0.5  # You can adjust the alpha parameter based on your needs
lasso_reg = Lasso(alpha=alpha_value, random_state=42)
lasso_reg.fit(X3, y)

# Print the coefficients and intercept
print("Coefficients:", lasso_reg.coef_)
print("Intercept:", lasso_reg.intercept_)

# Make predictions on the test data
y_pred_lasso = lasso_reg.predict(test3)

Coefficients: [ 0.00000000e+00  2.04880399e+01  1.13633811e+01  1.27261659e+01
  7.45317708e+00  8.45093196e+00  3.94049354e+02  1.31136058e+03
  3.52040677e+03  9.21865880e+03  1.66576127e+03 -3.29070488e-06
 -1.42749262e-06 -1.24308514e-06  3.27088433e-06  8.90981576e-05
 -4.26973996e-04 -1.01192162e-03 -3.91513477e-03  1.98391964e-04
 -1.16818013e-06 -3.39522859e-06 -3.44364718e-06  2.05502938e-04
  7.65124627e-05 -9.68707597e-04 -1.90007047e-03 -9.85217999e-04
 -6.57718370e-08 -1.07691015e-06  2.97189766e-05 -1.08142077e-04
 -5.56517452e-04 -2.52887970e-03  1.16301440e-03 -1.43178458e-05
 -8.50725604e-05 -3.33670022e-04 -8.13847789e-04 -1.90206969e-03
 -1.50890114e-04 -5.03432747e-04 -7.86699024e-04 -5.88970308e-04
 -2.25755480e-03  2.14332096e-03 -1.35823437e-01  1.08452721e-02
 -6.61536300e-02  5.62453819e-02 -2.21196683e-02 -1.30908527e-01
 -2.51099873e-02 -5.61119462e-01 -9.27728366e-02 -4.77807367e-01]
Intercept: [14751285.29380646]


  model = cd_fast.enet_coordinate_descent(


In [16]:
# Save the predictions to a CSV file
result_df_lasso = pd.DataFrame({'row ID': testOriginal['row ID'], 'price_doc': y_pred_lasso.flatten()})
result_df_lasso.to_csv('/content/gdrive/MyDrive/Day_6_Abdullah_Maqsood_24448_Submission_4.csv', index=False)
