In [28]:
import pandas as pd

train = pd.read_csv('../preprocessed/train_df.csv')
test = pd.read_csv('../preprocessed/test_df.csv')


In [29]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5352 entries, 0 to 5351
Data columns (total 41 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   list_price                             5352 non-null   float64
 1   price_reduced_amount                   5352 non-null   float64
 2   description.year_built                 5352 non-null   int64  
 3   description.baths_3qtr                 5352 non-null   float64
 4   description.sold_price                 5352 non-null   float64
 5   description.baths_full                 5352 non-null   float64
 6   description.baths_half                 5352 non-null   float64
 7   description.lot_sqft                   5352 non-null   float64
 8   description.sqft                       5352 non-null   float64
 9   description.baths                      5352 non-null   float64
 10  description.sub_type                   5352 non-null   object 
 11  desc

In [30]:
from sklearn.model_selection import train_test_split
x_train = train.drop(columns=['description.sold_price'])
y_train = train['description.sold_price']
x_test = test.drop(columns=['description.sold_price'])
y_test = test['description.sold_price']

In [4]:
print(y_train.mean())


385490.66129409015


In [31]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Define the pipeline

# Define feature types
numerical_features = x_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = x_train.select_dtypes(include=['object']).columns

# Define transformers
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# Create pipeline
lr = Pipeline(steps=[('preprocessor', preprocessor),
                     ('model', Lasso())])

# Fit the pipeline
lr.fit(x_train, y_train)

# Evaluate the model
y_pred = lr.predict(x_test)
print(f"R^2 Score: {lr.score(x_test, y_test):.2f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.2f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R^2 Score: {r2_score(y_test, y_pred):.2f}")

R^2 Score: 0.17
Mean Squared Error: 555175104976.73
Mean Absolute Error: 70754.15
R^2 Score: 0.17


  model = cd_fast.enet_coordinate_descent(


In [34]:
from sklearn.svm import SVR
# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly_features', PolynomialFeatures(degree=3)),
    ('lasso', SVR())
])

# Fit the pipeline
pipeline.fit(x_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(x_test)
print(f"R^2 Score: {pipeline.score(x_test, y_test):.2f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.2f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f}")
print(r2_score(y_test, y_pred))

In [6]:
from sklearn.ensemble import RandomForestRegressor
# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly_features', PolynomialFeatures(degree=2)),
    ('rf', RandomForestRegressor())
])
# Fit the pipeline
pipeline.fit(x_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(x_test)
print(pipeline.score(x_test, y_test))
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

0.9629274875536467
6013558756.7249565
26822.03156024384
0.9629274875536467


In [7]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=3)),
    ('xgb', XGBRegressor())
])

# Define the parameter grid
params = {
    'xgb__n_estimators': [100, 200, 300],
    'xgb__max_depth': [3, 4, 5]
}

# Set up GridSearchCV with the correct scoring parameter
grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')

# Fit the pipeline
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the model using the best model
y_pred = best_model.predict(X_test)
print('Best Parameters:', best_params)
print(best_model.score(X_test, y_test))
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

Best Parameters: {'xgb__max_depth': 5, 'xgb__n_estimators': 300}
0.9696925277982238
MSE: 4916196740.561531
MAE: 15890.89727948471
R2: 0.9696925277982238
