In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_scorehh

In [2]:
ames = pd.read_csv("C:/Users/mehrp/OneDrive/Documents/GitHub/PAs_7.1-7.2/AmesHousing.csv")

In [3]:
ames.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [5]:
lr = LinearRegression()

X = ames[["Gr Liv Area", "TotRms AbvGrd", "Bldg Type", "Lot Area"]]
y = ames["SalePrice"]

#Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

#categorical and numerical columns
categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

#column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)


model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])


model.fit(X_train, y_train)


In [6]:
#using size and number of rooms
pipeline1 = Pipeline([
    ('scaling', StandardScaler()),
    ('regression', LinearRegression())
]).set_output(transform="pandas")

# Fit the pipeline
pipeline1.fit(X_train[['Gr Liv Area', 'TotRms AbvGrd']], y_train)


prediction1 = pipeline1.predict(X_test[['Gr Liv Area', 'TotRms AbvGrd']])
mean_prediction1 = np.mean(prediction1)
mean_prediction1

184472.33179113804

In [7]:

rmse1 = mean_squared_error(y_test, prediction1, squared=False)
rmse1

55372.45300785066

In [8]:
#using size, number of rooms, and building type
pipeline2 = Pipeline([
    ('preprocess', ColumnTransformer([
        ('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd']),
        ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])
    ],
    remainder = "drop")),
    ('regression', LinearRegression())

])


pipeline2.fit(X_train[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']], y_train)


prediction2 = pipeline2.predict(X_test[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']])
mean_prediction1 = np.mean(prediction2)
mean_prediction1

185603.771119978

In [9]:

rmse2 = mean_squared_error(y_test, prediction2, squared=False)
rmse2

54083.12550273998

In [10]:
r_squared = pipeline2.score(X_test[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']], y_test)
r_squared

0.5512204934420772

In [11]:
pipeline3 = Pipeline([
    ('preprocess', ColumnTransformer([
        ('num', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True), ['Gr Liv Area']),
        ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])
    ])),
    ('regression', LinearRegression())
])


pipeline3.fit(X_train[['Gr Liv Area', 'Bldg Type']], y_train)


prediction3 = pipeline3.predict(X_test[['Gr Liv Area', 'Bldg Type']])
mean_prediction3 = np.mean(prediction2)
mean_prediction3

185603.771119978

In [12]:
rmse3 = mean_squared_error(y_test, prediction3, squared=False)
rmse3

54195.4480601071

In [13]:
pipeline4 = Pipeline([
    ('preprocess', ColumnTransformer([
        ('size_poly', PolynomialFeatures(degree=5), ['Gr Liv Area']),
        ('rooms_poly', PolynomialFeatures(degree=5), ['TotRms AbvGrd']),
        ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])
    ])),
    ('regression', LinearRegression())
])


pipeline4.fit(X_train[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']], y_train)


prediction4 = pipeline4.predict(X_test[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']])
mean_prediction4 = np.mean(prediction2)
mean_prediction4

185603.771119978

In [14]:
rmse4 = mean_squared_error(y_test, prediction4, squared=False)
rmse4

53218.39380275462

Best model based on RMSE is the model using size, number of rooms, and building type because its RMSE is the lowest.

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

rmse_scorer = make_scorer(mean_squared_error, squared=False)

scores = cross_val_score(pipeline1, X, y, cv=5, scoring=rmse_scorer)
print(scores)

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\base.py", line 918, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\preprocessing\_data.py", line 837, in fit
    return self.partial_fit(X, y, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\preprocessing\_data.py", line 873, in partial_fit
    X = self._validate_data(
        ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\base.py", line 604, in _validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\sklearn\utils\_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mehrp\anaconda3\Lib\site-packages\pandas\core\generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '1Fam'


In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

rmse_scorer = make_scorer(mean_squared_error, squared=False)

scores = cross_val_score(pipeline2, X, y, cv=5, scoring=rmse_scorer)
print(scores)

[59447.5945622  51677.04316677 57660.55257913 54423.45405505
 47631.76278281]


In [20]:
scores.mean()

54168.08142919383

In [22]:
rmse_scorer = make_scorer(mean_squared_error, squared=False)

scores = cross_val_score(pipeline3, X, y, cv=5, scoring=rmse_scorer)
print(scores)

[59148.73696256 52141.93955263 57470.44206793 54616.25407089
 48345.40142339]


In [23]:
scores.mean()

54344.55481548086

In [24]:
rmse_scorer = make_scorer(mean_squared_error, squared=False)

scores = cross_val_score(pipeline4, X, y, cv=5, scoring=rmse_scorer)
print(scores)

[61412.84577833 53487.89959464 62977.65732549 55348.05302977
 48289.98763143]


My first model failed, but out of the ones that worked, model2 seemes to a the better performer becuause of the lower RMSE. 

In [25]:
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('size_poly', PolynomialFeatures(), ['Gr Liv Area']),
        ('rooms_poly', PolynomialFeatures(), ['TotRms AbvGrd']),
        ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])
    ])),
    ('regression', LinearRegression())
])


param_grid = {
    'preprocessor__size_poly__degree': range(1, 11),
    'preprocessor__rooms_poly__degree': range(1, 11),
}

# Set up the grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform the grid search
gscv_fitted = grid_search.fit(X, y)
gscv_fitted.cv_results_

{'mean_fit_time': array([0.01165509, 0.01213984, 0.00987077, 0.01325197, 0.0163713 ,
        0.02156954, 0.01510334, 0.03244987, 0.01397824, 0.02460175,
        0.01450629, 0.0135118 , 0.02025023, 0.01348372, 0.01569519,
        0.01870985, 0.01587505, 0.01686139, 0.01478243, 0.01403732,
        0.01318612, 0.01286812, 0.01548176, 0.01712899, 0.01451483,
        0.01384301, 0.01396079, 0.01471868, 0.01358695, 0.01399126,
        0.01399388, 0.01374207, 0.01386628, 0.01343794, 0.01268559,
        0.01383762, 0.0133635 , 0.01325936, 0.01368666, 0.01631565,
        0.01454253, 0.01400237, 0.01380544, 0.0127573 , 0.01293497,
        0.01334314, 0.01380792, 0.01401443, 0.01362777, 0.01541443,
        0.01958742, 0.01766515, 0.0108357 , 0.01412511, 0.01310768,
        0.01103911, 0.01038456, 0.01168098, 0.01214595, 0.01304426,
        0.01270328, 0.01457095, 0.01456552, 0.01445832, 0.01425719,
        0.01405468, 0.01643071, 0.0174511 , 0.01626658, 0.01785421,
        0.02184963, 0.02115278,

In [26]:
gscv_fitted.cv_results_['mean_test_score']

array([-2.95199396e+09, -2.92175413e+09, -2.79802096e+09, -2.84489892e+09,
       -3.43303003e+09, -4.12239325e+09, -5.93838046e+09, -1.21074255e+10,
       -3.49276540e+10, -1.10418635e+11, -2.95776263e+09, -2.94657669e+09,
       -2.80530745e+09, -2.84386984e+09, -3.43303007e+09, -4.12239325e+09,
       -5.93838046e+09, -1.21074255e+10, -3.49276540e+10, -1.10418635e+11,
       -2.93244233e+09, -2.94118072e+09, -2.82269740e+09, -2.84314795e+09,
       -3.12013546e+09, -4.12239325e+09, -5.93838046e+09, -1.21074255e+10,
       -3.49276540e+10, -1.10418635e+11, -2.89181129e+09, -2.92746336e+09,
       -2.84066759e+09, -2.80276247e+09, -3.17247052e+09, -4.12239325e+09,
       -5.93838046e+09, -1.21074255e+10, -3.49276540e+10, -1.10418780e+11,
       -2.89418084e+09, -2.96019306e+09, -2.86465615e+09, -2.80473018e+09,
       -3.19880165e+09, -4.12239324e+09, -5.93838046e+09, -1.21074255e+10,
       -3.49276395e+10, -1.10418780e+11, -2.93650960e+09, -2.93980364e+09,
       -2.87504737e+09, -

In [27]:
df = pd.DataFrame(data = {
    "degrees": np.arange(1, 101),
    "scores": gscv_fitted.cv_results_['mean_test_score']
})

sorted_df = df.sort_values(by="scores", ascending=True)

sorted_df = sorted_df.reset_index(drop=True)

sorted_df

Unnamed: 0,degrees,scores
0,93,-2.707161e+12
1,92,-1.298922e+12
2,91,-1.263109e+12
3,60,-1.104188e+11
4,50,-1.104188e+11
...,...,...
95,64,-2.819039e+09
96,13,-2.805307e+09
97,44,-2.804730e+09
98,34,-2.802762e+09


Question1:

The 93rd model based on the cross-validation metrics. 

Question2:

From my reserch, I would say that the time, high cost and the computing invovled are the downsides. Bayesian optimitziation and regulaziation are some methods to become more efficient when choosing smaller numbers. 