In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
import seaborn as sns

In [None]:
# Read the house-pricing CSV from its raw GitHub URL into a DataFrame,
# then show the first rows as the cell output.
url = 'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/house_pricing.csv'
df = pd.read_csv(url)
df.head()

Feature analysis and selection

In [None]:
# Scatter of Price against SquareFeet to eyeball how the two columns relate.
plt.scatter(df['SquareFeet'], df['Price'])

Check correlation between features and target when using linear models
- features are numeric
- predictions are numerical

In [None]:
from sklearn.linear_model import LinearRegression

# Single-feature fit: Price modelled from SquareFeet alone.
area = df[['SquareFeet']]
model = LinearRegression().fit(area, df.Price)
pred = model.predict(area)

# Raw points with the fitted line drawn on top in red.
plt.scatter(df.SquareFeet, df.Price)
plt.plot(df.SquareFeet, pred, color='red')

# Pearson correlation between the same two columns.
pc = df.SquareFeet.corr(df.Price)
print('Pearson Correlation:', pc)

In [None]:
# Correlation of every numeric column with Price.
(
    df
    .select_dtypes(include='number')
    .corrwith(df['Price'])
)

Pearson correlation coefficient
- if the value is close to 1, strong positive correlation
- if the value is close to -1, strong negative correlation
- if the value is close to 0, little or no linear correlation

In [None]:
# Names of the numeric columns in the frame.
(
    df
    .select_dtypes(include='number')
    .columns
)

In [None]:
# Numeric feature columns only. The target is excluded so we do not draw a
# Price-vs-Price panel, whose fit is trivially perfect and tells us nothing.
num_cols = [
    name
    for name in df.select_dtypes(include='number').columns
    if name != 'Price'
]
for col in num_cols:
    # One-feature regression so a trend line can be overlaid on the scatter.
    model = LinearRegression()
    model.fit(df[[col]], df.Price)

    plt.scatter(df[col], df.Price)
    plt.plot(df[col], model.predict(df[[col]]), color='red')
    plt.title(col)
    # Label the axes so each figure can be read on its own.
    plt.xlabel(col)
    plt.ylabel('Price')
    plt.show()

For categorical independent variables with a numerical target

In [None]:
# Show every column name in the frame.
df.columns

In [None]:
# Columns treated as categorical for the Price-by-group box plots.
cat_cols = ['City', 'Beds', 'Baths']
for col in cat_cols:
    # One figure per column: distribution of Price within each of its groups.
    df.boxplot(column='Price', by=col, grid=False)
    plt.gca().set_title(col)

# Testing with ANOVA
- for each feature, calculate the F-statistic and p-value
- if p-value < 0.05, reject the null hypothesis (the group means differ, so the column is important)
- if p-value >= 0.05, fail to reject the null hypothesis (no evidence that the group means differ, so the column is not important)

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA per categorical column: compares Price across the column's groups.
for col in cat_cols:
    print(col)
    # One Series of prices per group of the current column.
    data = [prices for level, prices in df.groupby(col)['Price']]
    f, p = f_oneway(*data)
    # General format keeps very small p-values visible; '.2f' would print
    # anything below 0.005 as 0.00.
    print(f'F-Statistic: {f:.2f}, P-Value: {p:.4g}')

In [None]:
# Feature columns used for modelling, and the regression target.
selected_cols = ['SquareFeet', 'Beds', 'Baths', 'Type']
X = df.loc[:, selected_cols]
y = df['Price']

In [None]:
# Number of rows for each distinct value of the Type column.
X.Type.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
# Route columns by dtype: numeric ones are scaled, object ones are one-hot encoded.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include='object').columns

num_pipe = Pipeline([
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_cols),
    # handle_unknown='ignore': a category that was absent from the fitting data
    # (easy to hit with a random split or a small learning-curve subset) is
    # encoded as all zeros instead of raising an error at transform time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Full model: preprocessing followed by a linear regression.
model = Pipeline([
    ('pre', preprocessor),
    ('lr', LinearRegression())
])
model

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model.fit(X_train, y_train)

# Predictions on both partitions so train and test errors can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# The same three metrics are printed, in the same order, for each partition.
metrics = (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R2:", r2_score),
)

print("Training Results")
for label, metric in metrics:
    print(label, metric(y_train, y_train_pred))
print('-'*50)
print("Testing Results")
for label, metric in metrics:
    print(label, metric(y_test, y_test_pred))

Model persistence

In [None]:
import os

import joblib

# Models are kept in a "models" folder under the current working directory;
# create it if it does not exist yet.
models_dir = os.path.join(os.getcwd(), 'models')
os.makedirs(models_dir, exist_ok=True)

# Dump the pipeline held in `model` to disk.
path = os.path.join(models_dir, 'house_price_model')
joblib.dump(model, path)

Loading a saved model

In [None]:
# Rebuild the path to the saved file and load it back, replacing `model`
# with the deserialised pipeline.
# NOTE(review): joblib.load is pickle-based, so only load files you created
# yourself or otherwise trust.
path = os.path.join(os.getcwd(), 'models', 'house_price_model')
model=joblib.load(path)
model

Making predictions with model

In [None]:
# Smoke test: predict for the first few rows of X.
sample_rows = X.head()
sample_predictions = model.predict(sample_rows)
print(sample_predictions)

In [None]:
# Distinct values of the Type column, as a plain Python list.
X['Type'].unique().tolist()

In [None]:
# Hand-built single-row input; the keys use the same column names as X.
inpX = pd.DataFrame([
    {'SquareFeet': 2000, 'Beds': 3, 'Baths': 2, 'Type': 'Condo'},
])
result = model.predict(inpX)

# One predicted value per line.
print('\n'.join(str(value) for value in result))

Evaluating the training and testing performance of a model

In [None]:
from sklearn.model_selection import learning_curve, cross_val_score

# Training-set sizes to evaluate, passed to learning_curve as fractions.
train_sizes = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y, cv=5, train_sizes=train_sizes
)

# Average the per-fold scores at each size and draw both curves.
for fold_scores, curve_label in ((train_scores, 'Train'), (test_scores, 'Test')):
    plt.plot(train_sizes, fold_scores.mean(axis=1), label=curve_label, marker='o')
plt.grid()
plt.legend()
plt.show()

making a decision tree model, to check the updated accuracy

In [None]:
from sklearn.tree import DecisionTreeRegressor, plot_tree

In [None]:
from sklearn.base import clone

# Decision-tree variant of the pipeline.
# - clone(preprocessor) gives this pipeline its own unfitted copy, so fitting
#   model2 does not refit the transformer object held by `preprocessor`.
# - random_state pins the tree so repeated runs give the same scores.
model2 = Pipeline([
    ('pre', clone(preprocessor)),
    ('dt', DecisionTreeRegressor(random_state=42))
])
model2.fit(X_train, y_train)

# Note: these overwrite the prediction variables of the same name.
y_train_pred = model2.predict(X_train)
y_test_pred = model2.predict(X_test)

print("Training Results")
print("MSE:", mean_squared_error(y_train, y_train_pred))
print("MAE:", mean_absolute_error(y_train, y_train_pred))
print("R2:", r2_score(y_train, y_train_pred))
print('-'*50)
print("Testing Results")
print("MSE:", mean_squared_error(y_test, y_test_pred))
print("MAE:", mean_absolute_error(y_test, y_test_pred))
print("R2:", r2_score(y_test, y_test_pred))

In [None]:
# Large canvas so the nodes stay legible; only the top levels are drawn.
fig = plt.figure(figsize=(30, 30))
tree_step = model2.named_steps['dt']
_ = plot_tree(tree_step, filled=True, max_depth=5, fontsize=14)