In [None]:
## Import Python Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline

In [None]:
## Data Loading and Preprocessing

# NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm the
# environment pins an older release, otherwise this cell cannot run.
boston = load_boston()

# Lay the feature matrix out as a labelled table and append the target column.
boston_data = pd.DataFrame(data=boston.data, columns=boston.feature_names)
boston_data['Price'] = boston.target

# Separate the target from the predictors.
y = boston_data['Price']
X = boston_data.drop(columns='Price')

# 70/30 hold-out split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=16
)

boston_data.info()

In [None]:
## Linear Regression Training and Testing

# Fit a plain LinearRegression on the training split (fit returns the estimator).
lr_pipe = LinearRegression().fit(X_train, y_train)

# Predict on the hold-out split and report the root-mean-squared error.
y_pred = lr_pipe.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {round(rmse, 3)}")

In [None]:
## Visualize Regression Line

# Predicted vs. true prices on the hold-out split. Points lying on the dashed
# diagonal (y = x) are perfect predictions.
fig, ax = plt.subplots(figsize=(4, 3))
ax.scatter(y_test, y_pred)
ax.plot([0, 50], [0, 50], '--k')
ax.axis('tight')
ax.set_xlabel('True price ($1000s)')
ax.set_ylabel('Predicted price ($1000s)')
ax.set_title('Predicted vs. true price')
# tight_layout must run after labels/title exist, otherwise it cannot reserve
# room for them and they may be clipped on a figure this small.
fig.tight_layout()
plt.show()

In [None]:
## Linear Model Coefficient Estimation

# Walk the feature names and fitted weights together, in training-column order.
for col_name, weight in zip(X_train.columns, lr_pipe.coef_):
    print(f"The coefficient for {col_name} is {weight}")

In [None]:
## Residuals vs Fitted (training split)

# Predict once and reuse the result for both axes.
fitted_train = lr_pipe.predict(X_train)
# Sign convention used here: prediction minus observed value.
resid_train = fitted_train - y_train

plt.scatter(fitted_train, resid_train, c='b', s=40, alpha=0.5)
# Zero-error reference line.
plt.hlines(y=0, xmin=0, xmax=50)
plt.title("Residuals vs fitted")
plt.xlabel("Fitted")
plt.ylabel("Residuals")
plt.show()

In [None]:
## Evaluate Linear Regression Model

# Table of actual vs. predicted prices on the hold-out split, plus the squared
# error of every row. np.asarray drops the shuffled index of y_test so the two
# columns align positionally.
real_predict = pd.DataFrame({
    'Real': np.asarray(y_test),
    'Predict': np.asarray(y_pred),
})
real_predict['squared_dif'] = (real_predict['Real'] - real_predict['Predict']) ** 2

# SSE: sum of squared residuals.
SSE = real_predict['squared_dif'].sum()

# TSS: sum of squared deviations of the actual values from their mean.
# (Equivalent to population variance * n; the previous "n + 1" factor was wrong.)
TSS = ((real_predict['Real'] - real_predict['Real'].mean()) ** 2).sum()

# Coefficient of determination, R^2 = 1 - SSE / TSS.
# (The previous version referenced the undefined name `SEE`.)
Rsquared = 1 - SSE / TSS
Rsquared

In [None]:
## Prepare for Logistic Regression

# Turn the continuous price into a two-class label by splitting at the median
# (two equal-frequency bins). assign() returns a new frame, so boston_data is
# left untouched.
price_class = pd.qcut(boston_data['Price'], q=2, labels=["low", "high"])
boston_data2 = boston_data.assign(Price=price_class)

# Separate the class label from the predictors.
y2 = boston_data2['Price']
X2 = boston_data2.drop(columns='Price')

# Same 70/30 split and seed as the regression experiment.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=0.3, random_state=16
)

# Class balance of the derived label.
boston_data2['Price'].value_counts()

In [None]:
## Logistic Regression Model Training and Testing

# Standardise the features, then fit a logistic regression on the binary
# low/high price label.
logr_pipe = make_pipeline(StandardScaler(), LogisticRegression(solver='lbfgs'))
logr_pipe.fit(X_train2, y_train2)
y_pred2 = logr_pipe.predict(X_test2)

# Summary of predicted classes on the hold-out split: absolute count and
# percentage share. The column is accessed as x['count'] because x.count
# resolves to the DataFrame.count method, not the column.
(
    pd.Series(y_pred2)
    .value_counts()
    .rename('count')  # explicit name: the default differs across pandas versions
    .to_frame()
    .assign(pct=lambda x: 100 * x['count'] / x['count'].sum())
)