# Regression

Linear regression is used to determine how well a set of predictor variables predicts an outcome (dependent) variable.  It also attempts to determine which particular variables are significant predictors of the outcome variable, and the magnitude and sign of each variable's effect on the outcome variable.

In [None]:
# Imports
import matplotlib.pyplot as plt  
import numpy as np  
import pandas as pd  
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns 
import sklearn
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_predict 
import statsmodels.api as sm


# Use Simple Linear Regression to Predict Prices for Houses in Memphis in the 38118 Zip Code Based on Square Footage

### First, import the dataset and display the ranges of the values

In [None]:
# Load the Memphis (38118 zip code) house data for Simple Linear Regression.
# The column names are supplied explicitly through `names`.
cols = [
    "SF", "Price", "Beds", "Baths", "Year_Built",
    "Lot_Size_Acres", "Garage_Size", "Stories", "Brick",
]
SLR_df = pd.read_csv('data/Linear_Regression_Memphis.csv', names=cols)

# Preview the first three rows
print("Overview of data:")
print(SLR_df.head(3))

# Report the minimum and maximum of each attribute.
# Each entry pairs a message template with the column it describes; the
# templates keep their own number formats (integer, 1 or 2 decimals).
print("\nRanges for each data attribute:")
range_reports = [
    ("Square footage range is {:,d} to {:,d}.", "SF"),
    ("Price range is ${:,d} to ${:,d}.", "Price"),
    ("Number of bedrooms ranges from {:,d} to {:,d}.", "Beds"),
    ("Number of baths ranges from {:.1f} to {:.1f}.", "Baths"),
    ("Year built ranges from {:d} to {:d}.", "Year_Built"),
    ("Garage size ranges from {:d} to {:d}.", "Garage_Size"),
    ("Number of stories ranges from {:d} to {:d}.", "Stories"),
    ("Lot size ranges from {:.2f} to {:.2f} acres.", "Lot_Size_Acres"),
]
for template, column in range_reports:
    column_values = SLR_df[column]
    print(template.format(column_values.min(), column_values.max()))


### Plot a subset of the house prices as a function of the square footage

In [None]:
# Scatter plot of price against square footage, without a trend line

# Draw a random 20% sample of the rows to plot
Subset_SLR_df = SLR_df.sample(frac=0.2)

# Build the figure and apply axis titles and a centred chart title in one chain
fig = (
    px.scatter(Subset_SLR_df, x="SF", y="Price")
    .update_xaxes(title_text='SF')
    .update_yaxes(title_text='Price ($)')
    .update_layout(
        title_text='House prices as a function of square footage in Memphis',
        title_x=0.5,
    )
)
fig.show()

### Now plot the house prices as a function of the square footage, but this time show a trendline

In [None]:
# Plot house data with a trend line

# Create the scatterplot; plotly fits and draws the OLS trend line itself
fig = px.scatter(Subset_SLR_df, x="SF", y="Price", trendline="ols")
fig.update_xaxes(title_text='SF')
fig.update_yaxes(title_text='Price ($)')
fig.update_layout(title_text='House prices as a function of square footage in Memphis with trendline', title_x=0.5)
fig.show()

# Fit the same ordinary-least-squares line with statsmodels so that its
# equation and R-squared can be printed
y = Subset_SLR_df["Price"]
x = Subset_SLR_df["SF"]

# Add the intercept column (statsmodels names it "const")
x = sm.add_constant(x)
# Create the model for the trendline and fit it
model = sm.OLS(y, x)
results = model.fit()

# results.params is a Series labelled by column name; look the values up by
# label rather than by integer position (integer keys on a labelled Series
# are deprecated in pandas).
print("Equation for trendline = {:.2f}*SF + {:.2f}\n".format(results.params["SF"], results.params["const"]))
print("The R-squared value = {:.3f}".format(results.rsquared))




"R-squared is the percent of variance explained by the model." (https://www.statisticshowto.com/rmse/)
An R-squared value of 1.0 would mean that all values fell exactly on the trendline.  So, the higher the R-squared value, the better.  Values range from 0 to 1.0.

### Let's create a model to predict house prices.  

* We will do this using Simple Linear Regression and the model will only take square footage into account.
* We will divide our data up into 2 groups - the training group and testing group.  80% will be in training, 20% in testing.

In [None]:
# Pull the predictor (SF) and the target (Price) out as 2-D column arrays
SLR_X = SLR_df['SF'].to_numpy().reshape(-1, 1)
SLR_y = SLR_df['Price'].to_numpy().reshape(-1, 1)

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle
(SLR_X_train, SLR_X_test,
 SLR_y_train, SLR_y_test) = train_test_split(SLR_X, SLR_y, test_size=0.2, random_state=0)

# Fit the simple linear regression on the training rows
regressor = LinearRegression()
regressor.fit(SLR_X_train, SLR_y_train)

# The target was passed as a 2-D array, so coef_ is indexed [0][0] and
# intercept_ is indexed [0] to get the single slope / intercept values
slope = regressor.coef_[0][0].astype(float)
intercept = regressor.intercept_[0].astype(float)


### Make predictions based on the model we just created and see how well it performs.


In [None]:
# Predict prices for the held-out test rows
SLR_y_pred = regressor.predict(SLR_X_test)

# Print the predictions vs actual values for EVERY test row.
# (The previous range(0, len(...) - 1) silently skipped the last row.)
# The arrays are 2-D with a single column, so take column 0 to get scalars;
# calling int() on a 1-element array is deprecated in NumPy.
print("Simple Linear Regression Predictions:")
for sf, predicted, actual in zip(SLR_X_test[:, 0], SLR_y_pred[:, 0], SLR_y_test[:, 0]):
    print("SF = {:,d}, Prediction = {:,d}, Actual = {:,d}, Difference =  {:,d}".format(
        int(sf),
        int(predicted),
        int(actual),
        int(predicted - actual)))

# Print equation for regression
print("\nEquation created during training that is used to make predictions = {:.2f}*SF + {:.2f}".format(slope, intercept))

# Print equation for trendline (parameters looked up by label, not position)
print("Equation for trendline = {:.2f}*SF + {:.2f}\n".format(results.params["SF"], results.params["const"]))


### Use the Root Mean Square Error metric to evaluate how well our model performed.  

The RMSE can be interpreted as the standard deviation of the unexplained variance, and has the useful property of being in the same units as the response variable. Lower values of RMSE indicate better fit. RMSE is a good measure of how accurately the model predicts the response, and it is the most important criterion for fit if the main purpose of the model is prediction. (https://www.theanalysisfactor.com/assessing-the-fit-of-regression-models/)

In [None]:
# RMSE = square root of the mean squared error between actual and predicted prices
SLR_MSE = metrics.mean_squared_error(SLR_y_test, SLR_y_pred)
SLR_RMSE = np.sqrt(SLR_MSE)
slr_rmse_message = '\nRoot Mean Squared Error For Simple Linear Regression: {:.2f}\n'
print(slr_rmse_message.format(SLR_RMSE))


# Use Multiple Variable Linear Regression to Predict Prices for Houses in Memphis in the 38118 Zip Code.  This time we will use all the variables, not just square footage. Square footage is scaled to acres to have the same units as lot size.

### We are using the same dataset as before.

In [None]:
# Column names for the dataset (supplied explicitly to read_csv)
cols = [
    "SF", "Price", "Beds", "Baths", "Year_Built",
    "Lot_Size_Acres", "Garage_Size", "Stories", "Brick",
]

# Load the data and append SF_Acres: square footage converted to acres
# (43,560 square feet per acre) so it is in the same unit as Lot_Size_Acres
MLR_df = pd.read_csv('data/Linear_Regression_Memphis.csv', names=cols).assign(
    SF_Acres=lambda frame: frame["SF"] / 43560
)

print(MLR_df.head(5))

### Let's create a model to predict house prices.
* We will do this using Linear Regression and the model will take all features into account.
* We will divide our data up into 2 groups - the training group and testing group. 80% will be in training, 20% in testing.

In [None]:
# Split the frame into the regression inputs (X) and the value being
# predicted, the house price (y). SF_Acres is used in place of raw SF.
mlr_feature_names = [
    "SF_Acres", "Beds", "Baths", "Year_Built",
    "Lot_Size_Acres", "Garage_Size", "Stories", "Brick",
]
MLR_X = MLR_df[mlr_feature_names]
MLR_y = MLR_df["Price"]

# 80/20 train/test split; random_state=0 fixes the shuffle
(MLR_X_train, MLR_X_test,
 MLR_y_train, MLR_y_test) = train_test_split(MLR_X, MLR_y, test_size=0.2, random_state=0)

# Fit the multiple linear regression on the training rows
regressor = LinearRegression()
regressor.fit(MLR_X_train, MLR_y_train)

### Now that we have the model, let's see how much each of the attributes contributes to making the predictions.

In [None]:
# Get the coefficients for each of the attributes, labelled by feature name
MLR_coeff_df = pd.DataFrame(regressor.coef_, MLR_X.columns, columns=['Coefficient'])

# Print the value of each coefficient
print("The coefficients for each attribute are: ")
print(MLR_coeff_df)

# Print the intercept
print("\nThe intercept is {:.2f}\n".format(regressor.intercept_))

# Show the equation.
# The coefficients are printed at their true values. Previously each one was
# divided by 100 while the intercept was not, so the printed equation did not
# evaluate to the predicted price; every term also carried a trailing " +".
cols_for_equation = ["SF_Acres", "Beds", "Baths", "Year_Built", "Lot_Size_Acres", "Garage_Size", "Stories", "Brick"]
equation_terms = [
    "({:.2f}*{})".format(coefficient, column)
    for column, coefficient in zip(cols_for_equation, MLR_coeff_df["Coefficient"])
]
equation_terms.append("({:.2f})".format(regressor.intercept_))
equation = "Price = " + " + ".join(equation_terms)

print("*****\nThe equation created during training that is used to make predictions = \n")
print(equation)



### Explanation of coefficients
* The size of the coefficient for each independent variable gives you the size of the effect that variable is having on your dependent variable, in this case, the price.  The sign on the coefficient (positive or negative) gives you the direction of the effect. The coefficient tells you how much the dependent variable is expected to increase when that independent variable increases by one, holding all the other independent variables constant.  (https://dss.princeton.edu/online_help/analysis/interpreting_regression.htm#coefficients)
* The Lot Size has the greatest effect on price.
* The number of stories has a much greater effect than the square footage.  Surprising???  Only 7 of the 86 houses were 2 story, so we need to go back and re-evaluate having this variable in our model.
* Garage size had a negative effect on the price.

### Make predictions based on the model we just created and see how well it performs

In [None]:
# Make predictions for the held-out test rows
MLR_y_pred = regressor.predict(MLR_X_test)

# Column 0 is 'Predicted', column 1 is 'Actual'; rows keep MLR_y_test's index
MLR_Actual_Predicted_df = pd.DataFrame({'Predicted': MLR_y_pred, 'Actual': MLR_y_test})

# Original square footage of the test rows. MLR_X_test only holds SF_Acres,
# which is below 1 and would print as 0 once converted to an integer.
MLR_test_SF = MLR_df.loc[MLR_X_test.index, "SF"]

# Print the predictions vs actual values for EVERY test row.
# Fixes over the previous loop:
#   * range(0, len(...) - 1) skipped the last row;
#   * the 'Predicted' and 'Actual' columns were read by position and swapped;
#   * the difference is now prediction - actual, matching the SLR output.
print("Predictions:")
for sf, predicted, actual in zip(MLR_test_SF,
                                 MLR_Actual_Predicted_df["Predicted"],
                                 MLR_Actual_Predicted_df["Actual"]):
    print("SF = {:,d}, Prediction = {:,d}, Actual = {:,d}, Difference =  {:,d}".format(
        int(sf),
        int(predicted),
        int(actual),
        int(predicted - actual)))



### Use the Root Mean Square Error metric to evaluate how well our model performed

In [None]:
# RMSE = square root of the mean squared error between actual and predicted prices
MLR_MSE = metrics.mean_squared_error(MLR_y_test, MLR_y_pred)
MLR_RMSE = np.sqrt(MLR_MSE)
mlr_rmse_message = '\nRoot Mean Squared Error: {:.2f}'
print(mlr_rmse_message.format(MLR_RMSE))

# Comparisons of Simple Linear Regression versus Multiple Variable Linear Regression

### Compare RMSE

In [None]:
# Print the two RMSE values, one per line (lower is better)
rmse_summary = "RMSE for Linear Regression = {:.2f}\nRMSE for Multiple Linear Regression = {:.2f}"
print(rmse_summary.format(SLR_RMSE, MLR_RMSE))

### Compare predictions

In [None]:
# Print the SLR predictions vs MLR predictions and count which model was
# closer to the actual price on each test row.
#
# NOTE(review): SLR_y_pred and the MLR results are paired by position. This
# assumes both train/test splits selected the same rows (same file, same
# test_size, same random_state) -- confirm if either split is changed.
print("SLR predictions vs MLR prediction:")
SLR_total = 0
MLR_total = 0

# Original square footage of the test rows (MLR_X_test only holds SF_Acres,
# which would print as 0 once converted to an integer)
comparison_SF = MLR_df.loc[MLR_X_test.index, "SF"]

# Iterate over EVERY test row (the previous range(...)-1 skipped the last one)
# and read 'Predicted'/'Actual' by name; they were previously read by position
# and swapped, so the SLR error was measured against the MLR prediction
# instead of the actual price.
for sf, slr_value, mlr_value, actual_value in zip(comparison_SF,
                                                  SLR_y_pred[:, 0],
                                                  MLR_Actual_Predicted_df["Predicted"],
                                                  MLR_Actual_Predicted_df["Actual"]):
    slr_price = int(slr_value)
    mlr_price = int(mlr_value)
    actual_price = int(actual_value)

    prediction = "SF = {:,d}, SLR Prediction = {:,d}, MLR Prediction = {:,d}, Actual = {:,d}".format(
        int(sf), slr_price, mlr_price, actual_price)

    # Calculate absolute errors to determine which model made a better prediction
    SLR_diff = abs(slr_price - actual_price)
    MLR_diff = abs(mlr_price - actual_price)

    # Ties are credited to MLR, as before
    if SLR_diff < MLR_diff:
        prediction = prediction + " - SLR made a better prediction"
        SLR_total += 1
    else:
        prediction = prediction + " - MLR made a better prediction"
        MLR_total += 1

    print(prediction)

print("\nSLR made {:d} better predictions".format(SLR_total))
print("MLR made {:d} better predictions".format(MLR_total))

# Test with new, unseen data to test model

In [None]:
# Feature names for the new rows. They must match the columns the model was
# trained on (MLR_X), including their order, so the first one is "SF_Acres".
X_columns = ["SF_Acres", "Beds", "Baths", "Year_Built", "Lot_Size_Acres", "Garage_Size", "Stories", "Brick"]

# New, unseen houses: (heading, feature values in X_columns order, actual price).
# Values are numeric rather than strings so no implicit string-to-number
# conversion is needed when predicting.
new_houses = [
    ("New data #1", [0.02555096, 3, 2, 1973, 0.15, 0, 1, 1], 57414),
    ("\nNew data #2", [0.03466483, 3, 2, 1962, 0.25, 0, 1, 1], 77276),
]

for heading, feature_values, y_test in new_houses:
    # One-row frame with named columns, as the fitted model expects
    X_test = pd.DataFrame(data=[feature_values], columns=X_columns)

    # predict returns one value per input row; take the single element
    # explicitly (int() on a 1-element array is deprecated in NumPy)
    y_pred = regressor.predict(X_test)
    predicted_price = int(y_pred[0])

    print(heading)
    print("SF in Acres = {}, Prediction = {:,d}, Actual = {:,d}, Difference = {:,d}".format(
        X_test["SF_Acres"].iloc[0],
        predicted_price,
        y_test,
        predicted_price - y_test))


Credits
* https://towardsdatascience.com/a-beginners-guide-to-linear-regression-in-python-with-scikit-learn-83a8f7ae2b4f
* www.plotly.com
* https://www.theanalysisfactor.com/assessing-the-fit-of-regression-models/
* https://www.statisticshowto.com/rmse/
* https://dss.princeton.edu/online_help/analysis/interpreting_regression.htm#coefficients
* All the house information came from www.zillow.com
