# **Data Study Notebook**

## Objectives

- Answer Business Requirement 1: The client is interested in discovering how house attributes correlate with the sale price.

## Inputs

- outputs/datasets/cleaned/HousePricesCleaned.csv

## Outputs

- Generate code that answers Business Requirement 1 and can be reused to build the Streamlit app
- Save plots in the docs/plots folder for documentation


---

# Change working directory

In [None]:
import os

# The notebook is assumed to start one level below the project root, so the
# parent of the starting directory becomes the working directory.
# NOTE(review): re-running this cell moves up one more level each time.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
os.chdir(project_root)
print("Working directory set to", os.getcwd())

## Load Cleaned Data

In [None]:
import pandas as pd

# Read the cleaned dataset (path is relative to the working directory),
# then report its dimensions and preview the first rows.
cleaned_data_path = "outputs/datasets/cleaned/HousePricesCleaned.csv"
df = pd.read_csv(cleaned_data_path)
print(df.shape)
df.head()

## Data Exploration

In [None]:
from pandas_profiling import ProfileReport

# Build the profiling report in minimal mode and render it inline.
pandas_report = ProfileReport(
    df=df,
    minimal=True,
)
pandas_report.to_notebook_iframe()

Create a new directory to save plots

In [None]:
import os

# exist_ok=True makes the cell idempotent: re-running it no longer prints a
# "File exists" error. Genuine filesystem problems (e.g. permissions) are
# still reported rather than raised, keeping the original best-effort intent.
try:
    os.makedirs("docs/plots", exist_ok=True)
except OSError as e:
    print(e)

Suppress Warnings

In [None]:
import warnings

# Silence FutureWarning messages whose originating module matches
# "ppscore.calculation"; all other warnings are left untouched.
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="ppscore.calculation",
)

---

## Correlation and PPS Analysis

In [None]:
%matplotlib inline

import numpy as np
import ppscore as pps
import seaborn as sns
sns.set_theme(style="whitegrid")
import matplotlib.pyplot as plt



def heatmap_corr(df,threshold, figsize=(22,12), font_annot = 9):
    """
    Draw, save and show a heatmap of a correlation matrix.

    The upper triangle (diagonal included) is hidden, as is every cell
    whose absolute value is below ``threshold``. Nothing is drawn when the
    matrix has a single column or none.

    Parameters:
        df: square correlation DataFrame. If it carries a ``name``
            attribute equal to "corr_spearman" the figure is saved as
            docs/plots/heatmap_corr_spearman.png; in every other case
            (including when no such attribute exists) it is saved as
            docs/plots/heatmap_corr_pearson.png.
        threshold: minimum absolute correlation for a cell to be shown.
        figsize: figure size passed to matplotlib.
        font_annot: font size of the cell annotations.
    """
    if len(df.columns) <= 1:
        return

    mask = np.zeros_like(df, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    mask[abs(df) < threshold] = True

    fig, axes = plt.subplots(figsize=figsize)
    sns.heatmap(df, annot=True, xticklabels=True, yticklabels=True,
                mask=mask, cmap='viridis', annot_kws={"size": font_annot},
                ax=axes, linewidth=0.5)
    axes.set_yticklabels(df.columns, rotation=0)
    plt.ylim(len(df.columns), 0)

    # ``name`` is an ad-hoc attribute that pandas does not guarantee to keep,
    # so read it defensively instead of failing with AttributeError. The
    # isinstance check guards against a column that happens to be called
    # "name", which attribute access would return as a Series.
    label = getattr(df, "name", None)
    is_spearman = isinstance(label, str) and label == "corr_spearman"
    method = "spearman" if is_spearman else "pearson"
    plt.savefig(f"docs/plots/heatmap_corr_{method}.png", bbox_inches='tight')
    plt.show()


def heatmap_pps(df,threshold, figsize=(22,12), font_annot = 9):
    """
    Draw, save and show a heatmap of a PPS matrix.

    Cells whose absolute score is below ``threshold`` are masked. The
    figure is written to docs/plots/heatmap_pps.png. Nothing is drawn when
    the matrix has a single column or none.
    """
    n_cols = len(df.columns)
    if n_cols <= 1:
        return

    # Hide every cell that does not reach the threshold.
    hidden = np.zeros_like(df, dtype=bool)
    hidden[abs(df) < threshold] = True

    fig, ax = plt.subplots(figsize=figsize)
    ax = sns.heatmap(
        df,
        annot=True,
        xticklabels=True,
        yticklabels=True,
        mask=hidden,
        cmap='rocket_r',
        annot_kws={"size": font_annot},
        linewidth=0.05,
        linecolor='grey',
        ax=ax,
    )

    ax.set_ylim(n_cols, 0)
    fig.savefig('docs/plots/heatmap_pps.png', bbox_inches='tight')
    plt.show()


def CalculateCorrAndPPS(df):
    """
    Compute Pearson and Spearman correlation matrices and the PPS matrix.

    Each correlation matrix is tagged with a ``name`` attribute
    ("corr_spearman" / "corr_pearson"). Summary statistics of the PPS
    scores below 1 are printed to help choose a heatmap threshold.

    Returns:
        (df_corr_pearson, df_corr_spearman, pps_matrix), where pps_matrix
        is pivoted with ``y`` as index and ``x`` as columns.
    """
    spearman_matrix = df.corr(method="spearman")
    spearman_matrix.name = 'corr_spearman'
    pearson_matrix = df.corr(method="pearson")
    pearson_matrix.name = 'corr_pearson'

    # Long-format scores from ppscore, then reshaped into a y-by-x grid.
    pps_long = pps.matrix(df)
    pps_scores = pps_long.filter(['x', 'y', 'ppscore'])
    pps_wide = pps_scores.pivot(columns='x', index='y', values='ppscore')

    # Scores equal to 1 are left out of the summary statistics.
    below_one = pps_long.query("ppscore < 1")
    score_summary = below_one.filter(['ppscore']).describe().T
    print("PPS threshold - check PPS score IQR to decide threshold for heatmap \n")
    print(score_summary.round(3))

    return pearson_matrix, spearman_matrix, pps_wide


def DisplayCorrAndPPS(df_corr_pearson, df_corr_spearman, pps_matrix,CorrThreshold,PPS_Threshold,
                      figsize=(22,12), font_annot=9 ):
    """
    Print guidance text and render the Spearman, Pearson and PPS heatmaps.

    ``CorrThreshold`` is applied to both correlation heatmaps and
    ``PPS_Threshold`` to the PPS heatmap; ``figsize`` and ``font_annot``
    are forwarded unchanged to every heatmap.
    """
    plot_options = {"figsize": figsize, "font_annot": font_annot}

    print("\n")
    print("* Analyze how the target variable for your ML models are correlated with other variables (features and target)")
    print("* Analyze multi-colinearity, that is, how the features are correlated among themselves")

    # Spearman first, then Pearson: heading, explanation and matrix.
    correlation_sections = (
        (
            "*** Heatmap: Spearman Correlation ***",
            "It evaluates monotonic relationship \n",
            df_corr_spearman,
        ),
        (
            "*** Heatmap: Pearson Correlation ***",
            "It evaluates the linear relationship between two continuous variables \n",
            df_corr_pearson,
        ),
    )
    for heading, explanation, corr_matrix in correlation_sections:
        print("\n")
        print(heading)
        print(explanation)
        heatmap_corr(df=corr_matrix, threshold=CorrThreshold, **plot_options)

    print("\n")
    print("*** Heatmap: Power Predictive Score (PPS) ***")
    print(
        "PPS detects linear or non-linear relationships between two columns.\n"
        "The score ranges from 0 (no predictive power) to 1 (perfect predictive power) \n"
    )
    heatmap_pps(df=pps_matrix, threshold=PPS_Threshold, **plot_options)

In [None]:
# Compute the matrices before displaying them: no other cell assigns
# df_corr_pearson, df_corr_spearman or pps_matrix, so without this call a
# top-to-bottom run of the notebook fails with NameError.
df_corr_pearson, df_corr_spearman, pps_matrix = CalculateCorrAndPPS(df)

DisplayCorrAndPPS(
    df_corr_pearson=df_corr_pearson,
    df_corr_spearman=df_corr_spearman,
    pps_matrix=pps_matrix,
    CorrThreshold=0.3,
    PPS_Threshold=0.2,
    figsize=(15, 12),
    font_annot=11,
)

In [None]:
# Spearman correlation of every column with SalePrice, ordered by absolute
# strength. The first entry is skipped (presumably SalePrice's correlation
# with itself) and the next ten are kept.
spearman_vs_price = df.corr(method="spearman")["SalePrice"]
spearman_ranked = spearman_vs_price.sort_values(key=abs, ascending=False)
corr_spearman = spearman_ranked[1:].head(10)
corr_spearman

In [None]:
# Pearson correlation of every column with SalePrice, ordered by absolute
# strength. The first entry is skipped (presumably SalePrice's correlation
# with itself) and the next ten are kept.
pearson_vs_price = df.corr(method="pearson")["SalePrice"]
pearson_ranked = pearson_vs_price.sort_values(key=abs, ascending=False)
corr_pearson = pearson_ranked[1:].head(10)
corr_pearson

In [None]:
# Union of the top_n variables from the Pearson and Spearman rankings.
top_n = 10
{
    *corr_pearson[:top_n].index.to_list(),
    *corr_spearman[:top_n].index.to_list(),
}

In [None]:
# Variables examined against SalePrice in the plots further down.
vars_to_study = [
    "1stFlrSF", "BsmtFinSF1", "GarageArea", "GarageYrBlt",
    "GrLivArea", "LotArea", "MasVnrArea", "OpenPorchSF",
    "OverallQual", "TotalBsmtSF", "YearBuilt", "YearRemodAdd",
]
vars_to_study

In [None]:
# Keep only the studied variables plus the target, then preview ten rows.
eda_columns = vars_to_study + ["SalePrice"]
df_eda = df.filter(items=eda_columns)
df_eda.head(10)

In [None]:
target_var = "SalePrice"
sns.set_theme(style="whitegrid")


def plot_target_hist(df, target_var):
    """
    Plot the distribution of the target column as a histogram with a KDE
    curve, save it to docs/plots/hist_plot_<target_var>.png and show it.
    """
    fig, ax = plt.subplots(figsize=(14, 7))
    sns.histplot(data=df, x=target_var, kde=True, ax=ax)
    ax.set_title(f"Distribution of {target_var}", fontsize=22)
    fig.savefig(f"docs/plots/hist_plot_{target_var}.png", bbox_inches="tight")
    plt.show()


plot_target_hist(df, target_var)

In [None]:
# Year-type columns; these get a line plot in the plotting loop below.
time = [
    "YearBuilt",
    "YearRemodAdd",
    "GarageYrBlt",
]


def plot_lm(df, col, target_var):
    """
    Plot a scatter with a fitted linear regression line of the target
    against one feature, save it to
    docs/plots/lm_plot_price_by_<col>.png and show it.

    Parameters:
        df: DataFrame containing both ``col`` and ``target_var``.
        col: name of the feature on the x axis (also used as the title).
        target_var: name of the target column on the y axis.
    """
    # sns.lmplot is a figure-level function that always creates its own
    # figure, so a preceding plt.figure(figsize=...) only leaves an empty
    # extra figure behind and its size is ignored. The intended 14x5 size is
    # therefore expressed through lmplot's own height/aspect parameters.
    sns.lmplot(
        data=df,
        x=col,
        y=target_var,
        ci=None,
        line_kws={"color": "orange"},
        height=5,
        aspect=14 / 5,
    )
    plt.title(f"{col}", fontsize=20)
    plt.savefig(f"docs/plots/lm_plot_price_by_{col}.png", bbox_inches="tight")
    plt.show()


def plot_line(df, col, target_var):
    """
    Draw a line plot of the target against a time variable, save it to
    docs/plots/line_plot_price_by_<col>.png and show it.
    """
    fig, ax = plt.subplots(figsize=(14, 7))
    sns.lineplot(data=df, x=col, y=target_var, ax=ax)
    ax.set_title(f"{col}", fontsize=20)
    fig.savefig(f"docs/plots/line_plot_price_by_{col}.png", bbox_inches="tight")
    plt.show()


def plot_box(df, col, target_var):
    """
    Draw a box plot of the target for each value of a categorical
    variable, save it to docs/plots/box_plot_price_by_<col>.png and show it.

    Parameters:
        df: DataFrame containing both ``col`` and ``target_var``.
        col: name of the categorical column on the x axis (also the title).
        target_var: name of the target column on the y axis.
    """
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x=col, y=target_var)
    plt.title(f"{col}", fontsize=20)
    # The explicit ".png" suffix matches the other plot helpers and stops
    # the output format depending on matplotlib's savefig.format setting.
    plt.savefig(f"docs/plots/box_plot_price_by_{col}.png", bbox_inches="tight")
    plt.show()


# Choose one plot per variable: a box plot when the column has at most 12
# distinct values, a line plot for the year columns listed in `time`, and a
# regression plot for everything else.
for col in vars_to_study:
    if len(df_eda[col].unique()) <= 12:
        draw_plot = plot_box
    elif col in time:
        draw_plot = plot_line
    else:
        draw_plot = plot_lm
    draw_plot(df_eda, col, target_var)
    print("\n\n")

---

---

## Conclusions and Next Steps

**Summary**
- The interpretations of the correlations and of the plots converge:

    - The year the house was built and the year the garage was built have only a modest effect on the sale price before 1980 but a much stronger effect after 1980: the later the year, the higher the price.
    - Every variable has outliers.
    - In general, size-related variables correlate more strongly with the sale price than the other variables do.

**Next Steps**:
- Feature Engineering Notebook to create and validate new features for modeling.

