# Utils Usage for Regression Tasks

In [1]:
import os
import sys
from pathlib import Path

# Resolve the project root exactly once so this cell is safe to re-run.
# The root is taken to be the folder that contains the `utils` package
# (imported further down). If the current working directory already holds
# `utils/`, the cell has run before (or the kernel was started at the root),
# so stay put; otherwise fall back to the parent directory, as before.
# NOTE(review): assumes the notebook sits one level below the project root
# and that its own folder has no `utils/` sub-folder — confirm.
PROJECT_ROOT = Path.cwd() if (Path.cwd() / 'utils').is_dir() else Path.cwd().parent

# Add root folder to path (for module imports): keep it first so it wins
# import resolution, and drop earlier copies so re-runs do not pile up entries
sys.path[:] = [str(PROJECT_ROOT)] + [p for p in sys.path if p != str(PROJECT_ROOT)]

# Change working directory (for I/O operations)
os.chdir(PROJECT_ROOT)

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import utils scripts
# NOTE(review): wildcard import — presumably the source of
# `check_data_information` called later in this notebook. Prefer explicit
# names once the full set of helpers the notebook relies on is confirmed.
from utils.preprocessing import *

In [3]:
# Load the regression dataset (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='data/regression_data.csv')

# Show the first five rows as a quick sanity check
display(df.head(n=5))

Unnamed: 0,property_id,sqft_living,sqft_lot,bedrooms,bathrooms,floors,year_built,year_renovated,condition,grade,...,city,zipcode,latitude,longitude,distance_to_downtown,school_rating,crime_rate,has_garage,sqft_basement,price
0,PROP_00352,2074.974764,12666.739468,3,2.5,2.0,2023.0,0.0,3,6.0,...,Other,98157,47.312479,-121.844119,5.396556,9.2,0.611506,Yes,608.974799,661229.682478
1,PROP_00689,1207.077982,7376.062644,2,2.0,2.0,1968.0,1996.0,2,7.0,...,Seattle,98100,47.733808,-122.010631,23.112477,4.2,5.565608,TRUE,0.0,583489.046289
2,PROP_00485,3314.232769,2414.299701,2,4.0,3.0,1920.0,0.0,3,6.0,...,Other,98162,47.319195,-122.238151,8.284677,3.5,8.725351,0,636.10598,759121.224769
3,PROP_00388,2388.756396,10702.211271,2,2.0,2.0,2007.0,2020.0,3,8.0,...,Seattle,98196,47.11192,-121.573947,3.453021,8.6,3.260463,Yes,0.0,
4,PROP_00031,1186.961252,5917.575635,1,2.0,2.0,1996.0,0.0,1,9.0,...,Seattle,98045,47.283425,-122.111706,1.565135,6.5,0.751626,1,0.0,603385.593403


In [4]:
# Build the per-feature information table over every column of the frame
all_columns = df.columns.to_list()
info_df = check_data_information(df, all_columns)

# Render the resulting table
display(info_df)

Unnamed: 0,Feature,Data Type,Null Values,Null Percentage,Duplicated Values,Unique Values,Unique Sample
0,property_id,object,0,0.0,50,1000,"PROP_00352, PROP_00689, PROP_00485, PROP_00388..."
1,sqft_living,float64,92,8.76,50,913,"2074.9747640549976, 1207.0779821460708, 3314.2..."
2,sqft_lot,float64,85,8.1,50,919,"12666.73946770685, 7376.06264433528, 2414.2997..."
3,bedrooms,int64,0,0.0,50,13,"3, 2, 1, 7, 6"
4,bathrooms,float64,0,0.0,50,9,"2.5, 2.0, 4.0, 3.0, 1.5"
5,floors,float64,0,0.0,50,5,"2.0, 3.0, 1.0, 2.5, 1.5"
6,year_built,float64,81,7.71,50,104,"2023.0, 1968.0, 1920.0, 2007.0, 1996.0"
7,year_renovated,float64,83,7.9,50,44,"0.0, 1996.0, 2020.0, 1994.0, 2019.0"
8,condition,int64,0,0.0,50,5,"3, 2, 1, 4, 5"
9,grade,float64,90,8.57,50,13,"6.0, 7.0, 8.0, 9.0, 11.0"
