### King County Housing Data Analysis

In [None]:
# Notebook outline (each step below has its own section heading):
# Step 1. Import Necessary Libraries
# Step 2. Data Collection and Analysis
# Step 3. Data Preprocessing
# Step 4. Dataset Visualization
# Step 5. Feature Engineering
# Step 6. Standard Scaling

### Step 1. Import Necessary Libraries

In [None]:
# Standard library
import os

# Data Manipulation
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Cleaning
import klib

# Feature Scaling and Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### Step 2. Data Collection and Analysis

In [None]:
# Load the King County house-price dataset.
# The CSV location can be overridden with the KC_HOUSE_DATA environment
# variable so the notebook can run on machines other than the author's.
# When the variable is unset, the original hard-coded path is used unchanged.
DATA_PATH = os.environ.get("KC_HOUSE_DATA", r"C:\Users\Maftuna\Desktop\kc_house_data.csv")
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the dataset (head() shows 5 rows by default)
df.head()

In [None]:
# Preview the last rows of the dataset (tail() shows 5 rows by default)
df.tail()

In [None]:
# Summary statistics for the dataset's columns; the table is displayed
# as the cell output.
df.describe()

In [None]:
# Report the dataset dimensions (rows x columns)
n_rows, n_cols = df.shape
print(f"Dataset contains {n_rows} rows and {n_cols} columns.")


### Step 3. Data Preprocessing

In [None]:
# Show the first 5 rows again before preprocessing starts
df.head()

In [None]:
# Count missing values per column (isna is the same check as isnull)
df.isna().sum()

> No columns have missing values in this dataset.

In [None]:
# Count rows that are exact duplicates of an earlier row
duplicate_mask = df.duplicated()
duplicates_count = duplicate_mask.sum()
print("Duplicate Values:", duplicates_count)

> There are no duplicate rows in this dataset either.

In [None]:
# Report how many distinct values each categorical-like column takes
categorical_columns = ['zipcode', 'waterfront', 'view', 'condition', 'grade']
print("Unique values per categorical column:")
unique_counts = df[categorical_columns].nunique()
for col, n_unique in unique_counts.items():
    print(f"{col}: {n_unique} unique values")

In [None]:
# Convert 'date' to datetime format.
# errors='coerce' turns unparseable values into NaT instead of raising, and
# the missing-value check above ran before this conversion. Count how many
# values were lost in parsing and report them, so they do not silently become
# NaN in the date-derived features.
parsed_dates = pd.to_datetime(df['date'], errors='coerce')
unparsed_count = int(parsed_dates.isna().sum() - df['date'].isna().sum())
if unparsed_count > 0:
    print(f"Warning: {unparsed_count} 'date' values could not be parsed and were set to NaT.")
df['date'] = parsed_dates

In [None]:
# Extract calendar features from the sale 'date'.
# The columns are created in lowercase because the feature-engineering step
# reads df['year'] and the data dictionary lists year/month/day/weekday.
# Naming them that way here removes the dependence on a later cleaning step
# to rename 'Year' -> 'year'.
sale_date = df['date'].dt
df['year'] = sale_date.year
df['month'] = sale_date.month
df['day'] = sale_date.day
df['weekday'] = sale_date.weekday  # Monday=0 ... Sunday=6

In [None]:
# The raw 'date' column is no longer needed once its parts are extracted
df = df.drop(columns=['date'])

In [None]:
# Frequency of each value in the "waterfront" column
print("Waterfront distribution:")
waterfront_counts = df['waterfront'].value_counts()
print(waterfront_counts)

In [None]:
# Frequency of each value in the "view" column
print("View distribution:")
view_counts = df['view'].value_counts()
print(view_counts)

> I was curious about these two features because the distribution didn't seem quite even.
>
> As I discovered, 'waterfront' is overwhelmingly 0 (21,458 entries) compared to just 163 entries with 1.
>
> Likewise, 'view' is mostly 0 (19,489 entries), with relatively few properties rated 1 through 4.
>
> This imbalance could influence how these features impact our analysis or models.



In [None]:
# Column names, non-null counts and dtypes after the date features were added
# and the raw 'date' column was dropped.
df.info()

In [None]:
# Remove the 'id' column; the guard keeps the cell safe to re-run after
# the column is already gone.
if 'id' not in df.columns:
    print("Column 'id' not found. It may have been removed already.")
else:
    df = df.drop('id', axis=1)

In [None]:
# Run klib's automated cleaning pass and keep the cleaned frame as `df`.
# NOTE(review): later cells reference lowercase column names (e.g. 'year'),
# so this step presumably normalizes column names and may also change dtypes
# or drop rows/columns. Confirm against the klib documentation and compare
# the df.info() output before and after.
df=klib.data_cleaning(df)

In [None]:
# Dataset information after klib cleaning. Compare with the df.info() output
# above to see what the cleaning step changed (column names, dtypes, row count).
df.info()

### Step 4. Dataset Visualization

In [None]:
# 1. Distribution of House Prices (histogram with a KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='price', bins=50, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of House Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# 2. Correlation Heatmap for Numerical Features
# Select numeric/boolean columns explicitly before computing correlations.
# DataFrame.corr() on the whole frame raises in pandas >= 2.0 if any
# non-numeric column (object, category, datetime) is present. The explicit
# selection keeps this cell working if such a column is added or retained.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_df.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Heatmap of Numerical Features')
plt.show()

In [None]:
# 3. Scatter Plot: Price vs. Living Area (sqft_living)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='sqft_living', y='price', alpha=0.6, ax=ax)
ax.set_title('Price vs. Living Area')
ax.set_xlabel('Living Area (sqft)')
ax.set_ylabel('Price')
plt.show()

In [None]:
# 4. Boxplot to Detect Outliers in House Prices
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='price', color='green', ax=ax)
ax.set_title('Boxplot of House Prices')
ax.set_xlabel('Price')
plt.show()

In [None]:
# Compare price distributions for waterfront and non-waterfront properties
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='waterfront', y='price', ax=ax)
ax.set_title("Price Comparison Between Waterfront and Non-Waterfront Properties")
ax.set_xlabel("Waterfront (0 = No, 1 = Yes)")
ax.set_ylabel("Price")
plt.show()


In [None]:
# Compare price distributions across zipcodes (one box per zipcode)
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=df, x='zipcode', y='price', ax=ax)
ax.tick_params(axis='x', rotation=90)  # the many zipcode labels need vertical text
ax.set_title("House Prices by Zipcode")
ax.set_xlabel("Zipcode")
ax.set_ylabel("Price")
plt.show()

### Step 5. Feature Engineering

In [None]:
# Look at the first 5 rows before engineering new features
df.head()

In [None]:
# Display the column names of the DataFrame as they stand after cleaning.
# The feature-engineering cells below index columns by these exact names.
df.columns

In [None]:
# Data dictionary for the King County housing dataset.
# Items 1-21 are the original CSV columns; items 22-25 were derived from 'date'
# during preprocessing. Note that 'id' and 'date' are listed for reference only:
# both were dropped from `df` in Step 3.
#
# 1. id: Unique identifier for each house (dropped during preprocessing).
# 2. date: The date on which the house was sold (dropped after extracting year/month/day/weekday).
# 3. price: The sale price of the house (in USD).
# 4. bedrooms : Number of bedrooms in the house.
# 5. bathrooms: Number of bathrooms in the house (can include half-baths).
# 6. sqft_living: Interior living space in square feet.
# 7. sqft_lot: Total lot size (land area) in square feet.
# 8. floors: Number of floors (levels) in the house.
# 9. waterfront: Indicates if the house has a waterfront (0 = no, 1 = yes).
# 10. view: Quality of the view from the house (0–4).
# 11. condition: Overall condition of the house (1–5).
# 12. grade: Overall grade of the house (based on King County grading system, 1–13).
# 13. sqft_above: Square footage of the house above ground level (no basement).
# 14. sqft_basement: Square footage of the basement.
# 15. yr_built: The year the house was originally built.
# 16. yr_renovated: The year the house was last renovated (0 if never renovated).
# 17. zipcode: The ZIP code where the house is located.
# 18. lat:  Latitude coordinate of the house’s location.
# 19. long:  Longitude coordinate of the house’s location.
# 20. sqft_living15: Average interior living space (in sqft) of homes in the same neighborhood (based on 15 nearest neighbors).
# 21. sqft_lot15: Average lot size (in sqft) of homes in the same neighborhood (based on 15 nearest neighbors).
# 22. year: Extracted sale year from the date column.
# 23. month: Extracted sale month from the date column (1–12).
# 24. day: Extracted sale day from the date column (1–31).
# 25. weekday: Extracted weekday from the date column (e.g., Monday=0, Sunday=6).

In [None]:
# New feature: age of the house at the time of sale (sale year minus build year)
df['house_age'] = df['year'].sub(df['yr_built'])

In [None]:
# New feature: sale price divided by interior living area
# NOTE(review): this is derived from 'price'. If 'price' is later used as a
# model target, this column would leak it. Confirm the intended use.
df['price_per_sqft'] = df['price'].div(df['sqft_living'])


In [None]:
# Dataset information after feature engineering; 'house_age' and
# 'price_per_sqft' should now appear among the columns.
df.info()

### Step 6. Standard Scaling

In [None]:
# Standardize the size and location columns with StandardScaler
features_to_scale = [
    'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[features_to_scale])
df[features_to_scale] = scaled_values


In [None]:
# Show the first 5 rows to inspect the scaled columns
df.head()