In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:

# Data cleaning cell: load the sales CSV, drop sparse columns, impute the
# remaining missing values, remove IQR outliers, and print a short report.

file_path = "kingco_sales.csv"
df = pd.read_csv(file_path)

# Null count per column, captured before any cleaning so it can be reported below.
missing_values = df.isnull().sum()

# Keep only columns that have non-null values in at least half of the rows.
# `thresh` is documented as an int, so round the half-way mark up; for integer
# counts `count >= ceil(x)` selects exactly the same columns as `count >= x`.
threshold = int(np.ceil(len(df) * 0.5))
df = df.dropna(thresh=threshold, axis=1)

# Impute what is left: median for float64/int64 columns, most frequent value
# for everything else. The fill values are collected first and applied with a
# single DataFrame.fillna call whose result is reassigned. The previous
# `df[column].fillna(..., inplace=True)` operated on a column selection, which
# pandas warns about and which does not update `df` under Copy-on-Write.
fill_values = {}
for column in df.columns:
    if df[column].isnull().sum() > 0:
        if df[column].dtype in [np.float64, np.int64]:
            fill_values[column] = df[column].median()
        else:
            fill_values[column] = df[column].mode()[0]

if fill_values:
    df = df.fillna(value=fill_values)

numerical_features = ['sale_price', 'sqft', 'year_built']
outlier_summary = {}

# IQR rule (1.5 * IQR beyond the quartiles). `df` is reassigned inside the
# loop, so each feature's bounds are computed on the rows that survived the
# filters of the features before it.
for feature in numerical_features:
    if feature in df.columns:
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]
        outlier_summary[feature] = len(outliers)

        # Keep only the rows inside [lower_bound, upper_bound].
        df = df[(df[feature] >= lower_bound) & (df[feature] <= upper_bound)]


final_shape = df.shape

print("Missing values per column before handling:\n", missing_values)
print("\nNumber of outliers removed per feature:\n", outlier_summary)
print("\nDataset shape after cleaning:", final_shape)



Missing values per column before handling:
 Unnamed: 0    0
sale_id       0
sale_price    0
join_year     0
city          0
land_val      0
imp_val       0
year_built    0
sqft_lot      0
sqft          0
sqft_1        0
stories       0
beds          0
bath_full     0
bath_3qtr     0
bath_half     0
dtype: int64

Number of outliers removed per feature:
 {'sale_price': 37742, 'sqft': 10490, 'year_built': 0}

Dataset shape after cleaning: (543281, 16)


In [3]:

# Feature engineering cell: derive `house_age` and show its summary statistics.
# NOTE: `df` is re-read from the CSV here, so this cell works on the file's
# contents as stored on disk rather than on a frame built by an earlier cell.
df = pd.read_csv("kingco_sales.csv")

# house_age = join_year - year_built; both columns must be present.
has_year_columns = {'join_year', 'year_built'}.issubset(df.columns)

if not has_year_columns:
    print("Required columns 'join_year' and 'year_built' are not present in the dataset.")
else:
    df = df.assign(house_age=df['join_year'] - df['year_built'])
    house_age_stats = df['house_age'].describe()
    print("'house_age' feature created successfully. Summary statistics:")
    print(house_age_stats)


'house_age' feature created successfully. Summary statistics:
count    591513.000000
mean         49.286279
std          29.337557
min           0.000000
25%          24.000000
50%          46.000000
75%          70.000000
max         125.000000
Name: house_age, dtype: float64


In [4]:

from sklearn.model_selection import train_test_split

# Read the sales data from disk.
df = pd.read_csv("kingco_sales.csv")

# Perform the 80/20 split: 20% of the rows are held out for testing, and the
# fixed random_state makes the shuffle repeatable between runs.
split_options = dict(test_size=0.2, random_state=42)
train_df, test_df = train_test_split(df, **split_options)

# Report how many rows ended up in each partition.
print(f"Training set size: {len(train_df)} rows")
print(f"Testing set size: {len(test_df)} rows")


Training set size: 473210 rows
Testing set size: 118303 rows


In [5]:


# Modeling cell: fit a linear regression of sale_price on a handful of
# features and report R-squared and RMSE on a 20% hold-out set.
df = pd.read_csv("kingco_sales.csv")

# Engineered feature: house_age = join_year - year_built.
df = df.assign(house_age=df['join_year'] - df['year_built'])

# Predictor columns and target.
features = ['sqft', 'beds', 'bath_full', 'bath_half', 'house_age']
X, y = df[features], df['sale_price']

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows (fit() returns the estimator itself), then predict
# the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics: R-squared and the square root of the mean squared error.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Linear Regression Model Performance:")
print(f"R-squared: {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")




Linear Regression Model Performance:
R-squared: 0.2101
Root Mean Squared Error (RMSE): 496,496.33


In [7]:
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Plotting cell: refit the linear model on the CSV data and export a scatter
# plot of actual versus predicted sale prices for the 20% hold-out set.
df = pd.read_csv("kingco_sales.csv")
df = df.assign(house_age=df['join_year'] - df['year_built'])

features = ['sqft', 'beds', 'bath_full', 'bath_half', 'house_age']
X, y = df[features], df['sale_price']

# Same split parameters (test_size=0.2, random_state=42) as used for the fit.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# One row per held-out sale: the observed price next to the model's prediction.
scatter_columns = {
    'Actual Price': y_test,
    'Predicted Price': y_pred
}
scatter_df = pd.DataFrame(scatter_columns)

# Axis captions shown on the figure in place of the raw column names.
axis_labels = {'Actual Price': 'Actual Sale Price', 'Predicted Price': 'Predicted Sale Price'}

fig = px.scatter(
    scatter_df,
    x='Actual Price',
    y='Predicted Price',
    title='Actual vs Predicted House Prices',
    labels=axis_labels,
    trendline='ols',
)

# Persist the figure as a static image and as a JSON figure description.
fig.write_image("actual_vs_predicted.png")
fig.write_json("actual_vs_predicted.json")
