In [205]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.graphics.gofplots import qqplot
from numpy.linalg import inv
from joblib import dump, load
from sklearn.impute import SimpleImputer

# *1. Baseline Model*

### Retrieve Data

In [206]:
df_baseline = pd.read_csv('../data/processed/scraped_apartment_sales_baseline.csv')
df = pd.read_csv('../data/processed/scraped_apartment_sales_processed.csv')

df_baseline = df_baseline.drop('Unnamed: 0', axis=1)
df = df.drop('Unnamed: 0', axis=1)
df = df.drop(['broker', 'brokerage_firm', 'region', 'floor_number'], axis=1)


df_baseline.head()

Unnamed: 0,number_of_rooms,area_size,has_elevator,year_built,annual_fee_sek,annual_cost_sek,region_processed_bromma,region_processed_centrala sundbyberg,region_processed_gröndal,region_processed_hammarby sjöstad,...,region_processed_östermalm,has_balcony_nej,has_balcony_unknown,cleaned_floor_number,brokerage_firm_processed_erik olsson fastighetsförmedling,brokerage_firm_processed_fastighetsbyrån stockholm - södermalm,brokerage_firm_processed_länsförsäkringar fastighetsförmedling solna,brokerage_firm_processed_notar,brokerage_firm_processed_tradition mäkleri,price_sold_sek
0,3.0,74,False,1953.0,4899,9000,False,False,False,False,...,False,False,False,2.0,False,False,False,False,False,4400000
1,1.0,25,True,1961.0,2265,3120,False,False,False,False,...,False,False,True,4.0,False,False,False,False,False,1015000
2,1.0,40,True,1955.0,3250,4500,False,False,False,False,...,False,False,False,2.0,False,False,False,False,False,1395000
3,1.0,31,True,1963.0,1290,6804,False,False,False,False,...,False,False,False,1.0,False,False,False,True,False,3150000
4,1.0,40,True,1972.0,2947,4200,False,False,False,False,...,False,False,False,1.0,False,False,False,False,False,1750000


In [207]:
X = df_baseline.drop('price_sold_sek', axis=1)
y = df_baseline['price_sold_sek']

### Split Data

In [208]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

### Load Baseline Model and Predict

In [209]:
baseline_model = load('../models/baseline_model.joblib')

In [210]:
baseline_predictions = baseline_model.predict(X_test)

In [211]:
# Various performance metrics
mse = mean_squared_error(y_test, baseline_predictions)
rmse = mean_squared_error(y_test, baseline_predictions, squared=False)
mae = mean_absolute_error(y_test, baseline_predictions)
r2 = r2_score(y_test, baseline_predictions)

print('Baseline results:')
print(f"MSE: {mse} \nRMSE: {rmse} \nMAE: {mae} \nR-squared: {r2}")

Baseline results:
MSE: 3404845047266.328 
RMSE: 1845222.2216487445 
MAE: 1363035.2040760939 
R-squared: 0.5441301212408783


# *2. Impute Missing Data*

In [212]:
df.columns

Index(['price_sold_sek', 'number_of_rooms', 'area_size', 'has_elevator',
       'year_built', 'annual_fee_sek', 'annual_cost_sek',
       'region_processed_bromma', 'region_processed_centrala sundbyberg',
       'region_processed_gröndal', 'region_processed_hammarby sjöstad',
       'region_processed_kungsholmen', 'region_processed_råsunda',
       'region_processed_södermalm', 'region_processed_vasastan',
       'region_processed_årsta', 'region_processed_östermalm',
       'has_balcony_nej', 'has_balcony_unknown', 'cleaned_floor_number',
       'brokerage_firm_processed_erik olsson fastighetsförmedling',
       'brokerage_firm_processed_fastighetsbyrån stockholm - södermalm',
       'brokerage_firm_processed_länsförsäkringar fastighetsförmedling solna',
       'brokerage_firm_processed_notar',
       'brokerage_firm_processed_tradition mäkleri'],
      dtype='object')

In [213]:
df.isna().sum()

price_sold_sek                                                            0
number_of_rooms                                                           0
area_size                                                                 0
has_elevator                                                              0
year_built                                                                0
annual_fee_sek                                                            0
annual_cost_sek                                                           0
region_processed_bromma                                                   0
region_processed_centrala sundbyberg                                      0
region_processed_gröndal                                                  0
region_processed_hammarby sjöstad                                         0
region_processed_kungsholmen                                              0
region_processed_råsunda                                                  0
region_proce

In [214]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

class PredictionBasedImputer:
    """
    A class for imputing missing values in a specified column of a DataFrame using predictive modeling.

    This class uses a linear regression model to predict and impute missing values in the target column.
    It assumes that other columns in the DataFrame can be used as features for the regression model.
    Data leakage can occur if the imputer is not used correctly. Specifically, if the model is trained 
    on data that includes the test set, or if the imputation strategy is learned from the entire dataset 
    rather than just the training set.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    target_col (str): The name of the column in the DataFrame for which missing values need to be imputed.

    Methods:
    train_model: Trains the linear regression model using available data.
    impute: Imputes missing values in the target column using the trained model.
    """
    def __init__(self, df, target_col):
        self.df = df
        self.target_col = target_col
        self.model = LinearRegression()

    def train_model(self):
        # Ensure there are missing values in target_col
        if not self.df[self.target_col].isnull().any():
            raise ValueError(f"No missing values found in target column '{self.target_col}'.")

        df_with_target = self.df[self.df[self.target_col].notnull()]
        df_missing_target = self.df[self.df[self.target_col].isnull()]

        X = df_with_target.drop(columns=[self.target_col])
        y = df_with_target[self.target_col]

        # Check if X is empty
        if X.empty:
            raise ValueError("No data available to train the model.")

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
        self.model.fit(X_train, y_train)

        return df_missing_target.drop(columns=[self.target_col])

    def impute(self):
        df_missing_target = self.train_model()

        # Check if df_missing_target is empty
        if df_missing_target.empty:
            print("No rows with missing target values to impute.")
            return self.df

        predicted_values = self.model.predict(df_missing_target)
        self.df.loc[self.df[self.target_col].isnull(), self.target_col] = predicted_values

        return self.df

# Example usage
imputer = PredictionBasedImputer(df, 'cleaned_floor_number')
df = imputer.impute()


# *5. Process Outliers*

In [215]:
import pandas as pd
import numpy as np

def quantile_based_flooring_and_outlier_tracking(df_or_series, lwr=5, upr=95):
    """
    Apply quantile based flooring to outliers in a DataFrame or Series.
    Outliers are identified using the IQR method and replaced using quantile flooring.
    Skips boolean and non-numeric columns. Tracks and prints the changes made.

    Parameters:
    df_or_series (pd.DataFrame or pd.Series): DataFrame or Series to process.
    lwr (int): Lower percentile for flooring.
    upr (int): Upper percentile for flooring.
    """
    def quantile_based_flooring(series, lwr, upr):
        if series.empty:
            return series  # Return empty series as is

        tenth_percentile = np.percentile(series, lwr)
        ninetieth_percentile = np.percentile(series, upr)
        output = np.where(series < tenth_percentile, tenth_percentile, series)
        output = np.where(output > ninetieth_percentile, ninetieth_percentile, output)
        return output

    changes = {}  # Dictionary to track changes

    if isinstance(df_or_series, pd.Series):
        df_or_series = pd.DataFrame(df_or_series)

    for column in df_or_series.columns:
        # Skip non-numeric and boolean columns
        if pd.api.types.is_numeric_dtype(df_or_series[column]) and not pd.api.types.is_bool_dtype(df_or_series[column]):
            series = df_or_series[column].copy()

            # Calculate Q1 (25th percentile) and Q3 (75th percentile)
            Q1 = series.quantile(0.25)
            Q3 = series.quantile(0.75)
            IQR = Q3 - Q1

            # Define bounds for outliers
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            # Identify outliers
            outliers = series[(series < lower_bound) | (series > upper_bound)]
            
            # Skip if no outliers are found
            if outliers.empty:
                continue

            flooring_values = quantile_based_flooring(outliers, lwr, upr)

            # Track changes
            for idx, val in zip(outliers.index, flooring_values):
                if series.loc[idx] != val:
                    changes.setdefault(column, []).append((idx, series.loc[idx], val))
                    series.loc[idx] = val

            # Update the DataFrame or Series
            df_or_series[column] = series

    # Display changes
    for column, column_changes in changes.items():
        print(f"Column: {column}")
        for change in column_changes:
            print(f"Index: {change[0]}, Original: {change[1]}, New: {change[2]}")

# Example usage
quantile_based_flooring_and_outlier_tracking(df)

Column: price_sold_sek
Index: 103, Original: 23700000, New: 20000000.0
Index: 427, Original: 21900000, New: 20000000.0
Index: 473, Original: 9200000, New: 9300000.0
Index: 923, Original: 9200000, New: 9300000.0
Index: 1094, Original: 23300000, New: 20000000.0
Index: 1223, Original: 25600000, New: 20000000.0
Index: 1553, Original: 9250000, New: 9300000.0
Index: 1912, Original: 9225000, New: 9300000.0
Index: 2069, Original: 26500000, New: 20000000.0
Column: area_size
Index: 73, Original: 1523, New: 1032.5
Index: 104, Original: 215.0, New: 287.0
Index: 107, Original: 215.0, New: 287.0
Index: 215, Original: 1191.0, New: 1032.5
Index: 275, Original: 1045.0, New: 1032.5
Index: 427, Original: 180.0, New: 287.0
Index: 465, Original: 1065.0, New: 1032.5
Index: 514, Original: 1085.0, New: 1032.5
Index: 542, Original: 180.0, New: 287.0
Index: 640, Original: 1388.0, New: 1032.5
Index: 703, Original: 1095.0, New: 1032.5
Index: 708, Original: 1165.0, New: 1032.5
Index: 726, Original: 1283.0, New: 10

  series.loc[idx] = val
  series.loc[idx] = val
  series.loc[idx] = val


### Train Model

In [216]:
X = df.drop(['price_sold_sek'], axis=1)
y = df[['price_sold_sek']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [217]:
model = LinearRegression()

# Train your model with the transformed y_train
model.fit(X_train, y_train)

# Make predictions (these predictions will be in the log scale)
predictions = model.predict(X_test)

In [218]:
# Various performance metrics
mse = mean_squared_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print('Results after dealing with missing values and outliers')
print(f"MSE: {mse} \nRMSE: {rmse} \nMAE: {mae} \nR-squared: {r2}")

Results after dealing with missing values and outliers
MSE: 3132782480432.863 
RMSE: 1769966.8020708363 
MAE: 1308955.0947863453 
R-squared: 0.5103089509917705


# *6. Data Transformations*

### Log Transformation of Dependent Variables

In [219]:
# Perform Log transformation
y_train_log = np.log1p(y_train)

#### Train Model

In [220]:
model = LinearRegression()

# Train your model with the transformed y_train
model.fit(X_train, y_train_log)

# Make predictions (these predictions will be in the log scale)
log_predictions = model.predict(X_test)

# Transform predictions back to original scale
predictions = np.expm1(log_predictions)

In [221]:
# Various performance metrics
mse = mean_squared_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print('Log transformation results:')
print(f"MSE: {mse} \nRMSE: {rmse} \nMAE: {mae} \nR-squared: {r2}")

Log transformation results:
MSE: 2836678595029.619 
RMSE: 1684244.220720267 
MAE: 1154643.4889733258 
R-squared: 0.5565934993650403


- Log transformation had a slight performance improvement in all metrics

### Feature Scaling

In [222]:
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Transform (scale) both the training and test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Now, you can use the scaled data for model training and predictions
model.fit(X_train_scaled, y_train_log)

# Make predictions (these predictions will be in the log scale)
log_predictions = model.predict(X_test_scaled)  # Corrected to use X_test_scaled

# Transform predictions back to original scale
predictions = np.expm1(log_predictions)

#### Train Model

In [223]:
# Various performance metrics
mse = mean_squared_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"MSE: {mse} \nRMSE: {rmse} \nMAE: {mae} \nR-squared: {r2}")

MSE: 2836678595029.5957 
RMSE: 1684244.22072026 
MAE: 1154643.4889733247 
R-squared: 0.556593499365044


# *7. Feature Engineering*

In [224]:
df

Unnamed: 0,price_sold_sek,number_of_rooms,area_size,has_elevator,year_built,annual_fee_sek,annual_cost_sek,region_processed_bromma,region_processed_centrala sundbyberg,region_processed_gröndal,...,region_processed_årsta,region_processed_östermalm,has_balcony_nej,has_balcony_unknown,cleaned_floor_number,brokerage_firm_processed_erik olsson fastighetsförmedling,brokerage_firm_processed_fastighetsbyrån stockholm - södermalm,brokerage_firm_processed_länsförsäkringar fastighetsförmedling solna,brokerage_firm_processed_notar,brokerage_firm_processed_tradition mäkleri
0,4400000,3.0,74.0,False,1953.0,4899.0,9000.0,False,False,False,...,False,False,False,False,2.000000,False,False,False,False,False
1,1015000,1.0,25.0,True,1961.0,2265.0,3120.0,False,False,False,...,False,False,False,True,4.000000,False,False,False,False,False
2,7450000,3.0,725.0,False,2023.0,4428.0,0.0,False,False,False,...,False,False,False,False,3.884680,False,False,False,False,False
3,1395000,1.0,40.0,True,1955.0,3250.0,4500.0,False,False,False,...,False,False,False,False,2.000000,False,False,False,False,False
4,3150000,1.0,31.0,True,1963.0,1290.0,6804.0,False,False,False,...,False,False,False,False,1.000000,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2111,2580000,3.0,77.0,True,2016.0,4240.0,3600.0,False,False,False,...,False,False,False,False,3.000000,False,False,False,False,False
2112,3150000,2.0,56.0,True,1962.0,2446.0,6100.0,False,False,False,...,False,False,False,False,3.000000,False,False,False,False,False
2113,11000000,5.0,114.0,False,1962.0,7131.0,6200.0,False,False,False,...,False,False,False,False,3.271598,False,False,False,False,False
2114,8400000,2.0,64.0,False,1908.0,1739.0,0.0,False,False,False,...,False,True,False,False,2.000000,False,False,False,False,False


#### Relative Floor

In [225]:
# Define a function to process each floor_number value
def process_floor_number(value):
    try:
        split_value = value.split(' ')
        if len(split_value) == 3 and split_value[2].isdigit():
            numerator = int(split_value[0])
            denominator = int(split_value[2])
            return numerator / denominator if denominator != 0 else 0
        elif value.isdigit():
            return float(value)
        else:
            return None
    except (ValueError, AttributeError):
        return None

# Apply this function to each value in the floor_number column
df['relative_floor'] = df['floor_number'].apply(process_floor_number)

df['relative_floor'].head()

KeyError: 'floor_number'

#### Municipality

In [None]:
def extract_municipality(region_value):
    if 'kommun' in region_value:
        split_value = region_value.split(',')
        return split_value[1].strip() if len(split_value) > 1 else region_value.strip()
    else:
        return region_value.strip()

# Apply this function to the region column
df['municipality'] = df['region'].apply(extract_municipality)

# Identify the top 10 municipalities
top_10_municipalities = df['municipality'].value_counts().nlargest(10).index.tolist()

# Define a new function to assign 'other' to municipalities not in the top 10
def extract_municipality_top10(region_value):
    municipality = extract_municipality(region_value)
    return municipality if municipality in top_10_municipalities else 'other'

# Apply the modified function
df['municipality'] = df['region'].apply(extract_municipality_top10)

# Check the value counts of the new column
print(df['municipality'].value_counts())


municipality
other                  1491
södermalm               132
vasastan                120
kungsholmen             116
östermalm                53
bromma                   40
årsta                    37
hammarby sjöstad         35
råsunda                  34
centrala sundbyberg      31
gröndal                  27
Name: count, dtype: int64


In [None]:
df = pd.get_dummies(df, columns=['municipality'])
df.head()

Unnamed: 0,region,price_sold_sek,number_of_rooms,area_size,floor_number,has_elevator,year_built,annual_fee_sek,annual_cost_sek,broker,...,municipality_centrala sundbyberg,municipality_gröndal,municipality_hammarby sjöstad,municipality_kungsholmen,municipality_other,municipality_råsunda,municipality_södermalm,municipality_vasastan,municipality_årsta,municipality_östermalm
0,bagarmossen,4400000,3.0,74,2 av 3,False,1953.0,4899,9000,maria lundberg,...,False,False,False,False,True,False,False,False,False,False
1,åby,1015000,1.0,25,4 av 4,True,1961.0,2265,3120,caroline östlund,...,False,False,False,False,True,False,False,False,False,False
2,vasastan / hagastaden,7450000,3.0,725,unknown,False,2023.0,4428,0,marianna maatouk,...,False,False,False,False,True,False,False,False,False,False
3,hässelby gård,1395000,1.0,40,2 av 3,True,1955.0,3250,4500,martina hultkrantz,...,False,False,False,False,True,False,False,False,False,False
4,södermalm,3150000,1.0,31,1,True,1963.0,1290,6804,sanna treijs,...,False,False,False,False,False,False,True,False,False,False


### Train Model

In [None]:
df_model = df.copy()

df_model.fillna(0, inplace=True)


# Directly create X by dropping unnecessary columns
# This step combines both the creation of df_no_missing_data and X from your code
X = df_model.drop([
  'price_sold_sek', 
  'broker', 
  'brokerage_firm', 
  'region', 
  'floor_number', 
  'cleaned_floor_number'
  ], axis=1)


y = df_model[['price_sold_sek']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [None]:
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Transform (scale) both the training and test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Now, you can use the scaled data for model training and predictions
model.fit(X_train_scaled, y_train_log)

# Make predictions (these predictions will be in the log scale)
log_predictions = model.predict(X_test_scaled)  # Corrected to use X_test_scaled

# Transform predictions back to original scale
predictions = np.expm1(log_predictions)

ValueError: Found input variables with inconsistent numbers of samples: [1587, 1419]

In [None]:
# Various performance metrics
mse = mean_squared_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"MSE: {mse} \nRMSE: {rmse} \nMAE: {mae} \nR-squared: {r2}")

# *8. Export Loaded Data & Model*