### Importing Libraries


In [316]:
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import KNNImputer
# The experimental flag must be imported BEFORE IterativeImputer, otherwise the import below fails
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

## Importing data

### Prediction

In [317]:
# Location of the training CSV
train_data_path = "./data/train.csv"

# Load the training set and discard exact duplicate rows in one step
df = pd.read_csv(train_data_path).drop_duplicates()
# df.columns
# df.describe()

### Submission
##### Splitting feature and target data and concatenating the Dataframes

In [318]:
# # Filling data path
# train_data_path = "./data/train.csv"
# test_data_path = "./data/test.csv"

# # reading CSV file
# df_train = pd.read_csv(train_data_path)
# df_test = pd.read_csv(test_data_path)
# df = pd.concat([df_train, df_test], axis=0).sort_values("id").reset_index()



### Descriptives

In [319]:
# printing descriptives

# print(df.info())
# print(df.describe())

#### Data cleaning
- Dropping columns
- Creating Floor 1 Dummy
- Removing outliers from the num_rooms and square_meters

In [320]:
# Cut-offs used below to treat implausible values as missing
MAX_ROOMS = 10           # num_rooms >= MAX_ROOMS is considered an outlier
MIN_SQUARE_METERS = 30   # square_meters below this is considered an outlier

# Dropping supermarkets number
df = df.drop(columns=['num_supermarkets'])

# # Dropping orientation (argue that it is hard to impute and has ~30% missing data)
# df.drop('orientation', axis=1, inplace=True)

# Creating floor variable: the floor is the run of leading digits in the part of
# `door` before the first '-'. Extracting ALL leading digits (instead of only the
# first character) keeps multi-digit floors such as 10 intact; rows without a
# leading number (or with a missing door) end up as NaN.
floor_text = df['door'].str.split('-', n=1).str[0]
df['floor'] = pd.to_numeric(floor_text.str.extract(r'^\s*(\d+)', expand=False))

# Feature engineering - dummy for floor 1 (NaN floors compare False)
df['floor_one_dummy'] = df['floor'] == 1

# Dropping the raw door column (justify: not influential once floor is extracted)
df = df.drop(columns=['door'])

# Replacing the outliers with NaN in the number of rooms
# (justify cutoff value: values of MAX_ROOMS or more are far above the rest)
df['num_rooms'] = df['num_rooms'].where(df['num_rooms'] < MAX_ROOMS)

# Replacing square metres below MIN_SQUARE_METERS with NaN
# (change the cutoff value and see the results)
df['square_meters'] = df['square_meters'].mask(df['square_meters'] < MIN_SQUARE_METERS)

## Standardizing

In [321]:
# Z-score the continuous columns: subtract the mean, divide by the population std (ddof=0)
to_standardize = ['square_meters', 'year_built', 'num_crimes']

for column in to_standardize:
    centered = df[column] - df[column].mean()
    df[column] = centered / df[column].std(ddof=0)

### Encoding Orientation

In [322]:
# Correct the misspelled label 'soxth' -> 'south' in the orientation column

df['orientation'] = df['orientation'].replace('soxth', 'south')
# Binary flag: 1 when the orientation is 'south', 0 for everything else (missing included)
df['orientation_encoded'] = (df['orientation'] == 'south').astype('int64')


#### Imputing square meters with the mean by neighborhood

In [323]:
# Lookup table: neighborhood -> mean square_meters of that neighborhood
neighborhood_sqm_means = df.groupby('neighborhood')['square_meters'].mean().to_dict()

def impute_square_meters(row):
    """Return the row's square_meters, or its neighborhood mean when the value is missing.

    Rows whose neighborhood has no entry in ``neighborhood_sqm_means`` stay NaN.
    """
    neighborhood = row['neighborhood']
    current_value = row['square_meters']
    if not pd.isnull(current_value):
        return current_value
    return neighborhood_sqm_means.get(neighborhood, np.nan)

df['square_meters'] = df.apply(impute_square_meters, axis=1)

#### Imputing num crimes with the mean by neighborhood

In [324]:
# Lookup table: neighborhood -> mean num_crimes of that neighborhood
neighborhood_crimes_means = df.groupby('neighborhood')['num_crimes'].mean().to_dict()

def impute_num_crimes(row):
    """Return the row's num_crimes, or its neighborhood mean when the value is missing.

    Rows whose neighborhood has no entry in ``neighborhood_crimes_means`` stay NaN.
    """
    neighborhood = row['neighborhood']
    num_crimes = row['num_crimes']
    needs_fill = pd.isnull(num_crimes)
    return neighborhood_crimes_means.get(neighborhood, np.nan) if needs_fill else num_crimes

df['num_crimes'] = df.apply(impute_num_crimes, axis=1)

### Imputing Neighborhood


#### Label Encoding neighborhood

In [325]:
label_encoder = LabelEncoder()

# Filling NaN with a placeholder value for encoding purposes.
# Assign the result back instead of using a chained `inplace=True`: the chained
# form operates on a temporary Series and is a no-op under pandas Copy-on-Write.
df['neighborhood'] = df['neighborhood'].fillna('NaN_Value')

# Encoding the neighborhood values
df['neighborhood_encoded'] = label_encoder.fit_transform(df['neighborhood'])

# Replacing the placeholder's code with NaN again. Masking on the placeholder label
# (rather than looking its code up through the encoder) also works when no
# neighborhood was missing; the column is float either way so it can hold NaN.
df['neighborhood_encoded'] = (
    df['neighborhood_encoded'].astype(float).where(df['neighborhood'] != 'NaN_Value')
)

### Imputing neighborhood with KNN

In [326]:
# Columns handed to the KNN imputer (values are filled from the 5 nearest rows)
# NOTE(review): neighborhood_encoded holds label codes, so averaging neighbours can
# produce non-integer codes — confirm this is intended.
knn_columns = ['neighborhood_encoded', 'square_meters', 'num_crimes']
impute_df = df[knn_columns]

# KNN imputer: 5 neighbours, equal weights, NaN-aware Euclidean distance
imputer = KNNImputer(n_neighbors=5, weights="uniform", metric="nan_euclidean")

# Learn from and fill the selected columns, then write the result back into df
imputed_data = imputer.fit_transform(impute_df)
df[knn_columns] = imputed_data

In [327]:
# Second KNN pass over the same three columns.
# NOTE(review): KNNImputer(n_neighbors=5) relies on the library defaults for weights
# and metric, so this looks like a repeat of the previous imputation — confirm it is needed.
impute_df = df.loc[:, ['neighborhood_encoded', 'square_meters', 'num_crimes']]

imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(impute_df)
df[list(impute_df.columns)] = imputed_data

### Target Encoding Neighborhood

#### Encoding using crimes
Not sure about this: the boxplot of `num_crimes` grouped by neighborhood shows several outliers, and those outliers are still present after running the outlier-replacement function below.

In [328]:
# df.boxplot(column='num_crimes', by='neighborhood')

# Represent each neighborhood by the mean num_crimes of its rows
neighb_mean_crime = df['num_crimes'].groupby(df['neighborhood']).mean()
df['neighborhood_crime_encoded'] = df['neighborhood'].map(neighb_mean_crime)

def replace_outliers_with_nan(group):
    """Return a copy of ``group`` with values beyond mean ± 3·std replaced by NaN.

    Parameters
    ----------
    group : pandas.Series
        Numeric values of a single group (e.g. one neighborhood).

    Returns
    -------
    pandas.Series
        Same index as ``group``; values outside the inclusive interval
        [mean - 3*std, mean + 3*std] become NaN. The input is not modified.

    Notes
    -----
    When the standard deviation is undefined (fewer than two non-missing values)
    no value can be called an outlier, so the group is returned unchanged instead
    of being wiped out by comparisons against NaN bounds.
    """
    group_mean = group.mean()
    group_std = group.std()
    if pd.isna(group_std):
        return group.copy()
    lower_bound = group_mean - 3 * group_std
    upper_bound = group_mean + 3 * group_std
    # `where` builds a new Series, so the caller's data is never mutated
    return group.where(group.between(lower_bound, upper_bound))

# Step 1: blank out per-neighborhood outliers in num_crimes
crimes_by_neighborhood = df.groupby('neighborhood')['num_crimes']
df['num_crimes'] = crimes_by_neighborhood.transform(replace_outliers_with_nan)
# Step 2: refill the blanks with the mean of the remaining values in the same neighborhood
df['num_crimes'] = (
    df.groupby('neighborhood')['num_crimes']
    .transform(lambda crimes: crimes.fillna(crimes.mean()))
)


#### Encoding using square meters

In [329]:
# df.boxplot(column='square_meters', by='neighborhood')
# Represent each neighborhood by the mean square_meters of its rows
neighb_mean_sqm = df['square_meters'].groupby(df['neighborhood']).mean()
df['neighborhood_sqm_encoded'] = df['neighborhood'].map(neighb_mean_sqm)

### Imputations

In [330]:
# Count the missing entries in every column and print the tally
missing_per_column = df.isna().sum()
print(missing_per_column)

id                               0
num_rooms                       46
num_baths                      160
square_meters                    0
orientation                   8000
year_built                     170
is_furnished                   165
has_pool                       156
neighborhood                     0
num_crimes                       0
has_ac                         169
accepts_pets                   155
price                            0
floor                          149
floor_one_dummy                  0
orientation_encoded              0
neighborhood_encoded             0
neighborhood_crime_encoded       0
neighborhood_sqm_encoded         0
dtype: int64


### Imputing Floor with the median

In [331]:
# Median of the observed floors (missing values are skipped by Series.median)
median_floor = df['floor'].median()

# Keep observed floors, put the median wherever the floor is missing
df['floor'] = df['floor'].where(df['floor'].notna(), median_floor)

#### Imputing num_rooms and num_baths

In [332]:
# knn_cols = ['num_rooms','num_baths', 'square_meters']
# df_sub = df[knn_cols]
# # Imputing with KNN
# imputer = KNNImputer(n_neighbors=3)
# imputed_data = imputer.fit_transform(df_sub)
# df_sub = pd.DataFrame(imputed_data, columns=df_sub.columns)
# # Putting the imputed columns back in the original df
# df = df.reset_index(drop=True)
# df = df.drop(knn_cols, axis=1)
# df[knn_cols] = df_sub[knn_cols]

In [349]:
cols_to_impute = ['num_rooms', 'num_baths', 'square_meters','year_built', 'num_crimes', 'floor', 'neighborhood_crime_encoded']
pred_cols = ['num_rooms', 'num_baths', 'square_meters','year_built', 'num_crimes', 'floor', 'neighborhood_crime_encoded']
def reg_imputer(df, columns_to_impute: list, predictor_columns: list):
    """Fill missing values in ``columns_to_impute`` with regression-based iterative imputation.

    An ``IterativeImputer`` backed by ``LinearRegression`` is fitted on the union of
    ``predictor_columns`` and ``columns_to_impute`` (predictors first, duplicates
    removed, order preserved), and only ``columns_to_impute`` are written back.
    Fitting and transforming the same column set keeps the imputer's features
    aligned even when the two lists differ.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding all referenced columns. It is modified in place.
    columns_to_impute : list
        Columns whose missing values should be replaced.
    predictor_columns : list
        Columns made available to the imputer as predictors.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``columns_to_impute`` imputed.
    """
    # Union of both lists without duplicates; dict keys preserve insertion order
    model_columns = list(dict.fromkeys([*predictor_columns, *columns_to_impute]))
    model_df = df[model_columns]

    imputer = IterativeImputer(estimator=LinearRegression())
    imputer.fit(model_df)
    imputed_df = pd.DataFrame(imputer.transform(model_df), columns=model_columns, index=df.index)

    # Positional write-back (as plain arrays) so index alignment cannot reshuffle rows
    df[columns_to_impute] = imputed_df[columns_to_impute].to_numpy()

    return df

df = reg_imputer(df, cols_to_impute, pred_cols)

## Feature Engineering

In [355]:
# Derived features: bathrooms-to-rooms ratio and a quadratic term for the room count
df['baths_per_room'] = df['num_baths'].div(df['num_rooms'])
df['rooms_squared'] = df['num_rooms'].pow(2)

### Correlation matrix

In [335]:
# plt.figure(figsize=(8, 6))
# sns.heatmap(df[['num_rooms', 'num_baths', 'square_meters', 'year_built', 'is_furnished', 'has_pool', 'num_crimes', 'has_ac', 'accepts_pets', 'price', 'floor']].corr(), annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
# plt.title("Correlation Matrix")
# plt.show()

## Regression

In [357]:
features = ['num_rooms', 'num_baths', 'square_meters', 'floor', 'num_crimes', 'neighborhood_crime_encoded', 'floor_one_dummy','year_built', 'baths_per_room']
target = 'price'

# Fixed seed so that Restart & Run All reproduces the same split and regression summary
SPLIT_SEED = 42

# Regression output: hold out 20% of the rows, fit on the remaining 80%
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=0.2, random_state=SPLIT_SEED
)


# Convert the data to appropriate types (statsmodels needs numeric input; the bool dummy becomes 0.0/1.0)
X_train = X_train.astype(float)
y_train = y_train.astype(float)

# Add a constant term to the features so the model estimates an intercept
X_train = sm.add_constant(X_train)

# Fit the OLS model
model = sm.OLS(y_train, X_train)
results = model.fit()

# Print the summary of the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.592
Model:                            OLS   Adj. R-squared:                  0.592
Method:                 Least Squares   F-statistic:                     1032.
Date:                Sun, 29 Oct 2023   Prob (F-statistic):               0.00
Time:                        19:49:03   Log-Likelihood:                -42073.
No. Observations:                6400   AIC:                         8.417e+04
Df Residuals:                    6390   BIC:                         8.423e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               

## Prediction

In [358]:
features = ['num_rooms', 'num_baths', 'square_meters', 'floor', 'num_crimes', 'neighborhood_crime_encoded', 'floor_one_dummy', 'year_built', 'baths_per_room']
target = ['price']

def prediction_accuracy(df, features, target, num_of_predictions=2000, test_size=0.2, random_state=None):
    """Average test MSE of a linear regression over repeated random train/test splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature and target columns.
    features : list
        Names of the predictor columns.
    target : list or str
        Name(s) of the target column(s).
    num_of_predictions : int, default 2000
        Number of random splits to evaluate.
    test_size : float, default 0.2
        Share of rows held out for testing in each split.
    random_state : int or None, default None
        Seed for the sequence of splits. ``None`` keeps the splits unseeded,
        so the result varies from run to run.

    Returns
    -------
    float
        Mean of the per-split mean squared errors.

    Raises
    ------
    ValueError
        If ``num_of_predictions`` is smaller than 1.
    """
    if num_of_predictions < 1:
        raise ValueError("num_of_predictions must be at least 1")

    # One generator shared by all iterations: every split differs, the sequence is repeatable
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Column selection does not change between iterations, so do it once
    X = df[features]
    y = df[target]

    mse_list = []
    for _ in range(num_of_predictions):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=rng)

        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        mse_list.append(mean_squared_error(y_test, y_pred))
    return sum(mse_list) / len(mse_list)

# Seeded so the reported average MSE is the same on every run
print(prediction_accuracy(df, features, target, random_state=42))

29829.222859150927


## Submission

In [338]:
# # Split dataframes again to train model
# df_test = df[df['price'].isna()]
# df_train = df[~df['price'].isna()]

# # Features and target variables
# features = ['num_rooms', 'num_baths','square_meters', 'floor', 'num_crimes', 'neighborhood_crime_encoded','neighborhood_sqm_encoded', 'floor_one_dummy']
# target = ['price']

# # Model training
# model = LinearRegression()
# model.fit(df_train[features], df_train[target])

# # Making Prediction
# y_test = model.predict(df_test[features])
# df_test['pred'] = y_test

# new_df = pd.DataFrame()
# new_df['id'] = df_test['id']
# new_df['pred'] = df_test['pred']

# new_df.to_csv('./check_mse_coherence_should_decrease.csv', index=False)

