### Importing Libraries


In [258]:
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
import numpy as np

## Importing data

### Prediction

In [259]:
# Location of the training CSV, relative to the notebook's working directory
train_data_path = "./data/train.csv"

# Load the file and discard fully duplicated rows in one chained expression
df = pd.read_csv(train_data_path).drop_duplicates()
# df.columns
# df.describe()

### Submission
##### Splitting feature and target data and concatenating the Dataframes

In [260]:
# # Filling data path
# train_data_path = "./data/train.csv"
# test_data_path = "./data/test.csv"

# # reading CSV file
# df_train = pd.read_csv(train_data_path)
# df_test = pd.read_csv(test_data_path)
# df = pd.concat([df_train, df_test], axis=0).sort_values("id").reset_index()



### Descriptives

In [261]:
# printing descriptives

# print(df.info())
# print(df.describe())

#### Data cleaning
- Dropping columns
- Creating Floor 1 Dummy
- Removing outliers from the num_rooms and square_meters

In [262]:
# Cut-offs below which / above which a value is treated as implausible and blanked out
MAX_PLAUSIBLE_ROOMS = 10   # num_rooms must be strictly below this to be kept
MIN_PLAUSIBLE_SQM = 30     # square_meters strictly below this is discarded

# Drop the supermarkets count and the orientation column
# (author's rationale for orientation: hard to impute and about 30% of it is missing)
df = df.drop(columns=['num_supermarkets', 'orientation'])

# Floor = the leading run of digits of `door`.
# Taking only the first character would truncate a two-digit floor to its first digit
# and wrongly flag it as floor 1, so the whole digit run is extracted instead.
# Rows where `door` is missing or does not start with digits get NaN.
df['floor'] = pd.to_numeric(df['door'].str.extract(r'^\s*([0-9]+)', expand=False))

# Feature engineering - dummy for floor 1 (a missing floor yields False)
df['floor_one_dummy'] = df['floor'].eq(1)

# `door` is no longer needed once the floor has been extracted
# (author's rationale: the door number itself is not influential)
df = df.drop(columns=['door'])

# Replace implausible room counts with NaN; NaN inputs stay NaN
df['num_rooms'] = df['num_rooms'].where(df['num_rooms'] < MAX_PLAUSIBLE_ROOMS)

# Replace square_meters below the cut-off with NaN
# (the cut-off is a tunable choice: change it and compare the results)
df.loc[df['square_meters'] < MIN_PLAUSIBLE_SQM, 'square_meters'] = np.nan

## Standardizing

In [263]:
# Z-score scaling: centre every listed column on its mean and divide by its
# population standard deviation (ddof=0)
to_standardize = ['square_meters', 'year_built', 'num_crimes']

for column in to_standardize:
    values = df[column]
    df[column] = (values - values.mean()) / values.std(ddof=0)

### Encoding Neighborhood

#### Encoding using crimes
Not sure about this: the boxplot of num_crimes grouped by neighborhood shows several outliers, and those outliers are still there after running the function.

In [264]:
# df.boxplot(column='num_crimes', by='neighborhood')

# Encode each neighborhood as the mean num_crimes of its rows
# (computed on the current num_crimes values, i.e. before the outlier handling further down)
crimes_per_neighborhood = df.groupby('neighborhood')['num_crimes']
neighb_mean_crime = crimes_per_neighborhood.mean()
df['neighborhood_crime_encoded'] = df['neighborhood'].map(neighb_mean_crime)

def replace_outliers_with_nan(group):
    """Return a copy of ``group`` with values outside mean ± 3·std replaced by NaN.

    Parameters
    ----------
    group : pandas.Series
        Numeric values of a single group (e.g. as handed over by
        ``groupby(...).transform``).

    Returns
    -------
    pandas.Series
        New Series with the same index as ``group``. The input itself is
        never modified.
    """
    group_mean = group.mean()
    group_std = group.std()
    # With fewer than two valid values the sample std is NaN, so no bounds can be
    # formed; keep the values instead of blanking the whole group.
    if pd.isna(group_std):
        return group.copy()
    lower_bound = group_mean - 3 * group_std
    upper_bound = group_mean + 3 * group_std
    # `where` builds a new Series rather than assigning into the object received
    # from the caller; values that are already NaN stay NaN.
    return group.where(group.between(lower_bound, upper_bound))

# Step 1: within each neighborhood, blank out the num_crimes values flagged as outliers
# NOTE(review): groupby excludes rows whose neighborhood is NaN by default, so those rows
# presumably come back as NaN in num_crimes here - confirm this is intended
df['num_crimes'] = (
    df.groupby('neighborhood')['num_crimes']
    .transform(replace_outliers_with_nan)
)
# Step 2: fill the gaps of each neighborhood with that neighborhood's mean
df['num_crimes'] = (
    df.groupby('neighborhood')['num_crimes']
    .transform(lambda crimes: crimes.fillna(crimes.mean()))
)


#### Encoding using square meters

In [265]:
# df.boxplot(column='square_meters', by='neighborhood')
# Encode each neighborhood as the mean square_meters of its rows
sqm_per_neighborhood = df.groupby('neighborhood')['square_meters']
neighb_mean_sqm = sqm_per_neighborhood.mean()
df['neighborhood_sqm_encoded'] = df['neighborhood'].map(neighb_mean_sqm)

### Checking for rows with multiple null values and dropping them

In [266]:
# # Count rows with two null values
# cols = ['square_meters', 'neighborhood']
# df1 = df[cols]
# display(df1)
# null_count = df1[df1.isnull().sum(axis=1) >= 2].shape[0]
# print(null_count)
# df1 = df1[df1.isnull().sum(axis=1) < 2]
# df[cols] = df1[cols]


### Imputations

In [267]:
# Sanity check before imputing: number of missing values in every column
print(df.isna().sum())

index                            0
id                               0
num_rooms                      291
num_baths                      200
square_meters                  298
year_built                     200
is_furnished                   200
has_pool                       200
neighborhood                   200
num_crimes                     200
has_ac                         200
accepts_pets                   200
price                         2000
floor                          200
floor_one_dummy                  0
neighborhood_crime_encoded     200
neighborhood_sqm_encoded       200
dtype: int64


#### Imputing square meters with the mean by neighborhood

In [268]:
# Lookup table {neighborhood: mean square_meters}, read by impute_square_meters below
# to fill a missing square_meters with the mean of the row's neighborhood (if possible)
neighborhood_sqm_means = df.groupby('neighborhood')['square_meters'].mean().to_dict()

def impute_square_meters(row, neighborhood_means=None):
    """Return the row's square_meters, falling back to its neighborhood mean.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'neighborhood'`` and ``'square_meters'``
        (a DataFrame row when used with ``df.apply(..., axis=1)``).
    neighborhood_means : mapping, optional
        ``{neighborhood: mean square_meters}``. When omitted, the notebook-level
        ``neighborhood_sqm_means`` table is used, so existing one-argument
        calls keep working.

    Returns
    -------
    float
        The original value when it is present; otherwise the neighborhood's
        mean, or NaN when the neighborhood has no entry in the table.
    """
    if neighborhood_means is None:
        # Backward-compatible default: the lookup table built at notebook level
        neighborhood_means = neighborhood_sqm_means

    neighborhood = row['neighborhood']
    square_meters = row['square_meters']

    # Nothing to impute when the value is already there
    if not pd.isnull(square_meters):
        return square_meters
    return neighborhood_means.get(neighborhood, np.nan)

# Run the imputation row by row (axis=1 hands each row to impute_square_meters)
df['square_meters'] = df.apply(impute_square_meters, axis=1)

#### Imputing the remaining num_crimes gaps with the overall mean

In [269]:
# Any num_crimes value that is still missing gets the mean over the whole column
overall_crime_mean = df['num_crimes'].mean()
df['num_crimes'] = df['num_crimes'].fillna(overall_crime_mean)

#### Imputing all the rest

In [270]:
# Columns whose remaining gaps are filled by nearest-neighbour imputation
knn_cols = ['num_rooms','num_baths', 'square_meters', 'floor', 'neighborhood_crime_encoded', 'neighborhood_sqm_encoded']

# Impute with KNN (n_neighbors=3)
imputer = KNNImputer(n_neighbors=3)
imputed_data = imputer.fit_transform(df[knn_cols])
df_sub = pd.DataFrame(imputed_data, columns=knn_cols)

# Swap the imputed columns into df: the index is reset to 0..n-1 so that rows line up
# with df_sub, and the imputed columns end up as the last columns of df
df = pd.concat([df.reset_index(drop=True).drop(knn_cols, axis=1), df_sub], axis=1)

### Correlation matrix

In [271]:
# plt.figure(figsize=(8, 6))
# sns.heatmap(df[['num_rooms', 'num_baths', 'square_meters', 'year_built', 'is_furnished', 'has_pool', 'num_crimes', 'has_ac', 'accepts_pets', 'price', 'floor']].corr(), annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
# plt.title("Correlation Matrix")
# plt.show()

## Prediction

In [272]:
# Predictor columns fed to the model
features = ['num_rooms', 'num_baths','square_meters', 'floor', 'num_crimes', 'neighborhood_crime_encoded','neighborhood_sqm_encoded', 'floor_one_dummy']
# Column to predict (kept as a one-element list, so df[target] is a DataFrame)
target = ['price']

def prediction_accuracy(df, features, target, num_of_predictions=3000, random_state=None):
    """Estimate the test error of a linear regression over repeated random splits.

    Despite its name, the function returns an error measure (mean squared
    error), so lower values are better.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both the feature and the target columns.
    features : list of str
        Names of the predictor columns.
    target : list of str
        Name(s) of the column(s) to predict.
    num_of_predictions : int, default 3000
        Number of independent 80/20 train/test splits to average over.
    random_state : int or None, default None
        Seed for the sequence of splits. With ``None`` the splits are not
        seeded, as before, so repeated calls give different results.

    Returns
    -------
    float
        Mean of the test-set MSE over all splits.

    Raises
    ------
    ValueError
        If ``num_of_predictions`` is smaller than 1.
    """
    if num_of_predictions < 1:
        raise ValueError("num_of_predictions must be at least 1")

    # One generator shared by all iterations: every split is different,
    # yet the whole run is repeatable when a seed is given
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Column selection does not change between iterations, so do it once
    X, y = df[features], df[target]

    mse_list = []
    for _ in range(num_of_predictions):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rng)

        model = LinearRegression()
        model.fit(X_train, y_train)

        mse_list.append(mean_squared_error(y_test, model.predict(X_test)))
    return sum(mse_list) / len(mse_list)

# Average test MSE over the repeated random splits (lower is better)
print(prediction_accuracy(df, features, target))

## Submission

In [273]:
# # Split dataframes again to train model
# df_test = df[df['price'].isna()]
# df_train = df[~df['price'].isna()]

# # Features and target variables
# features = ['num_rooms', 'num_baths','square_meters', 'floor', 'num_crimes', 'neighborhood_crime_encoded','neighborhood_sqm_encoded', 'floor_one_dummy']
# target = ['price']

# # Model training
# model = LinearRegression()
# model.fit(df_train[features], df_train[target])

# # Making Prediction
# y_test = model.predict(df_test[features])
# df_test['pred'] = y_test

# new_df = pd.DataFrame()
# new_df['id'] = df_test['id']
# new_df['pred'] = df_test['pred']

# new_df.to_csv('./check_mse_coherence_should_decrease.csv', index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['pred'] = y_test
