## Importing Libraries

In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error

## Importing data

In [3]:
# Filling data path
train_data_path = "./data/train.csv"
test_data_path = "./data/test.csv"

# reading CSV file
df_train = pd.read_csv(train_data_path)
df_test = pd.read_csv(test_data_path)

# Creating column to differentiate df_test and df_train
df_train['train_dummy'] = 1
df_test['train_dummy'] = 0

df = pd.concat([df_train, df_test], axis=0).sort_values("id").reset_index()
print(df.shape)


(10000, 17)


## Data cleaning
- Dropping columns
- Creating Floor 1 Dummy
- Removing outliers from the num_rooms and square_meters

In [4]:
# Dropping orientation (argue saying that this is hardly inputer and has a 30% of missing data) 
df.drop('orientation', axis=1, inplace=True)

# Creating floor variable
df[['floor', 'door_num']] = df['door'].str.split('-', n=1, expand=True)
df['floor'] = df['floor'].str[0]
df["floor"] = pd.to_numeric(df["floor"])

# Dropping door and door_num columns (justify: not influential)
df.drop(['door', 'door_num'], axis=1, inplace=True)

# Replacing the outliers with NaN in the number of rooms (justify cutoff value: outliers are very high above 10)
df['num_rooms'] = df['num_rooms'].apply(lambda x: x if x<10 else np.nan)

# Replacing the values of square metres < 40 with NaN (change the cutoff value and see the results)
df.loc[df['square_meters'] < 0, 'square_meters'] = np.nan

## Standardizing and imputing

In [5]:
# Standardization
def imputing_missing(impute_df, vars_to_impute):
    standardizer = StandardScaler()
    impute_df[vars_to_impute] = standardizer.fit_transform(df[vars_to_impute])
    imputer = KNNImputer(n_neighbors=7)
    impute_df[vars_to_impute] = imputer.fit_transform(df[vars_to_impute])
    impute_df[vars_to_impute] = standardizer.inverse_transform(df[vars_to_impute])
    return impute_df

vars_to_impute = ['num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes', 'floor', 'has_ac', 'accepts_pets', 'is_furnished', 'has_pool']
df = imputing_missing(df, vars_to_impute)

In [6]:
# Re-standardising variables
to_standardize = ['num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes', 'floor']

for i in to_standardize:
    df[i] = (df[i] - np.mean(df[i])) / np.std(df[i])

### KNN all variables, aside from num_supermarkets and binaries

In [7]:
# vars_to_impute = ['num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes', 'price', 'floor', 'has_ac', 'accepts_pets', 'is_furnished', 'has_pool']
# impute_df = df[vars_to_impute]

# imputer = KNNImputer(n_neighbors=7)
# imputed_data = imputer.fit_transform(impute_df)
# imputed_df = pd.DataFrame(imputed_data, columns = impute_df.columns)

# for var in vars_to_impute:
#     df[var] = imputed_df[var]

#### Imputing neighborhood based on the number of crimes

In [8]:
crimes_by_neighborhood = df.groupby('neighborhood')['num_crimes'].mean().to_dict()

# Function to be used in the imputation process
def find_closest_key(target, dictionary):
    closest_key = None
    closest_difference = float('inf')

    for key, value in dictionary.items():
        difference = abs(target - value)
        if difference < closest_difference:
            closest_key = key
            closest_difference = difference

    return closest_key

# Imputing neighborhood based on number of crimes
def impute_neighborhood(row):
    crimes = row['num_crimes']
    neighborhood = row['neighborhood']
    if pd.isnull(neighborhood):
        return find_closest_key(crimes, crimes_by_neighborhood)
    else:
        return neighborhood
    
df['neighborhood'] = df.apply(impute_neighborhood, axis=1)

#### Imputing number of supermarkets with mean of the neighborhood

In [9]:
neighborhood_means = df.groupby('neighborhood')['num_supermarkets'].mean()
df['num_supermarkets'] = df['neighborhood'].map(neighborhood_means)

#### Creating floor 1 dummy

In [10]:
df['floor_one_dummy'] = df['floor'].apply(lambda x: True if x==1 else False)

#### Encoding neighborhood

In [11]:
df = pd.get_dummies(df, columns=['neighborhood'], prefix=['neigh']).reset_index(drop=True)

#### Seperating the dfs for training and testing

In [14]:
# Seperating test and train
X_train = df[df['train_dummy'] == 1][['num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor', 'num_crimes', 'floor_one_dummy', 'num_supermarkets', 'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta', 'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu', 'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi', 'is_furnished', 'has_pool', 'has_ac', 'accepts_pets']]
y_train = df[df['train_dummy'] == 1][['price']]

X_test = df[df['train_dummy'] == 0][['num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor', 'num_crimes', 'floor_one_dummy', 'num_supermarkets', 'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta', 'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu', 'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi', 'is_furnished', 'has_pool', 'has_ac', 'accepts_pets']]

In [13]:
# Creating function for cross validation

def prediction_accuracy(X_train, y_train, model):
    mse_list = []
    num_of_predictions = 1000
    for i in range (num_of_predictions):
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X_train, y_train, test_size= 0.2)

        model.fit(X_train_temp, y_train_temp)

        y_pred = model.predict(X_test_temp)
        mse = mean_squared_error(y_test_temp, y_pred)
        mse_list.append(mse)
    return sum(mse_list) / len(mse_list)


## Model Training

In [15]:
# Test linear regression model
lin_model = LinearRegression()
print(prediction_accuracy(X_train, y_train, lin_model))

29978.155461549693


In [None]:
# Testing LASSO for best alpha

# Lasso Regression
# Define a range of alphas to try
alphas = np.logspace(-10, 0, 50)

# Initialize variables to store the best alpha and minimum MSE
best_alpha_lasso = None
min_mse_lasso = float('inf')

for alpha in alphas:
    # Fit Lasso model
    lasso = Lasso(alpha=alpha)
    mse_lasso = prediction_accuracy(X_train, y_train, lasso)

    # Update best alpha and minimum MSE for Lasso
    if mse_lasso < min_mse_lasso:
        min_mse_lasso = mse_lasso
        best_alpha_lasso = alpha

print('Min MSE lasso ' + str(min_mse_lasso))
print('Optimal_alpha' + str(best_alpha_lasso))

## Prediction

In [17]:
# Linear regression prediction
y_pred = lin_model.predict(X_test)

new_df = pd.DataFrame()
new_df['id'] = df[df['train_dummy'] == 0]['id']
new_df['pred'] = y_pred

In [19]:
# Impletmenting LASSO with optimal alpha
lasso = Lasso(alpha=best_alpha_lasso)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

new_df = pd.DataFrame()
new_df['id'] = df[df['train_dummy'] == 0]['id']
new_df['pred'] = y_pred

  model = cd_fast.enet_coordinate_descent(


#### Exporting data

In [18]:
new_df.to_csv('C:/Users/gatla/OneDrive/BSE/Computational_machine_learning/Project_1/neighborhood_target_encoded.csv', index=False)