## Importing Libraries

In [47]:
import pandas as pd
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import LabelEncoder

## Importing data

## Prediction

In [48]:
# Filling data path
# train_data_path = "./data/train.csv"

# # reading CSV file
# df = pd.read_csv(train_data_path)
# df.drop_duplicates(inplace=True)


## Submission
##### Loading the train and test data and concatenating the DataFrames

In [49]:
# Locations of the two input CSV files
train_data_path = "./data/train.csv"
test_data_path = "./data/test.csv"

# Load each file and tag its rows so they can be told apart after stacking
# (train_dummy is 1 for train.csv rows and 0 for test.csv rows)
df_train = pd.read_csv(train_data_path).assign(train_dummy=1)
df_test = pd.read_csv(test_data_path).assign(train_dummy=0)

# Stack both sets, order the rows by id and renumber them.
# reset_index() is called without drop=True, so the previous row labels
# are kept as an extra "index" column in the combined frame.
stacked = pd.concat([df_train, df_test], axis=0)
df = stacked.sort_values("id").reset_index()
print(df.shape)


(10000, 17)


## Data cleaning
- Dropping columns
- Creating Floor 1 Dummy
- Removing outliers from the num_rooms and square_meters

In [50]:
# Dropping orientation: it is hard to impute and about 30% of its values are missing
df.drop('orientation', axis=1, inplace=True)

# Creating floor variable: the text before the first '-' in `door` holds the floor.
# Extract the whole leading number instead of only the first character, so that
# floors with two or more digits (e.g. 10, 11) are not collapsed into floor 1.
df[['floor', 'door_num']] = df['door'].str.split('-', n=1, expand=True)
df['floor'] = pd.to_numeric(df['floor'].str.extract(r'(\d+)', expand=False))

# Dropping door and door_num columns (justify: not influential)
df.drop(['door', 'door_num'], axis=1, inplace=True)

# Replacing the outliers with NaN in the number of rooms (justify cutoff value: outliers are very high above 10).
# .where keeps values below 10 and turns everything else into NaN; existing NaN stays NaN.
df['num_rooms'] = df['num_rooms'].where(df['num_rooms'] < 10)

# Replacing negative square metres with NaN (the cutoff used here is 0; change it and see the results)
df.loc[df['square_meters'] < 0, 'square_meters'] = np.nan

## Standardizing

In [51]:
# Z-score each listed column: subtract its mean and divide by its population
# standard deviation (ddof=0). Missing values are skipped when computing both.
# Note that the target `price` is in the list, so it is rescaled as well.
to_standardize = ['num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes', 'price', 'floor']

for i in to_standardize:
    column_values = df[i]
    center = column_values.mean()
    spread = column_values.std(ddof=0)
    df[i] = (column_values - center) / spread

### KNN imputation of the numeric and binary variables (num_supermarkets and neighborhood are imputed separately below)

In [52]:
# Columns filled jointly by the KNN imputer: the numeric features, the target
# `price`, and the four 0/1 amenity flags
vars_to_impute = [
    'num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes',
    'price', 'floor',
    'has_ac', 'accepts_pets', 'is_furnished', 'has_pool',
]
impute_df = df[vars_to_impute]

# Fit on the selected columns and rebuild a frame with the same column names.
# The new frame gets a default 0..n-1 row index.
imputer = KNNImputer(n_neighbors=7)
imputed_df = pd.DataFrame(imputer.fit_transform(impute_df), columns=impute_df.columns)

# Write the completed columns back; values are matched to df by row label
df = df.assign(**{var: imputed_df[var] for var in vars_to_impute})

#### Imputing neighborhood based on the number of crimes

In [53]:
# Mean num_crimes per neighborhood, stored as a {neighborhood: mean} dict.
# Rows whose neighborhood is missing are not part of any group.
mean_crimes = df.groupby('neighborhood')['num_crimes'].mean()
crimes_by_neighborhood = mean_crimes.to_dict()

# Function to be used in the imputation process
def find_closest_key(target, dictionary):
    """Return the key whose value lies nearest to ``target``.

    Args:
        target: Number to compare against every value in ``dictionary``.
        dictionary: Mapping from candidate keys to numeric values.

    Returns:
        The key with the smallest absolute difference ``|target - value|``.
        When several keys tie, the one that appears first in the mapping wins.
        ``None`` is returned when no difference compares below infinity
        (empty mapping, NaN differences, or infinite differences).
    """
    unbounded = float('inf')

    # Pair every key with its distance to the target, keeping mapping order
    gaps = [(abs(target - value), key) for key, value in dictionary.items()]

    # Only distances strictly below infinity can ever be selected; this also
    # discards NaN, which never compares as smaller than anything
    usable = [(gap, key) for gap, key in gaps if gap < unbounded]
    if not usable:
        return None

    # min() returns the first minimal entry, so earlier keys win ties
    _, nearest_key = min(usable, key=lambda pair: pair[0])
    return nearest_key

# Imputing neighborhood based on number of crimes
def impute_neighborhood(row):
    """Return the neighborhood for one row, filling it in when missing.

    Args:
        row: Mapping-like row exposing ``'num_crimes'`` and ``'neighborhood'``.

    Returns:
        The row's own neighborhood when it is present. Otherwise the key of
        ``crimes_by_neighborhood`` whose value is closest to the row's
        ``num_crimes``, as chosen by ``find_closest_key``.
    """
    crime_level, current = row['num_crimes'], row['neighborhood']

    # Rows that already carry a neighborhood are passed through untouched
    if not pd.isnull(current):
        return current

    return find_closest_key(crime_level, crimes_by_neighborhood)
    
# Run the row-wise imputer over every row and store the completed labels
neighborhood_filled = df.apply(impute_neighborhood, axis=1)
df['neighborhood'] = neighborhood_filled

#### Imputing number of supermarkets with mean of the neighborhood

In [54]:
# Mean num_supermarkets of each row's neighborhood, returned row by row so it
# lines up with df
neighborhood_means = df.groupby('neighborhood')['num_supermarkets'].transform('mean')

# Assign the filled column back explicitly. Calling fillna(inplace=True) on
# df['num_supermarkets'] operates on a column selection: pandas 2.x warns about
# it and under Copy-on-Write it leaves df unchanged.
df['num_supermarkets'] = df['num_supermarkets'].fillna(neighborhood_means)

#### Creating floor 1 dummy

In [55]:
# By this point df['floor'] has been standardized and KNN-imputed, so comparing
# it with 1 no longer identifies first-floor flats (the flag would be False for
# essentially every row). Rebuild the raw floor number from the original `door`
# text instead, using the same row order as df (train + test sorted by id).
raw_doors = pd.concat(
    [df_train[['id', 'door']], df_test[['id', 'door']]], axis=0
).sort_values('id')['door']

# The floor is the number in the text before the first '-'; rows with a missing
# door yield NaN, which compares unequal to 1 and therefore gets False.
raw_floor = pd.to_numeric(
    raw_doors.str.split('-', n=1).str[0].str.extract(r'(\d+)', expand=False)
)

# Positional assignment: a length mismatch with df raises instead of silently misaligning
df['floor_one_dummy'] = (raw_floor == 1).to_numpy()

#### Encoding neighborhood

In [56]:
# One-hot encode neighborhood into columns prefixed with "neigh",
# then renumber the rows from 0 without keeping the old labels
encoded = pd.get_dummies(df, columns=['neighborhood'], prefix=['neigh'])
df = encoded.reset_index(drop=True)

#### Separating the DataFrames for training

In [57]:
#df_train = df[df['train_dummy'] == 1]
#df_test = df[df['train_dummy'] == 0]

## Model Training

#### Train model without binaries

In [58]:
#df_no_binary = df_train[['num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor', 'num_crimes', 'floor_one_dummy', 'num_supermarkets', 'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta', 'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu', 'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi']]

#y_train = df_train['price']
#x_train = df_no_binary

#model_no_binary = LinearRegression()
#model_no_binary.fit(x_train, y_train)

#### Train model with all variables

In [59]:
# Predictors fed to the regression: numeric features, the floor-one flag,
# the neighborhood dummies and the four amenity flags
feature_columns = [
    'num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor',
    'num_crimes', 'floor_one_dummy', 'num_supermarkets',
    'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta',
    'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu',
    'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi',
    'is_furnished', 'has_pool', 'has_ac', 'accepts_pets',
]

# Only rows that came from train.csv (train_dummy == 1) are used for fitting
train_rows = df['train_dummy'] == 1
y_train = df.loc[train_rows, 'price']
x_train = df.loc[train_rows, feature_columns]

model = LinearRegression()
model.fit(x_train, y_train)

## Prediction

In [60]:
# # Subsetting test data for binary variables missing
# binary_cols = ['is_furnished', 'has_pool', 'has_ac', 'accepts_pets']
# df_missing = df_test[df_test[binary_cols].isna().any(axis=1)]
# df_not_missing = df_test[~df_test[binary_cols].isna().any(axis=1)]

# # Drop binaries from df missing
# df_missing.drop(binary_cols, axis=1, inplace=True)

# # Prediction for df_not_missing
# x_test = df_not_missing[['num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor', 'num_crimes', 'neighborhood_crime_encoded', 'is_furnished', 'has_pool', 'num_crimes', 'has_ac', 'accepts_pets', 'floor_one_dummy']]
# y_pred_not_missing = model.predict(x_test)

# df_not_missing['pred'] = y_pred_not_missing

# # Prediction for df_missing
# x_test = df_missing[['num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor', 'num_crimes', 'neighborhood_crime_encoded', 'floor_one_dummy']]
# y_pred_missing = model_no_binary.predict(x_test)

# df_missing['pred'] = y_pred_missing
# new_df = pd.DataFrame()

# # Creating final DataFrame
# new_df['id'] = df_missing['id'].tolist() + df_not_missing['id'].tolist()
# new_df['pred'] = df_missing['pred'].tolist() + df_not_missing['pred'].tolist()


In [61]:
# Same predictors, in the same order, as the ones the model was fitted on
feature_columns = [
    'num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor',
    'num_crimes', 'floor_one_dummy', 'num_supermarkets',
    'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta',
    'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu',
    'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi',
    'is_furnished', 'has_pool', 'has_ac', 'accepts_pets',
]

# Rows that came from test.csv (train_dummy == 0)
test_rows = df['train_dummy'] == 0
x_test = df.loc[test_rows, feature_columns]

# The model was fitted on the z-scored price, so its raw output is in z-score
# units. Recompute the mean and population std (ddof=0) that the
# standardization step used, from the untouched raw frames, and map the
# predictions back to the original price scale.
raw_price = pd.concat([df_train, df_test], axis=0)['price']
price_mean = raw_price.mean()
price_std = raw_price.std(ddof=0)
y_pred = model.predict(x_test) * price_std + price_mean

# Take the ids from the very rows that were predicted. df is sorted by id,
# whereas df_test keeps the file order, so pairing y_pred with df_test['id']
# would mismatch ids and predictions whenever test.csv is not sorted by id.
new_df = pd.DataFrame({
    'id': df.loc[test_rows, 'id'].to_numpy(),
    'pred': y_pred,
})

#### Exporting data

In [62]:
# Write the submission relative to the notebook's working directory rather than
# to an absolute path inside one user's home folder, so the cell runs on any machine
submission_path = "./improved_method.csv"
new_df.to_csv(submission_path, index=False)