In [38]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

## Angelo above here, Ollie below here

In [39]:
# Location of the training set, relative to the notebook's working directory.
train_data_path = "./data/train.csv"

# Load the raw training rows; every later cell works on (and mutates) `df`.
df = pd.read_csv(filepath_or_buffer=train_data_path)

### KNN imputation of all variables, aside from num_supermarkets and binaries

In [40]:
# Fill missing values in the numeric columns with a KNN imputer.
# FLOOR NEEDS TO BE ADDED IN THE BELOW
# NOTE(review): the target `price` is imputed together with the features — confirm this is intended.
vars_to_impute = ['num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes', 'price']
impute_df = df[vars_to_impute]

# NOTE(review): the columns are passed unscaled, so neighbor distances are presumably
# dominated by the largest-magnitude columns — consider scaling before imputing.
imputer = KNNImputer(n_neighbors=7)
imputed_data = imputer.fit_transform(impute_df)

# fit_transform returns a bare array; rebuild the frame with the ORIGINAL index so the
# column-wise write-back below aligns row-for-row even if `df` has a non-default index.
imputed_df = pd.DataFrame(imputed_data, columns=impute_df.columns, index=impute_df.index)

for var in vars_to_impute:
    df[var] = imputed_df[var]

#### Imputing neighborhood based on the number of crimes

In [41]:
# Lookup table: neighborhood name -> mean `num_crimes` over the rows of that neighborhood.
crimes_by_neighborhood = (
    df.groupby('neighborhood')['num_crimes']
    .mean()
    .to_dict()
)

# Function to be used in the imputation process
def find_closest_key(target, dictionary):
    """Return the key of `dictionary` whose value is numerically closest to `target`.

    Ties are resolved in favour of the key encountered first. Returns None when
    no value yields a finite, comparable distance (e.g. an empty dictionary or
    a NaN target).
    """
    best_key, best_gap = None, float('inf')

    for candidate, candidate_value in dictionary.items():
        gap = abs(target - candidate_value)
        # Skip anything that is not a strict improvement (also skips NaN gaps).
        if not gap < best_gap:
            continue
        best_key, best_gap = candidate, gap

    return best_key

# Imputing neighborhood based on number of crimes
def impute_neighborhood(row):
    """Return the row's neighborhood, filling a missing one from its crime count.

    A present neighborhood is returned untouched. A missing one is replaced by
    the key of `crimes_by_neighborhood` whose value is closest to the row's
    `num_crimes`, as chosen by `find_closest_key`.
    """
    crime_count, current_label = row['num_crimes'], row['neighborhood']
    if not pd.isnull(current_label):
        return current_label
    return find_closest_key(crime_count, crimes_by_neighborhood)
    
# Row-wise pass: rows that already have a neighborhood keep it, the rest get the imputed one.
df['neighborhood'] = df.apply(func=impute_neighborhood, axis='columns')

#### Imputing number of supermarkets with mean of the neighborhood

In [42]:
# Per-row mean of `num_supermarkets` within the row's neighborhood (same index as `df`).
neighborhood_means = df.groupby('neighborhood')['num_supermarkets'].transform('mean')

# Assign the filled column back explicitly. Calling fillna(..., inplace=True) on
# `df['num_supermarkets']` is chained assignment on an intermediate Series: it emits a
# warning in pandas 2.x and does not modify `df` at all under Copy-on-Write.
df['num_supermarkets'] = df['num_supermarkets'].fillna(neighborhood_means)

#### Creating floor 1 dummy

In [35]:
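# TODO: disabled because `df` has no 'floor' column yet (running it raised the KeyError shown below).
#df['floor_one_dummy'] = df['floor'].apply(lambda x: True if x==1 else False)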

KeyError: 'floor'

#### Encoding neighborhood

In [43]:
# One-hot encode `neighborhood`: the column is replaced by one `neigh_<name>` column per value.
df = pd.get_dummies(
    data=df,
    prefix=['neigh'],
    columns=['neighborhood'],
)

In [44]:
# Inspect the column names after the neighborhood encoding above.
df.columns

Index(['id', 'num_rooms', 'num_baths', 'square_meters', 'orientation',
       'year_built', 'door', 'is_furnished', 'has_pool', 'num_crimes',
       'has_ac', 'accepts_pets', 'num_supermarkets', 'price',
       'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta',
       'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu',
       'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi'],
      dtype='object')

#### Separating the dfs for training

In [None]:
# Add code to separate dataframes

## Model Training

#### Train model without binaries

In [None]:
# Linear model fitted WITHOUT the binary columns (is_furnished, has_pool, has_ac, accepts_pets).
# NOTE(review): `df_train` is not defined in the visible cells, and 'floor',
# 'neighborhood_crime_encoded' and 'floor_one_dummy' do not appear in the `df.columns`
# output of this notebook — confirm they exist before running this cell.
df_no_binary = df_train[[
    'num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor',
    'num_crimes', 'neighborhood_crime_encoded', 'floor_one_dummy', 'num_supermarkets',
    'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta',
    'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu',
    'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi',
]]

# Design matrix and target.
x_train = df_no_binary
y_train = df_train['price']

model_no_binary = LinearRegression()
model_no_binary.fit(x_train, y_train)

In [46]:
# Re-display the column names of `df` (same check as after the neighborhood encoding).
df.columns

Index(['id', 'num_rooms', 'num_baths', 'square_meters', 'orientation',
       'year_built', 'door', 'is_furnished', 'has_pool', 'num_crimes',
       'has_ac', 'accepts_pets', 'num_supermarkets', 'price',
       'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta',
       'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu',
       'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi'],
      dtype='object')

#### Train model with all variables

In [None]:
# Linear model fitted on every variable, including the four binary columns at the end.
# NOTE(review): `df_train` is not defined in the visible cells, and 'floor',
# 'neighborhood_crime_encoded' and 'floor_one_dummy' do not appear in the `df.columns`
# output of this notebook — confirm they exist before running this cell.
x_train = df_train[[
    'num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor',
    'num_crimes', 'neighborhood_crime_encoded', 'floor_one_dummy', 'num_supermarkets',
    'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta',
    'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu',
    'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi',
    'is_furnished', 'has_pool', 'has_ac', 'accepts_pets',
]]
y_train = df_train['price']

model = LinearRegression()
model.fit(x_train, y_train)

## Prediction

In [None]:
# Score the test set with two models: rows with any binary flag missing go to
# `model_no_binary`, complete rows go to `model`.
# NOTE(review): `df_test` is not defined in the visible cells; it presumably needs the same
# preprocessing as the training data (imputation, neigh_* dummies) — confirm.
binary_cols = ['is_furnished', 'has_pool', 'has_ac', 'accepts_pets']

# The feature lists must be exactly the columns (and order) each model was fitted on in the
# training cells; predicting with a different column set raises an error in scikit-learn.
features_no_binary = [
    'num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor',
    'num_crimes', 'neighborhood_crime_encoded', 'floor_one_dummy', 'num_supermarkets',
    'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta',
    'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu',
    'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi',
]
features_all = features_no_binary + binary_cols

# Subsetting test data for binary variables missing (mask computed once, used for both halves)
has_missing_binary = df_test[binary_cols].isna().any(axis=1)

# Build independent frames (drop() returns a new frame, copy() detaches the slice) so that
# adding 'pred' below never writes into a view of `df_test`.
df_missing = df_test.loc[has_missing_binary].drop(columns=binary_cols)
df_not_missing = df_test.loc[~has_missing_binary].copy()

# Prediction for df_not_missing
x_test = df_not_missing[features_all]
y_pred_not_missing = model.predict(x_test)
df_not_missing['pred'] = y_pred_not_missing

# Prediction for df_missing
x_test = df_missing[features_no_binary]
y_pred_missing = model_no_binary.predict(x_test)
df_missing['pred'] = y_pred_missing

# Creating final DataFrame: ids and predictions, missing-binary rows first, fresh 0..n-1 index
new_df = pd.concat(
    [df_missing[['id', 'pred']], df_not_missing[['id', 'pred']]],
    ignore_index=True,
)
