## Importing Libraries

In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

## Importing data

## Prediction

In [2]:
# Filling data path
# train_data_path = "./data/train.csv"

# # reading CSV file
# df = pd.read_csv(train_data_path)
# df.drop_duplicates(inplace=True)


## Submission
##### Loading the train and test sets and concatenating them into a single DataFrame

In [3]:
# Locations of the two input files
train_data_path = "./data/train.csv"
test_data_path = "./data/test.csv"

# Load each CSV and tag its rows so they can be told apart after stacking:
# train_dummy is 1 for rows from the training file and 0 for rows from the test file
df_train = pd.read_csv(train_data_path).assign(train_dummy=1)
df_test = pd.read_csv(test_data_path).assign(train_dummy=0)

# Stack both sets, order the rows by id and renumber them.
# reset_index() is called without drop=True, so the previous row labels
# are kept as an extra `index` column in the combined frame.
df = (
    pd.concat([df_train, df_test])
    .sort_values("id")
    .reset_index()
)
print(df.shape)


(10000, 17)


## Data cleaning
- Dropping columns
- Creating the floor variable (the floor 1 dummy is built later, after imputation)
- Removing outliers from the num_rooms and square_meters

In [4]:
# Drop `orientation` (rationale noted by the author: hard to impute and roughly 30% missing)
df.drop(columns='orientation', inplace=True)

# Derive `floor` from `door`: split once on '-', keep the first character of the
# left-hand part and convert it to a number.
# NOTE(review): only the first character is kept, so a two-digit floor would be
# truncated to its leading digit - confirm the data never contains one.
door_parts = df['door'].str.split('-', n=1, expand=True)
df[['floor', 'door_num']] = door_parts
df['floor'] = pd.to_numeric(df['floor'].str[0])

# `door` and the door number are not used as features, so remove both
df.drop(columns=['door', 'door_num'], inplace=True)

# Treat room counts of 10 or more as outliers: keep values below 10, set the rest to NaN
df['num_rooms'] = df['num_rooms'].where(df['num_rooms'] < 10)

# Treat negative surface areas as missing (the cutoff actually applied is 0)
df.loc[df['square_meters'].lt(0), 'square_meters'] = np.nan

## Standardizing and imputing

In [5]:
# Display the combined frame after the cleaning cell (no `orientation`/`door`, new `floor` column)
df

Unnamed: 0,index,id,num_rooms,num_baths,square_meters,year_built,is_furnished,has_pool,neighborhood,num_crimes,has_ac,accepts_pets,num_supermarkets,price,train_dummy,floor
0,597,1,3.0,3.0,126.0,2009.0,True,True,Sant Martí,9.0,True,True,,,0,6.0
1,6521,2,4.0,3.0,154.0,2008.0,True,True,Horta,0.0,True,True,,1348.0,1,2.0
2,3404,3,1.0,3.0,140.0,2000.0,False,True,Sants,6.0,,False,,1285.0,1,4.0
3,481,4,3.0,2.0,50.0,1984.0,False,False,Sant Andreu,0.0,False,False,,,0,1.0
4,7628,5,3.0,3.0,106.0,1959.0,True,True,Gràcia,1.0,True,True,,1156.0,1,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3983,9996,4.0,3.0,51.0,1972.0,False,False,Nou Barris,3.0,False,True,2.0,875.0,1,4.0
9996,1028,9997,3.0,3.0,134.0,1994.0,False,False,Ciutat Vella,0.0,True,False,3.0,1108.0,1,5.0
9997,6572,9998,2.0,1.0,83.0,1950.0,False,False,Les Cors,0.0,True,True,,1083.0,1,9.0
9998,1767,9999,2.0,2.0,76.0,1998.0,False,False,Nou Barris,4.0,True,True,3.0,,0,1.0


In [6]:
# Standardization
def imputing_missing(impute_df, vars_to_impute, n_neighbors=7):
    """Fill missing values in the given columns with KNN imputation.

    The selected columns are standardised first so that every column
    contributes on a comparable scale to the neighbour distances, then
    imputed with ``KNNImputer`` and finally transformed back to their
    original units.

    Parameters
    ----------
    impute_df : pandas.DataFrame
        Frame holding the columns to impute. It is modified in place.
    vars_to_impute : list of str
        Names of the columns to standardise, impute and restore.
    n_neighbors : int, default 7
        Number of neighbours passed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        The same ``impute_df`` object, with ``vars_to_impute`` overwritten
        by the imputed values on the original scale.
    """
    # Work on the frame that was passed in, not on a global of the same name,
    # so the function also behaves correctly for any other DataFrame.
    standardizer = StandardScaler()
    scaled = standardizer.fit_transform(impute_df[vars_to_impute])

    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed = imputer.fit_transform(scaled)

    # Undo the scaling so the columns come back in their original units
    impute_df[vars_to_impute] = standardizer.inverse_transform(imputed)
    return impute_df

# Columns handed to the KNN imputer: the numeric features, the four boolean flags
# (has_ac, accepts_pets, is_furnished, has_pool) and the target `price`.
# NOTE(review): because `price` is in the list, test rows (train_dummy == 0) also
# receive an imputed price, and the flags can end up with fractional values -
# both effects are visible in the frame displayed after this cell.
vars_to_impute = ['num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes', 'price', 'floor', 'has_ac', 'accepts_pets', 'is_furnished', 'has_pool']
df = imputing_missing(df, vars_to_impute)

In [7]:
# Display the combined frame after KNN imputation
df

Unnamed: 0,index,id,num_rooms,num_baths,square_meters,year_built,is_furnished,has_pool,neighborhood,num_crimes,has_ac,accepts_pets,num_supermarkets,price,train_dummy,floor
0,597,1,3.0,3.0,126.0,2009.0,1.0,1.0,Sant Martí,9.0,1.000000,1.0,,994.428571,0,6.0
1,6521,2,4.0,3.0,154.0,2008.0,1.0,1.0,Horta,0.0,1.000000,1.0,,1348.000000,1,2.0
2,3404,3,1.0,3.0,140.0,2000.0,0.0,1.0,Sants,6.0,0.571429,0.0,,1285.000000,1,4.0
3,481,4,3.0,2.0,50.0,1984.0,0.0,0.0,Sant Andreu,0.0,0.000000,0.0,,942.428571,0,1.0
4,7628,5,3.0,3.0,106.0,1959.0,1.0,1.0,Gràcia,1.0,1.000000,1.0,,1156.000000,1,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3983,9996,4.0,3.0,51.0,1972.0,0.0,0.0,Nou Barris,3.0,0.000000,1.0,2.0,875.000000,1,4.0
9996,1028,9997,3.0,3.0,134.0,1994.0,0.0,0.0,Ciutat Vella,0.0,1.000000,0.0,3.0,1108.000000,1,5.0
9997,6572,9998,2.0,1.0,83.0,1950.0,0.0,0.0,Les Cors,0.0,1.000000,1.0,,1083.000000,1,9.0
9998,1767,9999,2.0,2.0,76.0,1998.0,0.0,0.0,Nou Barris,4.0,1.000000,1.0,3.0,834.428571,0,1.0


In [8]:
# Re-standardising variables (z-score with the population standard deviation).
# `floor` is intentionally left on its original scale: a later cell builds
# `floor_one_dummy` by testing `floor == 1`, and that comparison stops identifying
# first-floor flats once the column has been turned into z-scores.
# Leaving one feature unscaled does not change the predictions of the plain
# (unregularised) LinearRegression fitted further down.
to_standardize = ['num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes']

for column in to_standardize:
    df[column] = (df[column] - np.mean(df[column])) / np.std(df[column])

### KNN all variables, aside from num_supermarkets and binaries

In [9]:
# vars_to_impute = ['num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes', 'price', 'floor', 'has_ac', 'accepts_pets', 'is_furnished', 'has_pool']
# impute_df = df[vars_to_impute]

# imputer = KNNImputer(n_neighbors=7)
# imputed_data = imputer.fit_transform(impute_df)
# imputed_df = pd.DataFrame(imputed_data, columns = impute_df.columns)

# for var in vars_to_impute:
#     df[var] = imputed_df[var]

#### Imputing neighborhood based on the number of crimes

In [10]:
# crimes_by_neighborhood = df.groupby('neighborhood')['num_crimes'].mean().to_dict()

# # Function to be used in the imputation process
# def find_closest_key(target, dictionary):
#     closest_key = None
#     closest_difference = float('inf')

#     for key, value in dictionary.items():
#         difference = abs(target - value)
#         if difference < closest_difference:
#             closest_key = key
#             closest_difference = difference

#     return closest_key

# # Imputing neighborhood based on number of crimes
# def impute_neighborhood(row):
#     crimes = row['num_crimes']
#     neighborhood = row['neighborhood']
#     if pd.isnull(neighborhood):
#         return find_closest_key(crimes, crimes_by_neighborhood)
#     else:
#         return neighborhood
    
# df['neighborhood'] = df.apply(impute_neighborhood, axis=1)

In [11]:
# Imputing with mode
def impute_categorical_columns(df, categorical_columns=None):
    """Fill missing values of categorical columns with each column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. The listed columns are overwritten in place.
    categorical_columns : list of str, optional
        Columns to impute. Defaults to ``['neighborhood']``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the missing values filled.

    Raises
    ------
    KeyError
        If one of the requested columns is not present in ``df``.
    """
    if categorical_columns is None:
        categorical_columns = ['neighborhood']

    for column_name in categorical_columns:
        modes = df[column_name].mode()
        if modes.empty:
            # Column has no observed value at all: there is nothing to impute with
            continue
        # Assign the filled column back instead of calling fillna(inplace=True) on
        # the selection, which is deprecated and has no effect under Copy-on-Write.
        df[column_name] = df[column_name].fillna(modes.iloc[0])
    return df


# Fill missing `neighborhood` values in the combined frame with the most frequent neighbourhood
df = impute_categorical_columns(df)

#### Imputing number of supermarkets with mean of the neighborhood

In [12]:
# Mean number of supermarkets per neighbourhood (missing values are skipped by mean())
neighborhood_means = df.groupby('neighborhood')['num_supermarkets'].mean()

# Value used for each row if its own count is missing: the mean of its neighbourhood,
# or the overall mean when the neighbourhood has no observed count at all
overall_mean = df['num_supermarkets'].mean()
fill_values = df['neighborhood'].map(neighborhood_means).fillna(overall_mean)

# Impute only the missing entries; observed counts are kept unchanged
df['num_supermarkets'] = df['num_supermarkets'].fillna(fill_values)

#### Creating floor 1 dummy

In [13]:
# Boolean flag that is True where `floor` equals 1 and False everywhere else.
# NOTE(review): this only marks first-floor flats while `floor` still holds raw
# floor numbers - confirm the column has not been rescaled by an earlier cell.
df['floor_one_dummy'] = df['floor'].eq(1)

#### Encoding neighborhood

In [14]:
# One-hot encode `neighborhood` into `neigh_<name>` columns, then renumber the rows
df = pd.get_dummies(df, columns=['neighborhood'], prefix=['neigh'])
df = df.reset_index(drop=True)

#### Separating the dfs for training

In [15]:
# Recover the two original sets from the combined frame via the marker column
# (1 = training rows, 0 = test rows), each ordered by id
df_train = df.loc[df['train_dummy'].eq(1)].sort_values('id')
df_test = df.loc[df['train_dummy'].eq(0)].sort_values('id')

## Model Training

#### Train model without binaries

In [168]:
#df_no_binary = df_train[['num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor', 'num_crimes', 'floor_one_dummy', 'num_supermarkets', 'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta', 'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu', 'neigh_Sant Martí', 'neigh_Sants', 'neigh_Sarrià-Sant Gervasi']]

#y_train = df_train['price']
#x_train = df_no_binary

#model_no_binary = LinearRegression()
#model_no_binary.fit(x_train, y_train)

#### Train model with all variables

In [16]:
# Design matrix: numeric features, the floor dummy, the neighbourhood dummies
# and the four boolean flags
x_train = df_train[[
    'num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor', 'num_crimes',
    'floor_one_dummy', 'num_supermarkets',
    'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta',
    'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu', 'neigh_Sant Martí',
    'neigh_Sants', 'neigh_Sarrià-Sant Gervasi',
    'is_furnished', 'has_pool', 'has_ac', 'accepts_pets',
]]
# Target: the price column of the training rows
y_train = df_train['price']

# Ordinary least-squares regression on all features
model = LinearRegression()
model.fit(x_train, y_train)

## Prediction

In [170]:
# # Subsetting test data for binary variables missing
# binary_cols = ['is_furnished', 'has_pool', 'has_ac', 'accepts_pets']
# df_missing = df_test[df_test[binary_cols].isna().any(axis=1)]
# df_not_missing = df_test[~df_test[binary_cols].isna().any(axis=1)]

# # Drop binaries from df missing
# df_missing.drop(binary_cols, axis=1, inplace=True)

# # Prediction for df_not_missing
# x_test = df_not_missing[['num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor', 'num_crimes', 'neighborhood_crime_encoded', 'is_furnished', 'has_pool', 'num_crimes', 'has_ac', 'accepts_pets', 'floor_one_dummy']]
# y_pred_not_missing = model.predict(x_test)

# df_not_missing['pred'] = y_pred_not_missing

# # Prediction for df_missing
# x_test = df_missing[['num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor', 'num_crimes', 'neighborhood_crime_encoded', 'floor_one_dummy']]
# y_pred_missing = model_no_binary.predict(x_test)

# df_missing['pred'] = y_pred_missing
# new_df = pd.DataFrame()

# # Creating final DataFrame
# new_df['id'] = df_missing['id'].tolist() + df_not_missing['id'].tolist()
# new_df['pred'] = df_missing['pred'].tolist() + df_not_missing['pred'].tolist()


In [18]:
# Same feature columns, in the same order, as used to fit the model
x_test = df_test[[
    'num_rooms', 'num_baths', 'square_meters', 'year_built', 'floor', 'num_crimes',
    'floor_one_dummy', 'num_supermarkets',
    'neigh_Ciutat Vella', 'neigh_Eixample', 'neigh_Gràcia', 'neigh_Horta',
    'neigh_Les Cors', 'neigh_Nou Barris', 'neigh_Sant Andreu', 'neigh_Sant Martí',
    'neigh_Sants', 'neigh_Sarrià-Sant Gervasi',
    'is_furnished', 'has_pool', 'has_ac', 'accepts_pets',
]]
y_pred = model.predict(x_test)

# Submission table: one row per test id with its predicted price
new_df = pd.DataFrame({'id': df_test['id'], 'pred': y_pred})

#### Exporting data

In [19]:
# Write the submission to a path relative to the notebook, like the input paths,
# so the cell also runs on machines other than the author's
submission_path = "./improved_method.csv"
new_df.to_csv(submission_path, index=False)