# First import all libraries that are going to be used

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.cluster import KMeans

# Install kaggle API and download data from competition

In [None]:
#!pip install kaggle
#!kaggle competitions download -c el-algoritmo-es-correcto

# Read csv as pandas DataFrame

In [None]:
# Load the three competition files from the working directory, in order:
# training set, test set and the example submission.
train_data, test_data, example = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv', 'example_submission.csv')
)

# Classify each column as a dummy, float or string variable

In [None]:
# Column groups used by the cleaning steps. Each list is written as a
# whitespace-separated string and split into individual column names.

# Columns treated as dummy (indicator) variables.
# Note: 'vista' is listed here and in string_var as well.
dummy_var = """
    ascensor balcon conjuntocerrado cuartoservicio deposito estudio gimnasio
    halldealcobas parqueaderovisitantes piscina porteria remodelado saloncomunal
    terraza vigilancia vista zonalavanderia
""".split()

# Columns treated as numeric (float) variables.
float_var = """
    area banos banoservicio estrato garajes habitaciones piso valoradministracion
    valorventa
""".split()

# Columns holding text categories.
string_var = "tiempodeconstruido vista tipoinmueble tiponegocio".split()

# Apply the general filters: fill missing dummy and float values (1 for banos, habitaciones and piso; 0 for everything else), then drop the rows that still contain NaN.

In [None]:
# Fill missing values with the same rules for the train and test sets:
#   * dummy columns               -> 0
#   * banos, habitaciones, piso   -> 1
#   * remaining float columns     -> 0
# Every column is filled from itself. The previous version filled the test
# set's 'habitaciones' from 'banos', which replaced every room count in the
# test set with the bathroom count.
min_one_var = ['banos', 'habitaciones', 'piso']

for frame in (train_data, test_data):
    frame[dummy_var] = frame[dummy_var].fillna(0)
    # Must run before the float_var fill, which would otherwise put 0 here.
    frame[min_one_var] = frame[min_one_var].fillna(1)
    frame[float_var] = frame[float_var].fillna(0)

# Drop the rows that still have a NaN in any other column.
# NOTE(review): this also removes rows from the test set, so those ids will
# not receive a prediction - confirm this is acceptable for the submission.
train_data = train_data.dropna()
test_data = test_data.dropna()

# Specific filters for float variables

In [None]:
# Upper and lower quantile levels used as cut-offs by the per-column
# filters applied to the training set.
q_up, q_down = 0.95, 0.005

In [None]:
# Keep training rows whose 'banos' value is strictly positive and not above
# the q_up quantile, then plot the remaining distribution.
banos = train_data['banos']
q_banos = banos.quantile(q_up)
# The q_down quantile is computed but not used by the filter below, which
# applies a fixed lower bound of > 0 instead.
q_banos1 = banos.quantile(q_down)
keep_banos = banos.gt(0) & banos.le(q_banos)
train_data = train_data[keep_banos]
boxplot = train_data.boxplot(column=['banos'])

In [None]:
# Drop training rows whose 'valoradministracion' is above the q_up quantile
# (no lower bound is applied), then plot what is left.
valor_admin = train_data['valoradministracion']
q_va = valor_admin.quantile(q_up)
train_data = train_data.loc[valor_admin.le(q_va)]
boxplot = train_data.boxplot(column=['valoradministracion'])

In [None]:
# Keep training rows whose 'valorventa' lies between the q_down and q_up
# quantiles (both ends inclusive), then plot the remaining distribution.
valor_venta = train_data['valorventa']
q_vv = valor_venta.quantile(q_up)
q_vv1 = valor_venta.quantile(q_down)
train_data = train_data[valor_venta.between(q_vv1, q_vv)]
boxplot = train_data.boxplot(column=['valorventa'])

In [None]:
# Keep training rows whose 'area' lies between the q_down and q_up quantiles
# (both ends inclusive), then plot the remaining distribution.
area = train_data['area']
q_a = area.quantile(q_up)
q_a1 = area.quantile(q_down)
train_data = train_data[area.between(q_a1, q_a)]
boxplot = train_data.boxplot(column=['area'])

In [None]:
# Keep training rows whose 'habitaciones' lies between the q_down and q_up
# quantiles (both ends inclusive), then plot the remaining distribution.
habitaciones = train_data['habitaciones']
q_h = habitaciones.quantile(q_up)
q_h1 = habitaciones.quantile(q_down)
train_data = train_data[habitaciones.between(q_h1, q_h)]
boxplot = train_data.boxplot(column=['habitaciones'])

In [None]:
# Drop training rows whose 'garajes' is above the q_up quantile (no lower
# bound is applied), then plot what is left.
garajes = train_data['garajes']
q_g = garajes.quantile(q_up)
train_data = train_data.loc[garajes.le(q_g)]
boxplot = train_data.boxplot(column=['garajes'])

In [None]:
# Plot the distribution of 'valorventa' before and after a natural-log
# transform, and store the transformed values as 'valorventa_log' in both
# data sets.
# NOTE(review): sn.distplot is deprecated in recent seaborn releases -
# confirm it still exists in the installed version.
sn.set_style("whitegrid")

# Raw values.
hist_vv = train_data['valorventa']
sn.distplot(hist_vv)
plt.show()

# Log-transformed values.
train_data["valorventa_log"] = np.log(hist_vv)
hist_vvlog = train_data['valorventa_log']
sn.distplot(hist_vvlog)
plt.show()

# Same transform on the test set.
# NOTE(review): missing test 'valorventa' values were filled with 0 earlier
# and the test set is not filtered, so log() may yield -inf here - verify.
test_data['valorventa_log'] = np.log(test_data['valorventa'])

In [None]:
# Plot the distribution of 'area' before and after a natural-log transform,
# and store the transformed values as 'area_log' in both data sets.
# NOTE(review): sn.distplot is deprecated in recent seaborn releases -
# confirm it still exists in the installed version.
sn.set_style("whitegrid")

# Raw values.
hist_a = train_data['area']
sn.distplot(hist_a)
plt.show()

# Log-transformed values.
train_data['area_log'] = np.log(hist_a)
hist_alog = train_data['area_log']
sn.distplot(hist_alog)
plt.show()

# Same transform on the test set.
# NOTE(review): missing test 'area' values were filled with 0 earlier and the
# test set is not filtered, so log() may yield -inf here - verify.
test_data['area_log'] = np.log(test_data['area'])

# Modify string variables

In [None]:
# Normalise the text columns identically in the train and test sets, so the
# same category is spelled the same way in both. The previous version fixed
# the 'tiponegocio' and 'tiempodeconstruido' labels in the training set only.
for frame in (train_data, test_data):
    # 'vista' is also listed in dummy_var, so its missing values were filled
    # with 0 earlier; turn that placeholder into a real category.
    frame['vista'] = frame['vista'].replace(0, 'Interior')
    # Unify the capitalisation of the combined business type.
    frame['tiponegocio'] = frame['tiponegocio'].replace('Venta y arriendo', 'Venta Y Arriendo')
    # Repair the label that is missing its leading 'E'.
    frame['tiempodeconstruido'] = frame['tiempodeconstruido'].replace('ntre 0 y 5 años', 'Entre 0 y 5 años')

# Price per square metre, and the same value scaled by 0.95.
train_data['valor/m2'] = train_data['valorventa'] / train_data['area']
train_data['valor_habi'] = train_data['valor/m2']*0.95
train_data

# Apply Ordinal Encoder to String columns

In [None]:
# Ordinal-encode each text column of the training set into a new
# '<column>_int' column. One encoder object is re-fitted for every column,
# so after the loop it only holds the categories of the last column.
enc = OrdinalEncoder()

# The order matters: it fixes the order in which the new columns are added.
for text_col in ('tiponegocio', 'vista', 'tipoinmueble', 'tiempodeconstruido'):
    # The encoder expects a 2-D input of shape (n_rows, 1).
    raw_values = train_data[text_col].to_numpy().reshape(-1, 1)
    train_data[text_col + '_int'] = enc.fit_transform(raw_values)

In [None]:
# Ordinal-encode the text columns of the test set into '<column>_int'
# columns, using the category -> integer mapping learned from the TRAINING
# set. The previous version re-fitted the encoder on the test set, so the
# same category could get a different code than the one the model was
# trained on whenever the two sets did not contain the same categories.
#
# The order matters: it fixes the order in which the new columns are added.
for text_col in ('tiponegocio', 'vista', 'tipoinmueble', 'tiempodeconstruido'):
    # Categories that never appear in the training set are encoded as -1
    # instead of raising an error.
    col_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    # The encoder expects a 2-D input of shape (n_rows, 1).
    col_enc.fit(train_data[text_col].to_numpy().reshape(-1, 1))
    test_values = test_data[text_col].to_numpy().reshape(-1, 1)
    test_data[text_col + '_int'] = col_enc.transform(test_values)

In [None]:
# Build the modelling frames: keep only the non-object columns and remove the
# raw columns that are not used directly.
# The training frame also drops 'id'; the test frame keeps it because it is
# needed to build the submission.
train_data_2 = (
    train_data
    .select_dtypes(exclude='object')
    .drop(columns=['area', 'id', 'valorventa'])
)
test_data_2 = (
    test_data
    .select_dtypes(exclude='object')
    .drop(columns=['area', 'valorventa'])
)

In [None]:
# Fit a linear regression that predicts 'valor_habi' and report its MAPE on a
# 30% hold-out split.
#
# Target and features are selected by NAME. The previous version used
# positional slices (columns[-5], columns[:-6] + columns[-4:]), which pick
# the wrong columns without any error as soon as a column is added, removed
# or reordered upstream.
columns = train_data_2.columns.tolist()
test_columns = 'valor_habi'  # the target column, despite the variable name
# 'valor/m2' is excluded because the target is just 0.95 times that column.
train_columns = [col for col in columns if col not in ('valor/m2', test_columns)]

# NOTE(review): the target equals 0.95 * valorventa / area, while the features
# include 'valorventa_log' and 'area_log' - confirm that using them is intended.
X = train_data_2[train_columns].to_numpy()
Y = train_data_2[test_columns].to_numpy()  # already 1-D, no reshape needed
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

#Linear regression model
reg = LinearRegression().fit(X_train, y_train)
Y_pred = reg.predict(X_test)
mape = mean_absolute_percentage_error(y_test, Y_pred)
print(mape)

In [None]:
#Test model
# Predict the target for every test row and build the submission frame.
#
# The feature matrix is built from the same column names, in the same order,
# that the model was trained on (train_columns). The previous version took
# "every column but the first", which assumes 'id' is the first column and
# that the remaining columns are ordered exactly as in the training frame;
# otherwise features are fed to the model in the wrong positions. A missing
# feature column now raises a KeyError instead.
columns_test = test_data_2.columns.tolist()
X_pred_cols = list(train_columns)
X_pred = test_data_2[X_pred_cols].to_numpy()
Y_pred_test = reg.predict(X_pred)
test_data_2['valor_mt2_predicted'] = np.round(Y_pred_test, 3)

# The submission uses the column name 'valormt2_predicted' (no underscore
# after 'valor').
submission = pd.DataFrame({
    'id': test_data_2['id'],
    'valormt2_predicted': test_data_2['valor_mt2_predicted'],
})
#submission.to_csv('submission.csv',index = False, decimal = '.',sep = ',')
submission