In [516]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Imputing with MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import linear_model

In [None]:
# Load the train / test listings and the USD->COP quote history.
# Only the first two columns of the quote file are read; its header row is
# replaced by the names 'Fecha' and 'Cierre'.
train = pd.read_csv('./data/properties_colombia_train.csv', sep=',')
test = pd.read_csv('./data/properties_colombia_test.csv', sep=',')
cotizacion = pd.read_csv(
    './data/cotizacionCOP.csv',
    sep=',',
    usecols=[0, 1],
    header=0,
    names=['Fecha', 'Cierre'],
)

In [None]:
### Only 8 prices are in USD, so they are converted to COP and the currency feature is dropped
### USD/COP historical quotes taken from: https://es.investing.com/currencies/usd-cop-historical-data
# Clean cotizacion
cotizacion['Fecha'] = pd.to_datetime(cotizacion['Fecha'], dayfirst=True)

# The closing quote is text with '.' as thousands separator and ',' as decimal
# mark. Both replacements must be literal: as a regex, '.' matches any character.
# The dtype guard keeps the cell re-runnable once 'Cierre' is already numeric.
if not pd.api.types.is_numeric_dtype(cotizacion['Cierre']):
    cotizacion['Cierre'] = (
        cotizacion['Cierre']
        .str.replace('.', '', regex=False)
        .str.replace(',', '.', regex=False)
        .astype(float)
    )

In [None]:
#Trims spaces in all columns where it is a string
def TrimColumns(df):
    """
    Trim whitespace from both ends of every string value in the dataframe.

    Non-string values are returned untouched. A new dataframe is returned;
    the input is not modified.
    """
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map.
    # The method is looked up on the class (not the instance) so a column that
    # happens to be called 'map' cannot shadow it on older pandas versions.
    elementwise = getattr(type(df), 'map', None) or type(df).applymap
    return elementwise(df, _strip_if_str)

#Normalize a string column by dropping accents and any other non-ASCII character
def NormalizeColumn(df, column_name):
    """
    Replace df[column_name] with an ASCII-only version of itself.

    The text is NFKD-decomposed, so accented letters split into a base letter
    plus combining marks. Everything that cannot be encoded as ASCII is then
    discarded. The column is overwritten in place and also returned.
    """
    decomposed = df[column_name].str.normalize('NFKD')
    ascii_bytes = decomposed.str.encode('ascii', errors='ignore')
    df[column_name] = ascii_bytes.str.decode('utf-8')
    return df[column_name]

#https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
#Removes every match of the given pattern(s) from a string series
def CleanValues(series, to_replace, value = '', regex = True):
    """
    Replace every occurrence of the given pattern(s) in a string series.

    Parameters
    ----------
    series : pandas.Series of strings.
    to_replace : str or iterable of str
        A single pattern, or several patterns applied one after another.
        A plain string is treated as ONE pattern. Iterating over it would
        apply each character as its own regex ('.' would match everything
        and '*' is not a valid pattern on its own).
    value : replacement text (default: empty string, i.e. delete the match).
    regex : whether the patterns are regular expressions.

    Returns
    -------
    pandas.Series with the replacements applied.
    """
    patterns = [to_replace] if isinstance(to_replace, str) else list(to_replace)
    for current in patterns:
        series = series.str.replace(current, value, regex=regex)
    return series

#Regex alternations that are removed from the text columns
# pattern : line breaks, tab, non-breaking space, zero-width space and comma
# pattern2: anything enclosed in <...> or {...} (non-greedy)
pattern = '\n|\r|\t|\xa0|\u200b|,'
pattern2 = '<.*?>|{.*?}'


#Extracts info from strings
def StringExtract(df):
    """
    Clean 'title' / 'description' and derive features from their text.

    Steps, all applied to a trimmed copy of `df` (the input is not modified):
      1. Strip accents / non-ASCII characters, lowercase and strip both columns,
         then remove the module-level `pattern` and `pattern2` matches.
      2. Overwrite 'surface_total' with the number found in front of a
         square-metre unit; a match in the description wins over the title.
      3. Add one column per amenity keyword group, searched in the description
         (NaN where the description is missing).
      4. Fill missing 'bathrooms' / 'bedrooms' with the number found in front of
         the corresponding word in the description.

    Returns the new dataframe.
    """
    df = TrimColumns(df)

    text_columns = ('title', 'description')
    for column in text_columns:
        df[column] = NormalizeColumn(df, column)
        df[column] = df[column].str.lower().str.strip()
        df[column] = CleanValues(df[column], pattern)
        df[column] = CleanValues(df[column], pattern2, regex = True)

    # Square metres: digits directly followed by a surface unit.
    # Values are converted to numbers before assignment so a numeric
    # 'surface_total' column is not filled with digit strings.
    surface_regex = r"(\d+(?=m2| m2| mts2| metros cuadrados| mts))"
    for column in text_columns:
        extracted = df[column].str.extract(surface_regex, expand = False)
        found = extracted.notna()
        df.loc[found, 'surface_total'] = pd.to_numeric(extracted[found])

    # Amenity flags: new column name -> keywords searched in the description.
    # NOTE(review): 'parque' also matches inside 'parqueadero', so listings that
    # only mention parking are flagged as having a patio. Confirm whether that
    # is intended.
    amenity_keywords = {
        'pileta': ['pileta', 'piscina', 'natatorio'],
        'vigilancia': ['vigilancia 24', 'porteria 24', 'seguridad 24',
                       'vigilancia las 24', 'porteria las 24', 'seguridad las 24'],
        'patio': ['patio', 'jardin', 'parque'],
        # 'parqueadero' added: the original list only had the misspelt 'parquedero'.
        'garage': ['garage', 'garaje', 'cochera', 'parqueadero', 'parquedero'],
        'balcon': ['balcon', 'balcn', 'valcon'],
        'cancha': ['cancha'],
        'gimnasio': ['gimnasio', 'gym', 'gim', 'fitnes'],
        'sauna': ['sauna', 'solarium', 'ducha turca', 'ducha escocesa'],
        'a_estrenar': ['estrenar', 'estreno'],
    }
    for flag_column, keywords in amenity_keywords.items():
        df[flag_column] = df['description'].str.contains('|'.join(keywords), regex = True, case = False)

    # Room counts: only rows where the structured value is missing are filled.
    # These operate on `df`; the original read and wrote the global `train`,
    # so the frame passed in was never updated.
    # The bedrooms pattern had ' habitac ' with a trailing space, which could
    # never match text such as '3 habitaciones'.
    count_patterns = {
        'bathrooms': r"(\d+(?=bano| bano))",
        'bedrooms': r'(\d+(?=habitac| habitac|alcob| alcob|cuart| cuart))',
    }
    for count_column, count_regex in count_patterns.items():
        missing = df[count_column].isna()
        extracted = df.loc[missing, 'description'].str.extract(count_regex, expand = False)
        df.loc[missing, count_column] = pd.to_numeric(extracted)

    return df

In [None]:
def Encoder(df):
    """
    Cast the count / surface columns to float and label-encode object columns.

    'bedrooms', 'bathrooms' and 'surface_total' are converted to float. Every
    column whose dtype is `object` is then replaced by the integer codes
    produced by a LabelEncoder fitted on that column. The dataframe is
    modified in place and returned.
    """
    # Fix formats
    numeric_columns = ['bedrooms', 'bathrooms', 'surface_total']
    df[numeric_columns] = df[numeric_columns].astype(float)

    # Names of the columns that still hold Python objects (strings, mixed values)
    is_object = df.dtypes == object
    object_columns = df.columns[is_object].tolist()

    # The same encoder instance is re-fitted on each column in turn
    encoder = LabelEncoder()
    df[object_columns] = df[object_columns].apply(encoder.fit_transform)

    return df

def Imputator(df):
    """
    Fill missing values with MICE (IterativeImputer + BayesianRidge).

    Only the columns that contain at least one NaN are passed to the imputer,
    and only those columns are overwritten. The dataframe is modified in place
    and returned. A frame with no missing values is returned unchanged.
    """
    columns_with_na = df.columns[df.isna().any()].tolist()

    # Nothing to impute: fitting the imputer on zero columns would fail.
    if not columns_with_na:
        return df

    df_mice = df[columns_with_na].copy()

    # Define MICE imputer and fill missing values
    mice_imputer = IterativeImputer(estimator=linear_model.BayesianRidge(), n_nearest_features=None, imputation_order='ascending')

    # fit_transform returns a bare array. The original index must be restored
    # explicitly, otherwise the assignment below aligns on a fresh 0..n-1
    # index and writes NaN for every non-matching label.
    df_mice_imputed = pd.DataFrame(
        mice_imputer.fit_transform(df_mice),
        columns=df_mice.columns,
        index=df_mice.index,
    )

    df[columns_with_na] = df_mice_imputed

    return df

In [None]:
#Cleans train or test and returns the cleaned dataframe
def Cleaner(df):
    """
    Run the full cleaning pipeline on a raw listings dataframe.

    Steps:
      1. Drop the original 'id' and use 'Unnamed: 0' as the new id.
      2. Convert USD prices to COP with the closing quote of the listing's
         'created_on' date, taken from the module-level `cotizacion`.
      3. Extract text features (StringExtract) and compute 'days_published'.
      4. Drop the columns listed below, label-encode (Encoder) and impute
         (Imputator).

    The input dataframe is not modified. The result is indexed by 'id'.
    Frames without a 'price' column skip the price steps.
    """
    # Work on a new frame (no inplace edits) so the function can be called
    # again on the same raw data. 'id' stays a column with a positional index
    # while the helpers run, and becomes the index at the very end.
    df = (
        df.drop(columns='id')
          .rename(columns={'Unnamed: 0': 'id'})
          .reset_index(drop=True)
    )

    has_price = 'price' in df.columns
    if has_price:
        # 'created_on' is parsed explicitly so it is comparable with the
        # datetime 'Fecha' column of the quotes.
        created = pd.to_datetime(df['created_on'], errors='coerce').dt.normalize()

        # Expand the quotes to one value per calendar day, carrying the last
        # close forward over the days the quote file does not list. Filling by
        # date, not by row order, makes the rate independent of how the
        # listings are sorted.
        daily_rates = (
            cotizacion.dropna(subset=['Fecha', 'Cierre'])
                      .drop_duplicates(subset='Fecha')
                      .set_index('Fecha')['Cierre']
                      .sort_index()
                      .resample('D')
                      .ffill()
        )
        cierre = created.map(daily_rates)

        # Transform USD to COP by multiplying. A USD row without a matching
        # quote becomes NaN and is handled by the imputation step.
        mask_usd = df['currency'] == 'USD'
        df['price'] = df['price'].mask(mask_usd, df['price'] * cierre)

    #Extract info from title and description
    df = StringExtract(df)

    #Fix datetime format and make a new variable for days published
    df['end_date'] = pd.to_datetime(df['end_date'], errors = 'coerce')
    df['start_date'] = pd.to_datetime(df['start_date'], errors = 'coerce')
    df['days_published'] = (df['end_date'] - df['start_date']).dt.days

    ### Dropped variables:
    #### l1: carries no information, everything is Colombia.
    #### l4, l5, l6: too many missing values. Could be derived from GPS coordinates.
    #### rooms: too hard to extract accurately from the text.
    #### surface_covered: 90% missing, hard to extract from the text.
    #### currency: all but 8 rows are COP and those prices were converted.
    #### title and description: no further use once the info is extracted.
    #### operation_type: everything is a sale.
    #### geometry: same data as lat and lon.
    #### ad_type: all sales.
    #### created_on and end_date: high cardinality, days_published replaces them.
    # NOTE(review): the original list named 'end_date' twice and never dropped
    # 'start_date', which therefore stays as a datetime column. Confirm whether
    # 'start_date' was meant to be dropped too.
    todelete = ['l1','l4', 'l5', 'l6', 'rooms','surface_covered',  'price_period',
                'currency', 'title', 'description', 'operation_type', 'geometry',
                'end_date', 'ad_type','created_on']

    df = df.drop(columns=todelete)

    ####LABEL ENCODER
    df = Encoder(df)

    ####IMPUTATION with MICE
    # FIXME (from the original author): the label encoding cannot be reverted
    # with inverse_transform after this step.
    df = Imputator(df)

    if has_price:
        df['price'] = df['price'].astype(int)

    return df.set_index('id')

## Machine learning

### Classification Pipeline

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

### Data normalization

In [None]:
# Split features / target into arrays and standardise the features.
# The scaler is fitted on the training features only and then applied to test.
X_train = train.drop(columns="target").to_numpy()
y_train = train["target"].to_numpy()
X_test = test.drop(columns="target").to_numpy()
y_test = test["target"].to_numpy()

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Choosing best classifier

In [None]:
# Cross-validate the three candidate models on the scaled training data
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
}
results = []
for model in models.values():
    # 6 shuffled folds; the fixed seed gives every model the same splits
    kf = KFold(n_splits=6, random_state=42, shuffle=True)
    results.append(cross_val_score(model, X_train_scaled, y_train, cv=kf))

# One box per model showing the spread of its fold scores
plt.boxplot(results, labels=models.keys())
plt.show()

In [None]:
# accuracy_score is only imported further down the notebook; import it here so
# this cell works on a fresh Restart & Run All.
from sklearn.metrics import accuracy_score

# (name, estimator) pairs for the ensemble, built from the models compared
# above. `classifiers` was not defined anywhere before this cell.
classifiers = list(models.items())

# Instantiate a VotingClassifier 'vc'
vc = VotingClassifier(estimators=classifiers)
# Fit 'vc' to the training set and predict test set labels
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
# Evaluate the test-set accuracy of 'vc'
print('Voting Classifier: {:.3f}'.format(accuracy_score(y_test, y_pred)))

## Hyperparameter tuning
### If the dataset is imbalanced, use the ROC AUC score as a metric instead of accuracy.

In [None]:
# Pipeline: standardise the features, then fit a logistic regression
steps = [
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression()),
]
pipeline = Pipeline(steps)

# Search space: three solvers x ten values of C between 0.001 and 1.0
params = {
    "logreg__solver": ["newton-cg", "saga", "lbfgs"],
    "logreg__C": np.linspace(0.001, 1.0, 10),
}

# 10-fold grid search scored by ROC AUC, using all available cores
tuning = GridSearchCV(pipeline, param_grid=params, scoring='roc_auc', cv=10, n_jobs=-1)
tuning.fit(X_train, y_train)
y_pred = tuning.predict(X_test)

# tuning.score() uses the search's own scorer, so the value printed is ROC AUC
test_auc = tuning.score(X_test, y_test)
print("Tuned Logistic Regression Parameters: {}, AUC: {}".format(tuning.best_params_, test_auc))

## Final model

### Extract model from GridSearch

In [None]:
# Best pipeline found by the grid search object 'tuning'
best_model = tuning.best_estimator_
# Score it on the held-out test set and report the result
test_acc = best_model.score(X_test, y_test)
print("Test set accuracy of best model: {:.3f}".format(test_acc))

### Regular way of doing it

In [None]:
modelo = LogisticRegression(solver="newton-cg", C = 0.112)
modelo.fit(X_train, y_train)

# X_train is a plain array, so zipping it with the coefficients paired each
# coefficient with a data ROW. Pair the coefficients with the feature names.
# NOTE(review): flatten() assumes a single row of coefficients (binary target).
feature_names = train.drop("target", axis=1).columns

print("Intercept:", modelo.intercept_)
print("Coeficiente:", list(zip(feature_names, modelo.coef_.flatten())))
# The training accuracy used to be computed on the test set; report both.
print("Accuracy de entrenamiento:", modelo.score(X_train, y_train))
print("Accuracy de test:", modelo.score(X_test, y_test))

## Metrics

In [None]:
# Fit every candidate on the scaled training data and report its test score
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    test_score = model.score(X_test_scaled, y_test)
    message = "{} Test Set Accuracy: {}".format(name, test_score)
    print(message)

## Graphics for classification

In [None]:
# NOTE(review): the star import hides where plot_classifier comes from. Switch
# to `from graficador import plot_classifier` once it is confirmed that no
# other name from that module is used.
from graficador import *

# `model` is the last estimator of the test-set performance loop above, which
# was fitted on the scaled features, so it is plotted and queried on the same
# scaled data.
plot_classifier(X_train_scaled, y_train, model, proba=True)

# Predict probabilities on training points (`X` was never defined)
prob = model.predict_proba(X_train_scaled)
print("Maximum predicted probability", np.max(prob))

## Predictions

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Probabilistic predictions
# ==============================================================================
# .predict_proba() returns, for each observation, the predicted probability of
# belonging to each class; the columns are labelled with modelo.classes_.
predicciones_prob = pd.DataFrame(
    modelo.predict_proba(X = X_test),
    columns = modelo.classes_,
)
print(predicciones_prob.tail(3))

# Final class predictions
# ==============================================================================
# .predict() returns the predicted class for each observation.
predicciones = modelo.predict(X = X_test)

# Confusion matrix and per-class precision / recall / F1 on the test set
print(confusion_matrix(y_test, predicciones))
print(classification_report(y_test, predicciones))

## ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Scores for the ROC curve: predicted probability of the second class in
# modelo.classes_. `y_pred_probs` used to be defined only in commented-out
# code, so the cell failed on a fresh run.
# NOTE(review): assumes a binary target; confirm.
positive_class = modelo.classes_[1]
y_pred_probs = modelo.predict_proba(X_test)[:, 1]

# pos_label is passed explicitly so y_test does not have to be recoded to 0/1,
# which keeps the cell safe to run more than once.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs, pos_label=positive_class)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
plt.show()

auc = roc_auc_score(y_test, y_pred_probs)
print(f'ROC under the curve is: {auc}')

# Support vector machines

In [None]:
from sklearn.svm import SVC, LinearSVC

In [None]:
# SVC with its default kernel, tuned over C and gamma
svm = SVC()
parameters = {
    'C': [0.1, 1, 10],
    'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1],
}

# Run the grid search with GridSearchCV's default cross-validation
searcher = GridSearchCV(svm, parameters)
searcher.fit(X_train, y_train)

# Best hyperparameters and their cross-validated score
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)

# Score of the refitted best estimator on the test set
test_accuracy = searcher.score(X_test, y_test)
print("Test accuracy of best grid search hypers:", test_accuracy)

In [None]:
from sklearn.svm import SVC, LinearSVC

# SVC with default hyperparameters (no regularization strength is set here)
model = SVC()

# Train on the training split
model.fit(X_train, y_train)

# Draw the decision boundary over the test points
plot_classifier(X_test, y_test, model)

## Stochastic gradient descent classifier

In [None]:
from sklearn.linear_model import SGDClassifier
# Fixed random_state so the stochastic optimiser is reproducible
linear_classifier = SGDClassifier(random_state=0)

# Search over the regularisation strength and two loss functions
parameters = {
    'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'loss': ['hinge', 'log_loss'],
}
searcher = GridSearchCV(linear_classifier, parameters, cv=10)
searcher.fit(X_train, y_train)

# Best hyperparameters, their CV score and the test score
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)
test_accuracy = searcher.score(X_test, y_test)
print("Test accuracy of best grid search hypers:", test_accuracy)

## Bagging Classifier

In [None]:
# Import models and utility functions
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Set seed for reproducibility
SEED = 1

# `X` and `y` were never defined: build them from the training frame. X stays
# a DataFrame so the feature-importance cells can read the column names.
X = train.drop("target", axis=1)
y = train["target"]

# Split data into 70% train and 30% test, keeping the class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)

# Instantiate a classification-tree 'dt'
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=SEED)
# Instantiate a BaggingClassifier 'bc'.
# - The base tree is passed positionally: the keyword was renamed from
#   `base_estimator` to `estimator` in scikit-learn and the old name removed.
# - oob_score=True is required for the `bc.oob_score_` attribute read in the
#   next cell.
# - random_state makes the bootstrap samples reproducible.
bc = BaggingClassifier(dt, n_estimators=300, oob_score=True, n_jobs=-1, random_state=SEED)
# Fit 'bc' to the training set
bc.fit(X_train, y_train)
# Predict test set labels
y_pred = bc.predict(X_test)
# Evaluate and print test-set accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of Bagging Classifier: {:.3f}'.format(accuracy))

In [None]:
# Out-of-bag score stored on the fitted bagging ensemble 'bc'
oob_accuracy = bc.oob_score_
print('OOB accuracy: {:.3f}'.format(oob_accuracy))

## Random Forest Classifier

### Russian way

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

class Experiment():
    """
    Fit a random forest on `train` and score it on `validation`.

    Both dataframes must contain the column named by `target`; every other
    column is used as a feature.
    """

    def __init__(self, train, validation, target='target'):
        self.train = train
        self.validation = validation
        self.target = target

    def run(self):
        """Train the forest and return its accuracy on the validation set."""
        model = RandomForestClassifier(n_jobs=8, n_estimators=200)
        model.fit(self.train.drop(columns=[self.target]), self.train[self.target])
        preds = model.predict(self.validation.drop(columns=[self.target]))
        # Two fixes: `target` was an undefined name (the attribute is
        # self.target), and accuracy_score takes no `squared` argument.
        return accuracy_score(self.validation[self.target], preds)

In [None]:
%%time
# Missing values are replaced by the sentinel -99 and the 'id' column is dropped
experiment_train = train.fillna(-99).drop(columns=['id'])
experiment_validation = test.fillna(-99).drop(columns=['id'])
experiment1 = Experiment(experiment_train, experiment_validation)
experiment1.run()

### Datacamp way

In [None]:
# Basic imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error as MSE

# Instantiate a random forest classifier 'rf' with default hyperparameters and
# a fixed seed; the number of trees is chosen by the grid search below.
rf = RandomForestClassifier(random_state=SEED)


In [None]:
# Define a grid of hyperparameters 'params_rf'
params_rf = {
    'n_estimators': [300, 400, 500],
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [0.1, 0.2],
    'max_features': ['log2', 'sqrt'],
}

# Instantiate 'grid_rf'. 'rf' is a classifier, so candidates are ranked by
# accuracy; 'neg_mean_squared_error' is a regression metric.
grid_rf = GridSearchCV(estimator=rf, param_grid=params_rf, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit 'grid_rf' to the training set
grid_rf.fit(X_train, y_train)

# Extract the best hyperparameters from 'grid_rf'
best_hyperparams = grid_rf.best_params_
print('Best hyperparameters:\n', best_hyperparams)

# Extract the best model from 'grid_rf'
best_model = grid_rf.best_estimator_
# Predict the test set labels
y_pred = best_model.predict(X_test)
# Evaluate the test set accuracy
accuracy = accuracy_score(y_test, y_pred)
# The value is an accuracy, so the message no longer calls it RMSE
print('Test set accuracy of rf: {:.2f}'.format(accuracy))

In [None]:
# Basic imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error as MSE

# Random forest classifier 'rf': 400 trees, leaves holding at least 12% of the
# samples, fixed seed
rf = RandomForestClassifier(
    n_estimators=400,
    min_samples_leaf=0.12,
    random_state=SEED,
)

# Train on the training split, then predict the test labels
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

### Feature importance

In [None]:
# Print the importance the fitted forest assigns to each training column
for i, item in enumerate(rf.feature_importances_):
    column_name = X_train.columns[i]
    print("{0:s}: {1:.2f}".format(column_name, item))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Feature importances labelled with the column names of X
importances_rf = pd.Series(rf.feature_importances_, index = X.columns)

# Ascending order, so the most important feature ends up at the top of the plot
sorted_importances_rf = importances_rf.sort_values()

# Horizontal bar chart of the sorted importances
sorted_importances_rf.plot(kind='barh', color='lightgreen')
plt.show()

In [None]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid 'params_dt' for the decision tree 'dt'
params_dt = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [0.04, 0.06, 0.08],
    'max_features': [0.2, 0.4, 0.6, 0.8],
}

# 10-fold CV grid search 'grid_dt', scored by accuracy, on all cores
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=params_dt,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

# Run the search on the training data
grid_dt.fit(X_train, y_train)

In [None]:
# Winning hyperparameter combination of 'grid_dt'
best_hyperparams = grid_dt.best_params_
print('Best hyerparameters:\n', best_hyperparams)

# Cross-validated score of that combination
best_CV_score = grid_dt.best_score_
print('Best CV accuracy:{:.3}'.format(best_CV_score))

In [None]:
# Refitted best estimator of 'grid_dt'
best_model = grid_dt.best_estimator_
# Score it on the test split and report the result
test_acc = best_model.score(X_test, y_test)
print("Test set accuracy of best model: {:.3f}".format(test_acc))

## ADABoost

In [None]:
# Import models and utility functions
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Instantiate a classification-tree 'dt' (a depth-1 stump)
dt = DecisionTreeClassifier(max_depth=1, random_state=SEED)

# Instantiate an AdaBoost classifier 'adb_clf'. The stump is passed
# positionally because the keyword was renamed from `base_estimator` to
# `estimator` in scikit-learn and the old name removed. random_state makes
# the boosting run reproducible.
adb_clf = AdaBoostClassifier(dt, n_estimators=100, random_state=SEED)

# Fit 'adb_clf' to the training set
adb_clf.fit(X_train, y_train)

# Predict the test set probabilities of the positive class
# NOTE(review): column 1 is the positive class only for a binary target; confirm.
y_pred_proba = adb_clf.predict_proba(X_test)[:,1]

# Evaluate test-set roc_auc_score
adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba)

# Print adb_clf_roc_auc_score
print('ROC AUC score: {:.2f}'.format(adb_clf_roc_auc_score))

## 