# Tutorial 2 - Logistic Regression


We will perform two prediction tasks:
1) Whether the price of an AIRBNB listing is greater than or equal to $150 (`price_gte_150` column),<br>
2) What is the price category, among 4 categories, of an AIRBNB listing (`price_category` column)

**The unit of analysis is an AIRBNB LISTING**

# Setup

In [None]:
# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so that steps which fall back on it
# (e.g. a train/test split called without an explicit random_state) are repeatable.
np.random.seed(42)


# Get the data

In [None]:
# Load the listings into a DataFrame and preview the first rows.
# The first task predicts the "price_gte_150" column of this data set.

airbnb = pd.read_csv("airbnb.csv")
airbnb.head()

# Split the data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the listings for testing. An explicit random_state keeps the
# split identical even when this cell is re-run without re-seeding NumPy first.
train_set, test_set = train_test_split(airbnb, test_size=0.3, random_state=42)

### Be careful: we haven't separated the target column yet

## Check the missing values

In [None]:
# Number of missing values in each column of the training set
train_set.isna().sum()

In [None]:
# Number of missing values in each column of the test set
test_set.isna().sum()

# Data Prep

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Drop the variables we can't use in this tutorial

In [None]:
# 'price' and 'price_category' describe the listing price itself, which is exactly
# what 'price_gte_150' is derived from, so they must not be used as inputs here.
# ('price_category' is picked up again later as the multi-class target.)

price_derived_columns = ['price', 'price_category']

train = train_set.drop(columns=price_derived_columns)
test = test_set.drop(columns=price_derived_columns)

## Separate the target variable (we don't want to transform it)

In [None]:
# Keep the target as a one-column DataFrame, separate from the inputs
target_column = 'price_gte_150'

train_y = train[[target_column]]
test_y = test[[target_column]]

# Every remaining column is a model input
train_inputs = train.drop(columns=[target_column])
test_inputs = test.drop(columns=[target_column])

##  Identify the numerical and categorical columns

In [None]:
# Show the dtype of every input column; used below to group the columns by type
train_inputs.dtypes

**At this stage, you can manually identify numeric, binary, and categorical columns as follows:**

`numeric_columns = ['latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'Number of amenities', 'guests_included', 'price_per_extra_person', 'minimum_nights', 'number_of_reviews', 'number_days_btw_first_last_review', 'review_scores_rating']`
 
 `binary_columns = ['host_is_superhost', 'host_identity_verified']`
 
 `categorical_columns = ['neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type', 'cancellation_policy']`
 
<br>
 
**If you do not want to type these manually, you can use the tricks below:**

In [None]:
# Numerical columns: every input column with a numeric dtype
numeric_columns = list(train_inputs.select_dtypes(include=[np.number]).columns)

# Categorical columns: every input column stored with the 'object' dtype
categorical_columns = list(train_inputs.select_dtypes(include='object').columns)

In [None]:
# Binary columns are listed by hand. They are kept out of the scaling and
# one-hot encoding steps; the pipeline defined below only imputes them.
binary_columns = ['host_is_superhost', 'host_identity_verified']

In [None]:
# Be careful: the numeric dtype selection also picks up the binary columns,
# so they have to be taken out of numeric_columns again.
# Rebuilding the list (instead of calling list.remove) keeps this cell safe to
# re-run: a second run is a no-op rather than a ValueError.

numeric_columns = [col for col in numeric_columns if col not in binary_columns]

In [None]:
# Inspect the final list of binary columns
binary_columns

In [None]:
# Inspect the final list of numeric columns (binary columns already removed)
numeric_columns

In [None]:
# Inspect the final list of categorical columns
categorical_columns

# Pipeline

In [None]:
# Numeric features: fill missing values with the column median, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:
# Categorical features: replace missing values with the label 'unknown', then
# one-hot encode; categories not seen during fitting are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Binary features: only fill missing values, using the most frequent value
binary_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
]
binary_transformer = Pipeline(steps=binary_steps)

In [None]:
# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='passthrough')

# remainder='passthrough' is optional: it keeps any column not listed above unchanged.
# (Without it, ColumnTransformer drops unlisted columns by default.)

# Transform: fit_transform() for TRAIN

In [None]:
# Fit the preprocessor on the training inputs and transform them in one step
train_x = preprocessor.fit_transform(train_inputs)

train_x

In [None]:
# (rows, columns) of the transformed training data
train_x.shape

# Transform: transform() for TEST

In [None]:
# Transform the test inputs with the preprocessor already fitted on the training
# data (transform() only, no fit)
test_x = preprocessor.transform(test_inputs)

test_x

In [None]:
# (rows, columns) of the transformed test data
test_x.shape

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(solver='lbfgs')

# train_y is a one-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning otherwise, so flatten it before fitting.
log_reg.fit(train_x, train_y.values.ravel())

## Accuracy

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the binary model on the training data

train_y_pred = log_reg.predict(train_x)
train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}'.format(train_acc))

In [None]:
# Accuracy of the binary model on the held-out test data

test_y_pred = log_reg.predict(test_x)
test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}'.format(test_acc))

# Baseline Accuracy

In [None]:
# Number of training listings in each target class
train_y.value_counts()

In [None]:
# Share of each class in the training set. The largest share is the baseline:
# the accuracy obtained by always predicting the most frequent class.
train_y.value_counts()/len(train_y)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Usually computed on the test set.
# In scikit-learn's output, rows are the actual classes and columns the predicted ones.
confusion_matrix(test_y, test_y_pred)

# Precision

In [None]:
from sklearn.metrics import precision_score

# Usually computed on the test set.
# Precision: of the listings predicted as positive, the share that really are positive.
# NOTE(review): relies on the default pos_label=1 - confirm the target is coded 0/1.
precision_score(test_y, test_y_pred)

# Recall

In [None]:
from sklearn.metrics import recall_score

# Usually computed on the test set.
# Recall: of the listings that really are positive, the share the model identified.
# NOTE(review): relies on the default pos_label=1 - confirm the target is coded 0/1.
recall_score(test_y, test_y_pred)

# F1 score

In [None]:
from sklearn.metrics import f1_score

# Usually computed on the test set.
# F1: the harmonic mean of precision and recall.
f1_score(test_y, test_y_pred)

# Softmax Regression (Multi-class)

In [None]:
# Preview the multi-class target for the first 10 training listings
train_set[['price_category']].head(10)

In [None]:
# New target for the multi-class task: 'price_category', taken from the original
# train/test sets (it was dropped from the inputs earlier). The inputs stay the same.
train_y_multiclass = train_set[['price_category']]
test_y_multiclass = test_set[['price_category']]

In [None]:
# With more than two classes, the lbfgs solver already fits a multinomial (softmax)
# model by default (scikit-learn >= 0.22), so the deprecated multi_class argument
# is left out. C=100 weakens the regularization; max_iter=1000 gives the solver
# more iterations to converge.
softmax_reg = LogisticRegression(solver='lbfgs', C=100, max_iter=1000)

# Flatten the one-column target DataFrame to 1-D to avoid a DataConversionWarning
softmax_reg.fit(train_x, train_y_multiclass.values.ravel())

## Accuracy

In [None]:
# Accuracy of the multi-class model on the training data

train_y_pred = softmax_reg.predict(train_x)
train_acc = accuracy_score(train_y_multiclass, train_y_pred)

print('Train acc: {}'.format(train_acc))

In [None]:
# Accuracy of the multi-class model on the held-out test data

test_y_pred = softmax_reg.predict(test_x)
test_acc = accuracy_score(test_y_multiclass, test_y_pred)

print('Test acc: {}'.format(test_acc))

## Confusion Matrix

In [None]:
# Multi-class confusion matrix on the test set
# (rows are the actual categories, columns the predicted ones)
confusion_matrix(test_y_multiclass, test_y_pred)

## Baseline

In [None]:
# Share of each price category in the training set; the largest share is the
# accuracy of always predicting the most frequent category
train_y_multiclass.value_counts()/len(train_y_multiclass)

## Predicting a single observation

In [None]:
# Pick one observation to inspect: row position 1700 of the transformed test set.
# (The position is fixed, not drawn at random; the one-row slice keeps the data 2-D.)
rand_obs = test_x[1700:1701]
rand_obs

In [None]:
# Actual price category of that same observation (row position 1700 of the test set)
test_y_multiclass.iloc[1700:1701]

In [None]:
# Price category the softmax model predicts for that observation
softmax_reg.predict(rand_obs)