In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
import pandas as pd
import numpy as np
from scipy.stats import t

In [4]:
# Fetch data
df = pd.read_csv("preprocessed_data.csv")
df.head()

Unnamed: 0,monthly_rent,monthly_aconto,housing_type,size_sqm,rooms,floor,furnished,roommate_friendly,pets_allowed,elevator,...,dryer,deposit,prepaid_rent,energy_mark,area,availability_in,days_on_website,total_monthly_rent,student_affordable,months_on_website
0,12850.0,1350.0,Lejlighed,86.0,3,3,Ja,Nej,Nej,Nej,...,Ikke angivet,38550.0,0.0,D,Frederiksberg C,<1 month,10.0,14200.0,False,<1 month
1,15700.0,1224.0,Lejlighed,113.0,4,4,Nej,Nej,Ja,Ja,...,Ja,47100.0,15700.0,A15,Bagsværd,1-3 months,10.0,16924.0,False,<1 month
2,5000.0,1000.0,Værelse,9.0,1,0,Ja,Ja,Nej,Nej,...,Ja,15000.0,0.0,A20,Kastrup,1-3 months,10.0,6000.0,True,<1 month
3,3500.0,500.0,Værelse,14.0,1,0,Ja,Ja,Nej,Nej,...,Ja,4000.0,3500.0,C,København S,1-3 months,10.0,4000.0,True,<1 month
4,4650.0,0.0,Værelse,7.0,1,4,Ja,Nej,Nej,Ja,...,Ikke angivet,9300.0,0.0,none,København V,<1 month,10.0,4650.0,True,<1 month


In [5]:
# Splitting the data into features and target for classification
X = df.drop(columns=['student_affordable']).copy()
y = df['student_affordable'].copy()

In [6]:
y = y.apply(lambda x: 'Affordable' if x else 'Non affordable')

In [7]:
# Remove columns
cols_to_remove = ['monthly_rent', 'total_monthly_rent', 'monthly_aconto', 'prepaid_rent', 'deposit']
X.drop(cols_to_remove, axis=1, inplace=True, errors='ignore')
print(df.select_dtypes(include=['float64']).columns.to_list())
print(X.columns.to_list())

['monthly_rent', 'monthly_aconto', 'size_sqm', 'deposit', 'prepaid_rent', 'days_on_website', 'total_monthly_rent']
['housing_type', 'size_sqm', 'rooms', 'floor', 'furnished', 'roommate_friendly', 'pets_allowed', 'elevator', 'senior_friendly', 'students_only', 'balcony_terrace', 'parking', 'dishwasher', 'washing_machine', 'charging_station', 'dryer', 'energy_mark', 'area', 'availability_in', 'days_on_website', 'months_on_website']


In [8]:
# Transformation of predictors

# Standardization
cols_to_standardize = df.select_dtypes(include=['float64']).columns.to_list()
cols_to_standardize = [col for col in cols_to_standardize if col not in cols_to_remove]
scaler = StandardScaler()
X[cols_to_standardize] = scaler.fit_transform(X[cols_to_standardize])

# Make the categorical variables into dummies
X = pd.get_dummies(X)

In [9]:
# Split to train and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, stratify=y)

In [10]:
print(y.value_counts())

student_affordable
Non affordable    1596
Affordable         225
Name: count, dtype: int64


In [11]:
# Logistic regression with l2 regularization

lmbda = 0.01
lr_model = LogisticRegression(penalty='l2', C=(1/lmbda), max_iter=1000).fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test,y_pred)
print("Logistic regression accuracy: {}".format(lr_accuracy))

print(confusion_matrix(y_test, y_pred))

Logistic regression accuracy: 0.9232876712328767
[[ 29  16]
 [ 12 308]]


In [12]:
# Baseline model
class BaselineModel:
    def __init__(self, prediction_value = None):
        self.prediction_value = prediction_value
        
    def fit(self, X, y):
        self.prediction_value = y.value_counts().idxmax()
        return self
    
    def predict(self, X):
        return pd.Series(self.prediction_value, index=X.index)
    
    def get_params(self, deep=True):
        # Return parameters as a dictionary
        return {"prediction_value": self.prediction_value}
    
    def set_params(self, **params):
        # Set parameters from a dictionary
        for key, value in params.items():
            setattr(self, key, value)
        return self
    
baseline_model = BaselineModel().fit(X_train, y_train)
y_pred = baseline_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test,y_pred)
print("Baseline accuracy: {}".format(baseline_accuracy))

Baseline accuracy: 0.8767123287671232


In [13]:
# Multi-layer perceptron
lmbda = 64
mlp_model = MLPClassifier(hidden_layer_sizes=(lmbda,), max_iter=10000).fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
mlp_accuracy = accuracy_score(y_test,y_pred)
print("Multi-layer perceptron accuracy: {}".format(mlp_accuracy))

print(confusion_matrix(y_test, y_pred))

Multi-layer perceptron accuracy: 0.9287671232876712
[[ 30  15]
 [ 11 309]]


In [14]:
# Two level cross validation (setup)
n = 10
outer_fold = KFold(n_splits=n, shuffle=True)
inner_fold = KFold(n_splits=n, shuffle=True)

classifiers = [
    LogisticRegression(penalty='l2', max_iter=1000),
    BaselineModel(),
    MLPClassifier(max_iter=500, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10)
]

params = {
    classifiers[0].__class__.__name__: {"C": [0.1, 1, 10, 100, 1000, 10000]},
    classifiers[1].__class__.__name__: {},
    classifiers[2].__class__.__name__: {"hidden_layer_sizes": [(8,), (16,), (32,), (64,)]}
}

def calculate_error(y_true: pd.Series, y_pred: pd.Series):
    # Calculate the number of misclassified samples
    n_misclassified = np.sum(y_true != y_pred)
    # Calculate the proportion of misclassified samples
    error_rate = n_misclassified / len(y_true)
    return error_rate

error_scorer = make_scorer(calculate_error, greater_is_better=False)

predictions = {key:[] for key in params.keys()}
test_errors = {key:[] for key in params.keys()}
targets = []

In [15]:
for train_idx, test_idx in outer_fold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    targets.append(y_test)

    for classifier in classifiers:
        # Nested CV with parameter optimization
        clf = GridSearchCV(
            estimator=classifier, 
            param_grid=params[classifier.__class__.__name__], 
            cv=inner_fold, 
            scoring=error_scorer
        )
        
        clf.fit(X_train, y_train)
        best_estimator = clf.best_estimator_
        y_pred = best_estimator.predict(X_test)
        error = calculate_error(y_test, y_pred)
        
        predictions[classifier.__class__.__name__].append(y_pred)
        test_errors[classifier.__class__.__name__].append((clf.best_params_, error))

print(test_errors)

{'LogisticRegression': [({'C': 1}, 0.060109289617486336), ({'C': 1}, 0.04945054945054945), ({'C': 1}, 0.07142857142857142), ({'C': 1}, 0.054945054945054944), ({'C': 1}, 0.04395604395604396), ({'C': 1}, 0.04945054945054945), ({'C': 0.1}, 0.038461538461538464), ({'C': 0.1}, 0.04945054945054945), ({'C': 1}, 0.054945054945054944), ({'C': 1}, 0.06043956043956044)], 'BaselineModel': [({}, 0.16939890710382513), ({}, 0.12087912087912088), ({}, 0.19230769230769232), ({}, 0.10989010989010989), ({}, 0.07142857142857142), ({}, 0.13736263736263737), ({}, 0.07142857142857142), ({}, 0.12637362637362637), ({}, 0.13186813186813187), ({}, 0.1043956043956044)], 'MLPClassifier': [({'hidden_layer_sizes': (32,)}, 0.08743169398907104), ({'hidden_layer_sizes': (64,)}, 0.06043956043956044), ({'hidden_layer_sizes': (64,)}, 0.07142857142857142), ({'hidden_layer_sizes': (64,)}, 0.054945054945054944), ({'hidden_layer_sizes': (64,)}, 0.04395604395604396), ({'hidden_layer_sizes': (64,)}, 0.06593406593406594), ({'hid

In [16]:
# Visualise test_errors in dataframe

errors_df = pd.DataFrame(columns=["Outer fold", "MLP_hidden_units", "MLP_test_error", "LR_lambda", "LR_test_error", "Baseline_test_error"])
errors_df["Outer fold"] = range(10)
errors_df[["MLP_hidden_units", 'MLP_test_error']] = \
    [(item[0]['hidden_layer_sizes'][0], item[1]) for item in test_errors['MLPClassifier']]
errors_df[["LR_lambda", 'LR_test_error']] = \
    [(1 / item[0]['C'], item[1]) for item in test_errors['LogisticRegression']]
errors_df['Baseline_test_error'] = \
    [item[1] for item in test_errors['BaselineModel']]

errors_df.astype({'MLP_hidden_units': 'int32'})

Unnamed: 0,Outer fold,MLP_hidden_units,MLP_test_error,LR_lambda,LR_test_error,Baseline_test_error
0,0,32,0.087432,1.0,0.060109,0.169399
1,1,64,0.06044,1.0,0.049451,0.120879
2,2,64,0.071429,1.0,0.071429,0.192308
3,3,64,0.054945,1.0,0.054945,0.10989
4,4,64,0.043956,1.0,0.043956,0.071429
5,5,64,0.065934,1.0,0.049451,0.137363
6,6,16,0.032967,10.0,0.038462,0.071429
7,7,64,0.043956,10.0,0.049451,0.126374
8,8,32,0.082418,1.0,0.054945,0.131868
9,9,64,0.049451,1.0,0.06044,0.104396


In [17]:
# Statistical evaluation of two methods
def statistical_evaluation(e1, e2, alpha = 0.05):
    r = e1 - e2
    n = len(r)
    
    t_critical = t.ppf( (1-alpha)/2, df = n-1)
    error_margin = t_critical * (np.std(r) / np.sqrt(n))
    
    ci_lower = np.mean(r) - error_margin
    ci_upper = np.mean(r) + error_margin
    
    # Calculate t-statistic for the test
    t_stat = np.mean(r) / (np.std(r, ddof=1) / np.sqrt(n))
    
    # Calculate the p-value (two-tailed)
    p_value = 2 * (1 - t.cdf(abs(t_stat), df = n - 1))
    
    print(f"{(1 - alpha) * 100}% confidence interval for the difference in generalization errors: [{ci_lower}, {ci_upper}] with p-value : {p_value}")


In [19]:
np.std(errors_df['MLP_test_error']-errors_df['LR_test_error'])

0.01308194994206264

In [20]:
t.ppf( (1-0.05)/2, df = n-1)

-0.06447679010158362

In [18]:
statistical_evaluation(errors_df['MLP_test_error'], errors_df['LR_test_error'])
statistical_evaluation(errors_df['MLP_test_error'], errors_df['Baseline_test_error'])
statistical_evaluation(errors_df['LR_test_error'], errors_df['Baseline_test_error'])

95.0% confidence interval for the difference in generalization errors: [0.006295676206837871, 0.005762211260885664] with p-value : 0.20013316179470464
95.0% confidence interval for the difference in generalization errors: [-0.0637278260541341, -0.06475352865672881] with p-value : 3.119722478395204e-05
95.0% confidence interval for the difference in generalization errors: [-0.06967046067549203, -0.0708687815030944] with p-value : 5.229492524438939e-05
