In [2]:
def mode(df):
    """Return the most frequent value of a Series as a single scalar.

    This function is passed as an aggregation function to ``pivot_table``
    in ``read_data``, where each group has to be reduced to one value.
    The previous version returned a one-element ``Index`` (and an empty
    ``Index`` for an empty group) rather than the value itself.

    Parameters
    ----------
    df : pandas.Series
        Values of one group. (The name is kept for compatibility even
        though ``value_counts`` is called on it like on a Series.)

    Returns
    -------
    scalar
        The value with the highest count as ordered by ``value_counts``,
        or ``nan`` when there is nothing to count (empty input, or only
        missing values, which ``value_counts`` drops by default).
    """
    counts = df.value_counts()
    if counts.empty:
        return float("nan")
    # value_counts() orders by descending frequency, so the first label is the mode.
    return counts.index[0]
    
def read_data():
    """Load the listings table and build a per-area summary joined to the district shapes.

    Reads ``"data full clean.csv"`` and ``'bezirksgrenzen.shp'`` from the
    current working directory.

    Returns
    -------
    tuple
        ``(df, df1)``: ``df`` is the CSV exactly as read; ``df1`` is the
        shapefile frame concatenated column-wise with the per-``area``
        pivot table, with its columns renamed and the four rent columns
        rounded to zero decimals.
    """
    df = pd.read_csv("data full clean.csv")

    # Per-area statistics: rent mean/min/max/mode, a count of "rooms", year min/max.
    aggregations = {"rent":["mean","min","max",mode],"rooms":"count","year":["min","max"]}
    df_pivot = df.pivot_table(index = "area",
                              values = ["rent","rooms","year"],
                              aggfunc = aggregations)

    # Read the shapefile-formatted (.shp) file
    gdf = gpd.read_file('bezirksgrenzen.shp')

    # The shapefile's index labels are replaced positionally by these names.
    # NOTE(review): this assumes the shapefile rows come in exactly this
    # order - verify against the file.
    district_names = ["Reinickendorf","Charlottenburg-Wilmersdorf",
                      "Treptow-Köpenick","Pankow","Neukölln","Lichtenberg",
                      "Marzahn-Hellersdorf","Spandau","Steglitz-Zehlendorf",
                      "Mitte","Friedrichshain-Kreuzberg","Tempelhof-Schöneberg"]
    original_labels = gdf.index.values.tolist()
    gdf = gdf.rename(index=dict(zip(original_labels, district_names)))

    # Align shapes and statistics on the index, then relabel every column by position.
    # NOTE(review): the flat names rely on the column order produced by
    # read_file and pivot_table - confirm they line up.
    df1 = pd.concat([gdf,df_pivot], axis = 1)
    df1.columns = ["geometry","max_rent","mean_rent","min_rent","mode_rent",
                   "available_apartments","newest_building","oldest_building"]

    # Round each rent statistic element by element.
    for rent_col in ("max_rent","mean_rent","min_rent","mode_rent"):
        df1[rent_col] = [round(value,0) for value in df1[rent_col]]
    return df,df1

In [3]:
def run_pipeline_regr(X_train, X_test, y_train, y_test):
    """Fit five default regressors and collect their predictions and scores.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and held-out splits.
    y_train, y_test : array-like
        Targets matching ``X_train`` / ``X_test``.

    Returns
    -------
    tuple of dict
        ``(preds_train, preds_test, scores_train, scores_test)``, each keyed
        by model name (``"linreg"``, ``"decision_tree_reg"``, ``"knn_reg"``,
        ``"mlp"``, ``"random_forest_reg"``).

        * ``preds_*`` hold the fitted model's predictions on each split.
        * ``scores_train`` is the mean 5-fold cross-validation score on the
          training split.
        * ``scores_test`` is the fitted model's ``score`` on the held-out
          split. It used to be a 5-fold cross-validation run *on the test
          split*, which refits fresh clones on test data and therefore
          never evaluated the model trained above.
    """
    # Function-scope imports, like the other helpers in this notebook, so the
    # function does not depend on which cells were executed beforehand.
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.neural_network import MLPRegressor
    from sklearn.tree import DecisionTreeRegressor

    # Insertion order is the fitting order.
    models = {
        "linreg": LinearRegression(),
        "decision_tree_reg": DecisionTreeRegressor(),
        "knn_reg": KNeighborsRegressor(),
        "mlp": MLPRegressor(max_iter = 700),
        "random_forest_reg": RandomForestRegressor(),
    }

    preds_train, preds_test = {}, {}
    scores_train, scores_test = {}, {}
    for name, estimator in models.items():
        estimator.fit(X_train, y_train)
        preds_train[name] = estimator.predict(X_train)
        preds_test[name] = estimator.predict(X_test)
        # cross_val_score works on clones, so the fitted estimator is untouched.
        scores_train[name] = cross_val_score(estimator, X_train, y_train, cv = 5).mean()
        # Held-out evaluation of the model fitted on the training split.
        scores_test[name] = estimator.score(X_test, y_test)

    return preds_train, preds_test, scores_train, scores_test

In [13]:
def grid_search(model,X_train,y_train):
    """Run a 5-fold ``GridSearchCV`` for one of the supported regressors.

    Prints the (unfitted) estimator, the best parameter combination and the
    best mean cross-validation score, in that order.

    Parameters
    ----------
    model : str
        One of ``"dtreg"``, ``"knreg"``, ``"mlp"`` or ``"rfreg"``.
    X_train, y_train : array-like
        Data the search is fitted on.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object, so callers can read ``best_params_`` /
        ``best_estimator_`` instead of copying printed values by hand.
        (Previously nothing was returned; existing callers that ignore the
        result are unaffected.)

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. Previously this
        surfaced later as an unbound-variable error on ``grid``.
    """
    depth_choices = [3,4,5,6,7,None]
    criterion_choices = ['squared_error','absolute_error']
    min_split_choices = [2,3,4,5,6,7]
    min_leaf_choices = [2,3,4,5,6,7]

    # Search space per supported model name.
    param_grids = {
        "dtreg": {'max_depth': depth_choices,
                  'criterion': criterion_choices,
                  'min_samples_split': min_split_choices,
                  'min_samples_leaf': min_leaf_choices},
        "knreg": {"n_neighbors": [3,4,5,6,7],
                  "weights": ["uniform","distance"],
                  "algorithm": ["auto"]},
        "mlp": {"max_iter": [200, 300, 400, 500, 600, 700]},
        "rfreg": {"max_depth": depth_choices,
                  "min_samples_split": min_split_choices,
                  "min_samples_leaf": min_leaf_choices,
                  "criterion": criterion_choices},
    }

    # Validate before importing or fitting anything so a typo fails fast.
    if not isinstance(model, str) or model not in param_grids:
        raise ValueError(
            f"unknown model {model!r}; expected one of {sorted(param_grids)}"
        )

    from sklearn.model_selection import GridSearchCV
    if model == "dtreg":
        from sklearn.tree import DecisionTreeRegressor
        estimator = DecisionTreeRegressor()
    elif model == "knreg":
        from sklearn.neighbors import KNeighborsRegressor
        estimator = KNeighborsRegressor()
    elif model == "mlp":
        from sklearn.neural_network import MLPRegressor
        estimator = MLPRegressor()
    else:  # "rfreg" - the only remaining key of param_grids
        from sklearn.ensemble import RandomForestRegressor
        estimator = RandomForestRegressor()

    search = GridSearchCV(estimator = estimator, param_grid = param_grids[model], cv = 5)
    search.fit(X_train, y_train)
    print(estimator)
    print(search.best_params_)
    print(search.best_score_)
    return search


In [9]:
def preprocessing(X,y):
    """Filter features by correlation, split, scale numerics and one-hot encode objects.

    Steps, in order:

    1. Display ``X.shape``; repeatedly drop every column whose Pearson
       correlation with the target is below 0.8, displaying the shape after
       each pass, until nothing is left to drop. Non-numeric columns take no
       part in the correlation and are therefore always kept.
       NOTE(review): the comparison is on the signed correlation, so strongly
       *negatively* correlated features are dropped too - confirm intended.
    2. Split 80/20 with ``random_state = 43``.
    3. Min-max scale the numeric columns (scaler fitted on the training split).
    4. One-hot encode the object columns (encoder fitted on the training
       split; categories unseen in training are ignored).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Target. Its ``name`` is used to look up the target's column in the
        correlation matrix (previously hard-coded to ``"rent"``).

    Returns
    -------
    tuple
        ``(X_train_processed, X_test_processed, y_train, y_test)``. The two
        processed frames carry a fresh 0..n-1 index and string column names;
        ``y_train`` / ``y_test`` keep the index produced by the split.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

    target = y.name

    def weakly_correlated(features):
        # Correlate only numeric/bool columns. Calling corr() on a frame that
        # still holds object columns raises on pandas >= 2.0; older versions
        # skipped such columns silently, which this selection reproduces.
        numeric = pd.concat([features,y], axis = 1).select_dtypes(include = ["number","bool"])
        corr_with_target = numeric.corr()[target]
        return corr_with_target[corr_with_target < 0.8].index.tolist()

    display(X.shape)
    cols_to_drop = weakly_correlated(X)
    while cols_to_drop:
        X = X.drop(columns = cols_to_drop)
        cols_to_drop = weakly_correlated(X)
        display(X.shape)

    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 43)
    num_train = X_train.select_dtypes("number")
    num_test = X_test.select_dtypes("number")
    cat_train = X_train.select_dtypes(object)
    cat_test = X_test.select_dtypes(object)

    # Fit on the training split only, then apply the same transform to both splits.
    scaler = MinMaxScaler().fit(num_train)
    num_cols = scaler.get_feature_names_out(input_features = num_train.columns)
    num_train_scaled = pd.DataFrame(scaler.transform(num_train), columns = num_cols)
    num_test_scaled = pd.DataFrame(scaler.transform(num_test), columns = num_cols)

    encoder = OneHotEncoder(handle_unknown = "ignore").fit(cat_train)
    # Name the encoded columns. Leaving them as 0..k next to the string-named
    # numeric columns gives mixed-type column labels, which scikit-learn >= 1.2
    # rejects when the frame is passed to fit/predict.
    cat_cols = encoder.get_feature_names_out()
    cat_train_encoded = pd.DataFrame(encoder.transform(cat_train).toarray(), columns = cat_cols)
    cat_test_encoded = pd.DataFrame(encoder.transform(cat_test).toarray(), columns = cat_cols)

    # Both halves were built from plain arrays, so they share a 0..n-1 index
    # and line up row by row.
    X_train_processed = pd.concat([num_train_scaled, cat_train_encoded], axis=1)
    X_test_processed = pd.concat([num_test_scaled, cat_test_encoded], axis=1)
    return X_train_processed, X_test_processed, y_train, y_test
    

In [1]:
import streamlit as st
import time
import pandas as pd
import geopandas as gpd
import numpy as np

In [5]:
# Show every column when a DataFrame is displayed (no column limit).
pd.options.display.max_columns = None

In [10]:
df,df1 = read_data()

# Slice the raw table by column position: the first nine columns (minus the
# two free-text ones), everything from the tenth up to but excluding the last
# column, and "year" on its own.
df01 = df.iloc[:,:9].drop(columns = ["link","addresse"])
df02 = df.iloc[:,9:-1]
df03 = df[["year"]]

df02num = df02.select_dtypes(np.number)
# In the object columns the string "0" is replaced by "no info"
# (presumably "0" marks a missing entry - verify against the data source).
# Vectorized replacement of exact "0" values; everything else is left as is.
df02obj = df02.select_dtypes(object).replace("0", "no info")

data = pd.concat([df01,df02num,df02obj,df03], axis = 1)

# Force these two columns to int64.
int_cols = ["pets_allowed","person_elevator"]
for name in int_cols:
    data[name] = data[name].astype("int64")

X = data.drop(columns = ["rent"])
y = data["rent"]

In [11]:
# Filter, split, scale and encode the features; preprocessing() also displays
# X's shape before and after the correlation-based column filtering.
X_train_processed, X_test_processed, y_train, y_test = preprocessing (X,y)

(407, 37)

(407, 13)

In [14]:
# Run the hyper-parameter search for every supported regressor on the training
# split; grid_search() prints the estimator, its best parameters and the best
# mean cross-validation score.
model = ["dtreg", "knreg", "mlp", "rfreg"]
for i in model:
    grid_search(i, X_train_processed, y_train)

DecisionTreeRegressor()
{'criterion': 'absolute_error', 'max_depth': 5, 'min_samples_leaf': 7, 'min_samples_split': 4}
0.6966047456819042
KNeighborsRegressor()
{'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'distance'}
0.5219697686256035
MLPRegressor()
{'max_iter': 700}
0.12155517581916098
RandomForestRegressor()
{'criterion': 'absolute_error', 'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 3}
0.7590302129948594


In [15]:
# Same search as above, but fitted on the test split.
# NOTE(review): this tunes hyper-parameters on the held-out data; parameters
# picked here should not be used for a model that is later evaluated on the
# same split - confirm whether this cell is only meant as a comparison.
model = ["dtreg", "knreg", "mlp", "rfreg"]
for i in model:
    grid_search(i, X_test_processed, y_test)

DecisionTreeRegressor()
{'criterion': 'absolute_error', 'max_depth': 4, 'min_samples_leaf': 6, 'min_samples_split': 2}
0.6826295769465567
KNeighborsRegressor()
{'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'distance'}
0.004228078545188252
MLPRegressor()
{'max_iter': 700}
-1.5523642164283682
RandomForestRegressor()
{'criterion': 'absolute_error', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 7}
0.6680000473539521


In [19]:
import warnings
from warnings import simplefilter

# RandomForestRegressor was previously only imported inside grid_search(), so
# this cell raised NameError on a fresh kernel.
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Silence FutureWarning and ConvergenceWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
simplefilter("ignore", category=ConvergenceWarning)

# NOTE(review): these hyper-parameters equal the result printed by the grid
# search that was run on the test split, not the one run on the training
# split - consider switching to the training-split result.
regr = RandomForestRegressor(criterion= 'absolute_error',
                            max_depth= None,
                            min_samples_leaf= 4,
                            min_samples_split= 7)
regr.fit(X_train_processed, y_train)
pred_test = regr.predict(X_test_processed)

# Mean 5-fold cross-validation score on the training split (fits clones of regr).
mean_score_train = np.mean(cross_val_score(regr, X_train_processed, y_train, cv = 5))
# R^2 of the fitted model on the held-out split. This used to be a
# cross-validation run on the test split, which refits on test data and does
# not evaluate the model trained above. The variable name is kept so later
# cells keep working.
mean_score_test = r2_score(y_test, pred_test)

In [27]:
import plotly.graph_objs as go

# Plot actual vs. predicted rent for the test split, one point per test row
# in split order (the x value is the row's position, not its original index).
x = list(range(y_test.shape[0]))

trace1 = go.Scatter(x=x, y=y_test, mode = "lines+markers", name = "actual price")
trace2 = go.Scatter(x=x, y=pred_test, mode = "lines+markers", name = "predicted price")

layout = go.Layout(title = "rent prediction")
fig = go.Figure(data=[trace1,trace2], layout = layout)
# Axis titles so the figure can be read on its own.
fig.update_layout(template="plotly_dark",
                  xaxis_title="test sample (position in test split)",
                  yaxis_title="rent")
fig.show()