In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/pablomdata/ml-bespoke/main/data/apartments.csv")

In [3]:
df.head()

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms,district
0,5897,1953,25,3,1,Srodmiescie
1,1818,1992,143,9,5,Bielany
2,3643,1937,56,1,2,Praga
3,3517,1995,93,7,3,Ochota
4,3013,1992,144,6,5,Mokotow


## Exercise
- Create a model that predicts `district` from the other attributes.
- Try with: logistic regression, decision trees, k-nearest neighbours and random forest.

In [7]:
X = df.loc[:,df.columns[:-1]]
y = df.district

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

pipe = make_pipeline(StandardScaler(), LogisticRegression())

pipe.fit(X_train,y_train)
pipe.score(X_test,y_test)

0.31

In [13]:
from sklearn.metrics import classification_report

y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Bemowo       0.17      0.41      0.24        17
     Bielany       0.00      0.00      0.00        17
     Mokotow       0.21      0.24      0.22        21
      Ochota       0.27      0.32      0.29        19
       Praga       0.00      0.00      0.00        19
 Srodmiescie       1.00      1.00      1.00        27
       Ursus       0.25      0.44      0.32        16
     Ursynow       0.33      0.04      0.07        25
        Wola       0.18      0.38      0.24        16
    Zoliborz       0.18      0.13      0.15        23

    accuracy                           0.31       200
   macro avg       0.26      0.29      0.25       200
weighted avg       0.29      0.31      0.28       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

clfs = [
    DecisionTreeClassifier(max_depth=4),
    DecisionTreeClassifier(max_depth=8),
    RandomForestClassifier(n_estimators=20),
    RandomForestClassifier(n_estimators=50)
]

best_score = 0
best_clf = None
score = 0

for clf in clfs:
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    if score > best_score:
        best_score = score
        best_clf = clf
        
print(best_clf)
print(best_score)

RandomForestClassifier(n_estimators=50)
0.335


In [15]:
params = [50, 60, 70, 80]

best_score = 0
best_clf = None
score = 0

for param in params:
    clf = RandomForestClassifier(n_estimators=param)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    if score > best_score:
        best_score = score
        best_clf = clf
        
print(best_clf)
print(best_score)

RandomForestClassifier(n_estimators=60)
0.35


In [19]:
best_clf.feature_importances_

array([0.35490146, 0.22528655, 0.22078448, 0.13017099, 0.06885653])

In [17]:
best_clf.feature_names_in_

array(['m2.price', 'construction.year', 'surface', 'floor', 'no.rooms'],
      dtype=object)