In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [45]:
df = pd.read_csv('Cleaned_Dataset.csv')

In [46]:
def change_data_types(df):

    num_cols_to_convert = df.shape[1] - 5

    cols_to_convert = df.columns[:num_cols_to_convert]

    dtype_dict = {col: np.int8 for col in cols_to_convert}

    df[cols_to_convert] = df[cols_to_convert].astype(dtype_dict)
change_data_types(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499942 entries, 0 to 499941
Columns: 1875 entries, city_Abbot to price
dtypes: float64(5), int8(1870)
memory usage: 910.7 MB


### Binning

In [47]:
percentile_33 = df['price'].quantile(0.33)
percentile_67 = df['price'].quantile(0.67)

In [48]:
price_list = df['price'].tolist()

In [49]:
def create_bins(df, list):
    bins_list = []
    for i in list:
        if i <= percentile_33:
            bins_list.append("Low")
        elif i > percentile_33 and i <= percentile_67:
            bins_list.append("Middle")
        else:
            bins_list.append("High")
    df = pd.Series(bins_list)
    return df

### Test-Train split

In [50]:
features = df.drop(columns=['price'])
label = df["price"]

In [51]:
X_train , X_test , y_train , y_test = train_test_split(features, label, test_size=0.2, random_state=42)

### Logistic Regression

In [52]:
y_bins_test = create_bins(y_test, y_test.tolist())
y_bins_train = create_bins(y_train, y_train.tolist())

In [53]:
logistic_model = OneVsRestClassifier(LogisticRegression(solver='liblinear',  max_iter=10000))
logistic_model.fit(X_train, y_bins_train)

In [73]:
y_pred = logistic_model.predict(X_test)

In [78]:
#printing scores
print("Accuracy: ", accuracy_score(y_bins_test, y_pred))
print("Precision: ", precision_score(y_bins_test, y_pred, average='macro'))
print("Recall: ", recall_score(y_bins_test, y_pred, average='macro'))
print("F1 Score: ", f1_score(y_bins_test, y_pred, average='macro'))
print("Confusion Matrix: \n", classification_report(y_bins_test, y_pred))

Accuracy:  0.6923661602776305
Precision:  0.6895305261267102
Recall:  0.693981009682877
F1 Score:  0.690353899977383
Confusion Matrix: 
               precision    recall  f1-score   support

        High       0.74      0.77      0.75     32917
         Low       0.70      0.76      0.73     32985
      Middle       0.63      0.55      0.59     34087

    accuracy                           0.69     99989
   macro avg       0.69      0.69      0.69     99989
weighted avg       0.69      0.69      0.69     99989



### Linear Regression

In [56]:
model = LinearRegression()

In [57]:
model.fit(X_train, y_train)

In [58]:
y_pred = model.predict(X_test)

In [59]:
r2 = r2_score(y_test, y_pred)

In [60]:
print(f"R-squared: {r2:.4f}")

R-squared: -3040162107041218048.0000


### Trees 

#### CART

In [61]:
cart_tree = DecisionTreeRegressor()  

In [62]:
cart_tree.fit(X_train, y_train)

In [63]:
y_pred = cart_tree.predict(X_test)

In [64]:
r2_trees = r2_score(y_test, y_pred)

In [65]:
print(f"R-squared: {r2_trees:.4f}")

R-squared: 0.5108


#### Random Forest

In [66]:
random_forest = RandomForestRegressor(n_estimators=10, random_state=42)
random_forest.fit(X_train, y_train)

In [67]:
y_pred = random_forest.predict(X_test)

In [68]:
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2:.4f}")

R-squared: 0.6943
