In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Load the data
df = pd.read_csv('houseRent.csv')

# Split the data into features (X) and target variable (y)
X = df[['type', 'state', 'baths', 'beds', 'sqfeet']]
y = df['price']

# One-hot encode categorical features
ohe = OneHotEncoder()
X_cat = ohe.fit_transform(X[['type', 'state']])
X_cat = pd.DataFrame(X_cat.toarray(), columns=ohe.get_feature_names_out(['type', 'state']))

# Scale numerical features
scaler = StandardScaler()
X_num = scaler.fit_transform(X[['baths', 'beds', 'sqfeet']])
X_num = pd.DataFrame(X_num, columns=['baths', 'beds', 'sqfeet'])

# Combine the categorical and numerical features
X = pd.concat([X_cat, X_num], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fine-tune the model using Grid Search
param_grid = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt = DecisionTreeRegressor(random_state=42)

grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score from the grid search
print('Best Parameters:', grid_search.best_params_)
print('Best Score:', grid_search.best_score_)

# Fit the model with the best hyperparameters
dt = DecisionTreeRegressor(**grid_search.best_params_, random_state=42)
dt.fit(X_train, y_train)

# Make predictions on the test set
dt_pred = dt.predict(X_test)

# Evaluate the model using Mean Squared Error
dt_mse = mean_squared_error(y_test, dt_pred)
print('Decision Tree MSE:', dt_mse)

# Make predictions on new data
new_data = pd.DataFrame({
    'type': ['house'],
    'state': ['ca'],
    'baths': [1],
    'beds': [2],
    'sqfeet': [1000]
})
new_data_cat = ohe.transform(new_data[['type', 'state']])
new_data_cat = pd.DataFrame(new_data_cat.toarray(), columns=ohe.get_feature_names_out(['type', 'state']))
new_data_num = new_data[['baths', 'beds', 'sqfeet']]
new_data_num = scaler.transform(new_data_num)
new_data_num = pd.DataFrame(new_data_num, columns=['baths', 'beds', 'sqfeet'])
new_data = pd.concat([new_data_cat, new_data_num], axis=1)
prediction = dt.predict(new_data)
print('Predicted rent:', prediction)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best Score: -355.0500707523465
Decision Tree MSE: 2466925767.097344
Predicted rent: [2465.46692547]


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Decision tree accuracy:", accuracy)

Decision tree accuracy: 0.4295583893843017
