In [18]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from nltk.tokenize import RegexpTokenizer
from preprocess import preprocess
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load the training data from a CSV file located in the "data" directory
train_data = pd.read_csv(os.path.join("data", "train.csv"))

# Load the test data from a CSV file located in the "data" directory
test_data = pd.read_csv(os.path.join("data", "test.csv"))

# Fill missing values in the "description" column of the training data with empty strings
train_data["description"].fillna("", inplace=True)

# Fill missing values in the "host_is_superhost" column of the training data with "f" (assumed to mean "false")
train_data["host_is_superhost"].fillna("f", inplace=True)

# Fill missing values in the "beds" column of the training data with "1" (assumption)
train_data["beds"].fillna("1", inplace=True)

# Remove rows with missing values in any column from the training data
train_data.dropna(inplace=True)

# Separate the features (X) and the target variable (y) from the training data
X, y = train_data.drop(["price"], axis=1), train_data["price"]

# Convert the target variable (y) to integer type
y = y.astype(int)

# Split the training data into training and validation sets
# - 80% for training, 20% for validation
# - Use a random seed (random_state=42) for reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# Assign the test data to the variable X_test for further processing
X_test = test_data

# Apply a preprocessing function (preprocess) to the training, validation, and test data
X_train, X_val, X_test = preprocess(X_train), preprocess(X_val), preprocess(X_test)



In [21]:
X_train["picture_url"]

KeyError: 'picture_url'

In [8]:
# Initialize a RobustScaler, which is used for robust feature scaling
scaler = RobustScaler()

# Fit the scaler to the training data, computing the necessary statistics
scaler.fit(X_train)

# Transform (scale) the training data using the fitted scaler
X_train = scaler.transform(X_train)

# Transform (scale) the validation data using the same scaler to ensure consistent scaling
X_val = scaler.transform(X_val)


Results may deviate from what I reported since I did a final cross validation run over some additional variables. 

In [15]:
# Import the RandomForestClassifier from scikit-learn
from sklearn.ensemble import RandomForestClassifier

# Initialize a RandomForestClassifier
rf = RandomForestClassifier()

# Define a dictionary of hyperparameters for tuning the RandomForestClassifier
hyperparameters = {
    'n_estimators': [100, 200, 300, 400, 500],  # Number of trees in the forest
    # 'class_weight': [None, 'balanced', 'balanced_subsample'],  # Class weights (None means equal weights)
    'max_depth': [20, 30, 40, 50],  # Maximum depth of the trees
    'min_samples_split': [5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be a leaf node
    'max_features': ['log2', 'sqrt'],  # Number of features to consider when splitting
    'criterion': ["gini", "entropy", "log_loss"],  # Split criterion for decision trees
}

# Initialize a GridSearchCV object, which performs hyperparameter tuning
# - rf: the base classifier (RandomForestClassifier)
# - hyperparameters: the hyperparameters to search over
# - cv=5: 5-fold cross-validation for evaluating hyperparameter combinations
# - verbose=3: display verbose output
# - n_jobs=20: use 20 parallel jobs for grid search (if available)
# - scoring="accuracy": use accuracy as the scoring metric
clf = GridSearchCV(rf, hyperparameters, cv=5, verbose=3, n_jobs=20, scoring="accuracy")

# Perform grid search to find the best hyperparameters for the RandomForestClassifier
best_rf = clf.fit(X_train, y_train)


Fitting 5 folds for each of 720 candidates, totalling 3600 fits
[CV 5/5] END criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.541 total time=   1.2s
[CV 4/5] END criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.528 total time=   1.3s
[CV 1/5] END criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.528 total time=   1.3s
[CV 3/5] END criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.537 total time=   1.5s
[CV 2/5] END criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.524 total time=   1.5s
[CV 3/5] END criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.535 total time=   2.4s
[CV 2/5] END crite

In [16]:
# Import the pprint function from the pprint module
from pprint import pprint

# Pretty-print the best hyperparameters of the tuned RandomForestClassifier
pprint(best_rf.best_estimator_.get_params())

# Make predictions on the validation data using the best tuned RandomForestClassifier
y_pred = best_rf.predict(X_val).astype(int)

# Print a classification report to evaluate the model's performance
# - The classification report includes various classification metrics
#   such as precision, recall, F1-score, and support for each class
# - It compares the predicted values (y_pred) to the actual values (y_val)
print(classification_report(y_val.astype(int), y_pred, digits=5))


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 400,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
              precision    recall  f1-score   support

           0    0.68227   0.85411   0.75858       802
           1    0.44106   0.40208   0.42067       577
           2    0.42540   0.25427   0.31829       527
           3    0.43622   0.62182   0.51274       550
           4    0.59091   0.40838   0.48297       382
           5    0.84536   0.65863   0.74041       249

    accuracy                        0.55491      3087
   macro avg    0.57020   0.53322   0.53894      3087
weighted avg    0.55135   0.55491   0.54089      3087



In [12]:
X_test_input = X_test.to_numpy().astype(float)
X_test_input = scaler.transform(X_test_input)
test_predictions = best_rf.predict(X_test_input)

submission = {
    "id" : list(range(len(test_predictions))), 
    "price" : list(test_predictions.astype(float))
}

submission = pd.DataFrame.from_dict(submission)

submission.to_csv(os.path.join("submissions/", "bahng_rf_final.csv"), index=False)