In [2]:
import pandas as pd

df = pd.read_csv('./data/train.csv')

In [3]:
# original simple model with more data-preprocessing
def handle_outliers(df, feature, method='iqr'):
    if method == 'iqr':
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[feature] >= lower_bound) & (df[feature] <= upper_bound)]
    elif method == 'clip':
        upper_limit = df[feature].quantile(0.99)
        df[feature] = df[feature].clip(upper=upper_limit)
    return df

# Apply outlier removal
df = handle_outliers(df, 'accommodates')
df = handle_outliers(df, 'beds', method='clip')
df = handle_outliers(df, 'host_acceptance_rate', method='clip')

# Fill missing values
df['beds'] = df['beds'].fillna(df['beds'].median())
df['host_acceptance_rate'] = df['host_acceptance_rate'].fillna(df['host_acceptance_rate'].median())
df['bedrooms'] = df['bedrooms'].fillna(df['bedrooms'].median())
df['review_scores_location'] = df['review_scores_location'].fillna(3)

# Check cleaned data
print(df.isnull().sum())


name                                               0
description                                      371
property_type                                      0
price                                              0
neighbourhood_cleansed                             0
neighbourhood_group_cleansed                       0
latitude                                           0
longitude                                          0
host_since                                         0
host_response_time                              2142
host_response_rate                              2142
host_acceptance_rate                               0
host_is_superhost                                240
host_listings_count                                0
host_total_listings_count                          0
host_verifications                                 0
host_has_profile_pic                               0
host_identity_verified                             0
calculated_host_listings_count                

In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Define features to use
features = [
    'accommodates', 
    'beds', 
    'calculated_host_listings_count_entire_homes',
    'host_listings_count', 
    'host_total_listings_count', 
    'host_acceptance_rate', 
    'bedrooms', 
    'review_scores_location'
]

# Define features and target
X = df[features]
y = df['price']

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and train the XGBoost model
model = XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss',  # Multi-class log loss
    random_state=42
)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_val, y_pred))


Parameters: { "use_label_encoder" } are not used.



Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.72      0.69       539
           1       0.40      0.52      0.45       533
           2       0.38      0.30      0.33       502
           3       0.39      0.38      0.39       547
           4       0.40      0.38      0.39       452
           5       0.70      0.61      0.65       467

    accuracy                           0.49      3040
   macro avg       0.49      0.48      0.48      3040
weighted avg       0.49      0.49      0.49      3040



In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import classification_report

# Define features to use
features = [
    'accommodates', 
    'beds', 
    'calculated_host_listings_count_entire_homes',
    'host_listings_count', 
    'host_total_listings_count', 
    'host_acceptance_rate', 
    'bedrooms', 
    'review_scores_location'
]

# Define features and target
X = df[features]
y = df['price']

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and train the XGBoost model
model = XGBRegressor(
    n_estimators=200,          # Increase for better performance
    learning_rate=0.05,        # Slower learning rate
    max_depth=6,               # Adjust depth to control overfitting
    subsample=0.8,             # Randomly sample rows for better generalization
    colsample_bytree=0.8,      # Randomly sample features for better generalization
    random_state=42
)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_val, y_pred))


Classification Report:


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets