In [80]:
import numpy as np
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime

In [81]:
# Load the training and test tables; both paths are relative to the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [82]:
print(train.columns)
target = train["price"]

# Column selection is driven by the *training* frame and then applied to both
# frames so that train and test end up with the same feature set.
text_columns = train.select_dtypes(include=['object']).columns
# NOTE(review): the CSVs are read without `parse_dates`, so this selection is
# presumably empty and date-like columns are removed via `text_columns` instead.
date_columns = train.select_dtypes(include=['datetime64']).columns

# One drop list for both frames instead of three copy-pasted drops per frame.
columns_to_drop = [*text_columns, *date_columns, "scrape_id", "host_id"]

# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone (or against a test file that lacks one of them) is a no-op
# instead of a KeyError.
train = train.drop(columns=columns_to_drop, errors="ignore")
test = test.drop(columns=columns_to_drop, errors="ignore")

Index(['id', 'scrape_id', 'last_scraped', 'name', 'description', 'picture_url',
       'host_id', 'host_name', 'host_since', 'host_is_superhost',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates',
       'bathrooms_text', 'beds', 'amenities', 'price', 'minimum_nights',
       'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'calendar_last_scraped', 'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d', 'instant_bookable',
       'calculated_host_listings_count',
       'calculated_host_listi

In [83]:
# Display the columns left in `train` after the object/date/id columns were dropped.
train.columns

Index(['id', 'host_listings_count', 'host_total_listings_count', 'latitude',
       'longitude', 'accommodates', 'beds', 'price', 'minimum_nights',
       'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_30',
       'availability_60', 'availability_90', 'availability_365',
       'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms'],
      dtype='object')

In [84]:
def train_evaluate_xgboost(data, test_data, target_column='price', random_state=42):
    """
    Tune an XGBoost classifier with a cross-validated grid search, then predict
    the target for an external test set.

    Parameters:
    - data: pandas DataFrame, training rows including `target_column`
    - test_data: pandas DataFrame, rows to predict; must contain every
      non-categorical feature column of `data` (extra columns are ignored)
    - target_column: str, the column to predict
    - random_state: int, seed passed to the XGBoost classifier

    Returns:
    - predictions: output of the tuned model's `predict`, one entry per row of
      `test_data`, in the same row order

    Raises:
    - ValueError: if `test_data` lacks a non-categorical feature that the model
      was trained on

    Side effects:
    - Prints the best hyper-parameters and the best mean cross-validated accuracy.
    """

    # Step 1: Data Preparation
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    # Encode categorical variables using one-hot encoding
    categorical_columns = X.select_dtypes(include=['object']).columns
    X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

    # Step 2: XGBoost Model Training with Parameter Tuning (Grid Search)
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    # The class count comes from the `data` argument, not from a notebook global,
    # so the function gives the same result regardless of surrounding state.
    num_classes = len(y.unique())
    xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=num_classes, random_state=random_state)

    grid_search = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=3, n_jobs=-1)
    grid_search.fit(X, y)

    # Print the best parameters and their corresponding accuracy
    print("Best Parameters: ", grid_search.best_params_)
    print("Best Accuracy: {:.2f}".format(grid_search.best_score_))

    # GridSearchCV refits the best configuration on all of (X, y) by default
    # (refit=True), so reuse that estimator instead of training it a second time.
    final_model = grid_search.best_estimator_

    # Step 3: Data Preparation for Testing
    raw_features = data.columns.drop(target_column).difference(categorical_columns)
    missing_features = raw_features.difference(test_data.columns)
    if len(missing_features) > 0:
        raise ValueError(
            "test_data is missing feature columns: {}".format(list(missing_features))
        )

    # Encode every test category (no drop_first) and then project onto the
    # training columns. Dropping the first category independently on the test
    # set would remove a *different* level whenever the test categories differ
    # from the training ones, silently mis-encoding those rows.
    X_test = pd.get_dummies(test_data, columns=categorical_columns)
    # Dummy columns unseen in the test set are filled with 0; columns the model
    # never saw are discarded; column order matches the training matrix.
    X_test = X_test.reindex(columns=X.columns, fill_value=0)

    # Step 4: Make Predictions on the External Test Set
    predictions = final_model.predict(X_test)

    return predictions


In [85]:
# Fit on the rows of `train` that have no missing values, then predict for `test`.
# NOTE: `processed_test` is computed below but is not what gets passed to the
# model; the unfiltered `test` frame is used for prediction.
processed_test = test.dropna()
processed_data = train.dropna()
predictions = train_evaluate_xgboost(data=processed_data, test_data=test)

Best Parameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 7, 'subsample': 0.8}
Best Accuracy: 0.48


In [87]:
# Pair each test id with its predicted price.
submission_columns = {'id': test['id'], 'price': predictions}
output_df = pd.DataFrame(submission_columns)

# Write the result to disk without the DataFrame index column.
output_df.to_csv('predictions.csv', index=False)