In [59]:
import numpy as np
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from xgboost import XGBRegressor


In [60]:
train = pd.read_csv("cleaned_train.csv")
test = pd.read_csv("cleaned_test.csv")

In [61]:
def train_evaluate_xgboost(data, test_data, target_column='price', random_state=42):
    """
    Train and evaluate a tuned XGBoost model for a categorical prediction problem using cross-validation.

    Parameters:
    - data: pandas DataFrame, the input dataset
    - target_column: str, the column to predict
    - random_state: int, seed used by the random number generator

    Returns:
    - cv_results: dict, the results of cross-validation
    """

    # Step 1: Data Preparation
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    # Step 2: XGBoost Model Training with Parameter Tuning (Grid Search)
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(data[target_column].unique()), random_state=random_state)

    grid_search = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=3, n_jobs=-1)
    cv_results = grid_search.fit(X, y)

    # Print the best parameters and their corresponding accuracy
    print("Best Parameters: ", cv_results.best_params_)
    print("Best Accuracy: {:.2f}".format(cv_results.best_score_))

    # Use the best parameters for the final model
    final_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(data[target_column].unique()), random_state=random_state, **grid_search.best_params_)

    # Train the final XGBoost model
    final_model.fit(X, y)

    # Step 3: Data Preparation for Testing
    X_test = test_data.copy()  # Copy the external test set
    
    # Step 4: Make Predictions on the External Test Set
    predictions = final_model.predict(X_test)

    return predictions


In [62]:
def train_evaluate_xgboost_reg(data, test_data, target_column='price', random_state=42):
    """
    Train and evaluate a tuned XGBoost model for a categorical prediction problem using cross-validation.

    Parameters:
    - data: pandas DataFrame, the input dataset
    - target_column: str, the column to predict
    - random_state: int, seed used by the random number generator

    Returns:
    - cv_results: dict, the results of cross-validation
    """

    # Step 1: Data Preparation
    X = data.drop(target_column, axis=1)
    y = data[target_column].astype(float)

    # Step 2: XGBoost Model Training with Parameter Tuning (Grid Search)
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=random_state)
    # Create a custom scoring function for rounded predictions
    def custom_accuracy(y_true, y_pred):
        rounded_preds = np.round(y_pred)
        return accuracy_score(y_true, rounded_preds)

    # Make the scorer function
    custom_accuracy_scorer = metrics.make_scorer(custom_accuracy, greater_is_better=True)
    grid_search = GridSearchCV(xgb_model, param_grid, scoring=custom_accuracy_scorer, cv=3, n_jobs=-1)
    cv_results = grid_search.fit(X, y)

    # Print the best parameters and their corresponding accuracy
    print("Best Parameters: ", cv_results.best_params_)
    print("Best Accuracy: {:.2f}".format(cv_results.best_score_))

    # Use the best parameters for the final model
    final_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=random_state, **grid_search.best_params_)

    # Train the final XGBoost model
    final_model.fit(X, y)

    # Step 3: Data Preparation for Testing
    X_test = test_data.copy()  # Copy the external test set
    
    # Step 4: Make Predictions on the External Test Set
    predictions = final_model.predict(X_test)

    return predictions


In [63]:
# Example usage:
# Assuming your dataset is in a variable called 'your_data'
predictions = train_evaluate_xgboost_reg(train, test)

Traceback (most recent call last):
  File "/Users/michellesi/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/michellesi/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/michellesi/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Users/michellesi/anaconda3/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 192, in wrapper
    return func(*args, **kwargs)
  File "/Users/michellesi/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 221, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/Users/michellesi/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 95, in _chec

Best Parameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'subsample': 0.8}
Best Accuracy: nan


In [64]:
# Create a DataFrame with IDs and Predictions
output_df = pd.DataFrame({'id': test['id'], 'price': predictions})

# Save the DataFrame to a CSV file
output_df.to_csv('predictions.csv', index=False)