In [34]:
import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [35]:
import holidays
country_codes = {'Canada':'CA','Finland':'FI','Italy':'IT','Kenya':'KE','Norway':'NO','Singapore':'SG'}
train['country'].unique()

array(['Canada', 'Finland', 'Italy', 'Kenya', 'Norway', 'Singapore'],
      dtype=object)

In [36]:
from sklearn.model_selection import train_test_split, GridSearchCV
train.dropna(inplace=True)
train.astype({'country':'category','store':'category','product':'category'})
train['date'] = pd.to_datetime(train['date'],format='%Y-%m-%d')

def is_holiday(row):
    country = row['country']
    date = row['date']
    country_holidays = holidays.country_holidays(country_codes[country])
    if country_holidays.get(date):
        return True
    return False
train['holiday'] = train.apply(is_holiday, axis=1)
train['weekday'] = train['date'].dt.weekday
train.astype({'holiday':'category','weekday':'category'})

X = train.loc[:,train.columns.difference(['id','num_sold','date'])]
y = train.loc[:,['num_sold']]
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2, random_state=42)
print('Train')
print(X_train.describe())
print(y_train.describe())
print('\nValidate')
print(X_val.describe())
print(y_val.describe())

Train
             weekday
count  177007.000000
mean        3.012146
std         2.001138
min         0.000000
25%         1.000000
50%         3.000000
75%         5.000000
max         6.000000
            num_sold
count  177007.000000
mean      751.724474
std       689.868011
min         5.000000
25%       219.000000
50%       604.000000
75%      1113.000000
max      5939.000000

Validate
            weekday
count  44252.000000
mean       3.011751
std        2.005472
min        0.000000
25%        1.000000
50%        3.000000
75%        5.000000
max        6.000000
           num_sold
count  44252.000000
mean     755.738995
std      691.352373
min        5.000000
25%      222.000000
50%      606.500000
75%     1120.000000
max     5455.000000


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error

models_with_params = {
    "Linear Regression": {
        "model": LinearRegression(),
        "params": {
            "model__fit_intercept": [True, False]
        }
    },
    "Random Forest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [None, 10, 20],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4]
        }
    },
    "XGBoost": {
        "model": xgb.XGBRegressor(random_state=42, objective="reg:squarederror"),
        "params": {
            "model__n_estimators": [50, 100, 200, 300],
            "model__max_depth": [ 3, 6, 7, 8, 9, 10],
            "model__learning_rate": [.001, 0.01, 0.1, 0.2],
            "model__subsample": [.1, .4, 0.8, 1.0]
        }
    },
    "SVR": {
        "model": SVR(),
        "params": {
            "model__kernel": ["linear"],
            "model__C": [.001, 0.1, 1, 5, 10],
            "model__gamma": ["scale", "auto"]
        }
    }
}

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# Preprocessing for categorical data
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing steps into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_preprocessor, X.columns)
    ]
)

# Perform grid search and evaluate models on the validation set
validation_scores = {}
for name, config in models_with_params.items():
    print(f"Running GridSearchCV for {name}...")
    
    # Define the pipeline with preprocessing and model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),  # Apply preprocessing
        ('model', config['model'])       # Add the model
    ])
    
    # Perform grid search
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=config['params'],
        scoring='neg_mean_absolute_error',  # Use MAE as proxy for speed
        cv=5,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    
    # Use the best model to make predictions on the validation set
    best_model = grid.best_estimator_
    y_val_pred = best_model.predict(X_val)
    
    # Calculate MAPE on validation set
    mape_score = mean_absolute_percentage_error(y_val, y_val_pred)
    
    validation_scores[name] = {
        "best_params": grid.best_params_,
        "validation_mape": mape_score
    }

# Rank models based on validation MAPE
ranked_models = sorted(validation_scores.items(), key=lambda x: x[1]['validation_mape'])

# Print the results
print("\nRanked Models (based on validation MAPE):")
for rank, (name, result) in enumerate(ranked_models, 1):
    print(f"Rank {rank}: {name}")
    print(f"  Best Parameters: {result['best_params']}")
    print(f"  Validation MAPE: {result['validation_mape']:.4f}")

Running GridSearchCV for Linear Regression...
Running GridSearchCV for Random Forest...


KeyboardInterrupt: 