In [None]:
!bash <(curl -sL https://gitlab.aicrowd.com/jyotish/pricing-game-notebook-scripts/raw/master/python/setup.sh)
from aicrowd_helpers import *

In [None]:
import sklearn

class Config:
    """Notebook-wide settings read by the cells below.

    ADDITIONAL_PACKAGES pins scikit-learn to the version installed in this
    kernel, so the listed requirement matches the environment the model was
    trained and pickled with.
    """

    # Input CSV read by the data-loading cell.
    TRAINING_DATA_PATH = 'training.csv'

    # Destination of the pickled model (see save_model / load_model).
    MODEL_OUTPUT_PATH = 'model.pkl'

    # Placeholder only: substitute your own key locally and never commit it.
    AICROWD_API_KEY = 'YOUR API KEY'

    ADDITIONAL_PACKAGES = [
        'numpy',
        'pandas',
        'scikit-learn==' + sklearn.__version__,
    ]

In [None]:
%download_aicrowd_dataset

In [None]:
%%track_imports

import numpy as np
import pandas as pd
import pickle
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
import importlib
import global_imports
importlib.reload(global_imports)
from global_imports import *

In [None]:
# Read the raw training table, then separate the target column
# ('claim_amount') from the predictor columns.
df = pd.read_csv(Config.TRAINING_DATA_PATH)
y_train = df['claim_amount']
X_train = df.drop('claim_amount', axis = 1)

In [None]:
# Peek at four random predictor rows (no random_state, so the rows differ per run).
X_train.sample(n = 4)

In [None]:
# Peek at four random target values (no random_state, so the rows differ per run).
y_train.sample(n = 4)

In [None]:
%%aicrowd_include

class Bag:
    """A bag of models, outputs of which are averaged.

    Every model must expose ``fit(x, y)`` and ``predict(x)``. The bag owns the
    feature engineering (``wrangle``) and remembers two pieces of state learned
    during ``fit`` so that prediction data is prepared consistently:

    * ``means``   - column means used to impute missing / zero values;
    * ``columns`` - the exact feature columns (including one-hot columns) the
      models were fitted on.
    """
    def __init__(self, models):
        self.models = models
        self.means = None
        self.columns = None

    def wrangle(self, df, means = None):
        """Prepare the predictors.

        The input frame is never modified; all work happens on a copy.

        Parameters
        ----------
        df : Pandas dataframe of raw contracts.
        means : optional Pandas series of column means used for imputation.
            When omitted, the means are computed from the rows of ``df`` whose
            ``vh_weight`` is positive.

        Returns
        -------
        (features, means) : the engineered dataframe (same number of rows as
            ``df``, with a fresh RangeIndex produced by the merge) and the
            means that were used.
        """
        print("Wrangling data.")
        original_len = len(df)

        # Work on a copy: the imputation below assigns columns, which would
        # otherwise silently alter the caller's frame (e.g. X_train).
        df = df.copy()

        # Use pre-existing means if available (i.e. on test set)
        if means is None:
            # numeric_only=True: the frame holds string columns, for which
            # DataFrame.mean() raises TypeError on pandas >= 2.0.
            means = df[df.vh_weight > 0].mean(numeric_only = True)

        # Replace 0 vehicle weight with mean
        df['vh_weight'] = df.vh_weight.replace(0.0, means.vh_weight)

        # Replace NaNs with column mean
        nans = ['vh_age', 'vh_speed', 'vh_value', 'vh_weight']
        df[nans] = df[nans].fillna(means[nans])

        print('Done with replacing')
        assert len(df) == original_len

        # Join first year data
        # NOTE(review): this is a left join, so rows whose policy has no
        # year == 1 record in `df` get NaN in `pol_no_claims_discount_first`
        # and therefore in `discount_change` / `no_claims` below.
        df = df.merge(df[df.year == 1.0][['id_policy', 'pol_no_claims_discount']].drop_duplicates(subset = 'id_policy'),
                      on = 'id_policy', suffixes = ('', '_first'), how = 'left')

        print("left join")
        print("original: {}, new: {}".format(original_len, len(df)))
        assert len(df) == original_len

        # Change from beginning discount level
        df['discount_base_change'] = df.pol_no_claims_discount - 0.631
        # Yearly discount change over licence ownership
        df['discount_yearly_change'] = df.discount_base_change / df.drv_age_lic1

        # Discount change from policy beginning
        df['discount_change'] = df.pol_no_claims_discount - df.pol_no_claims_discount_first
        # Approx. no. of claims since first year
        df['no_claims'] = np.maximum(np.zeros_like(df.year), np.ceil(df.discount_change / 0.2))

        # Driver 1 and 2 combined info
        df['drv_sex2'] = df.drv_sex2.replace('0', '')
        # Concatenate both sex codes and sort the characters so the result
        # does not depend on which driver is listed first.
        df['drv_sexes'] = (df.drv_sex1 + df.drv_sex2).map(lambda pair: ''.join(sorted(pair)))
        df['drv_avg_age'] = df[['drv_age1', 'drv_age2']].mean(axis = 1)
        df['drv_avg_lic'] = df[['drv_age_lic1', 'drv_age_lic2']].mean(axis = 1)

        # Population density
        df['pop_dens'] = df.population / df.town_surface_area

        print('variable creation')
        assert len(df) == original_len

        # Drop unnecessary cols
        df = df.drop(columns = ['id_policy', 'drv_drv2', 'drv_sex2', 'drv_age2', 'drv_age_lic2',
                            'vh_make_model', 'pol_pay_freq', 'pol_no_claims_discount_first'])

        print('drop cols')
        assert len(df) == original_len


        # One-hot encoding for categorical variables
        cats = ['pol_coverage', 'pol_payd', 'pol_usage', 'drv_sex1', 'vh_fuel', 'vh_type',
              'drv_sexes']
        df = pd.get_dummies(df, prefix = cats, columns=cats)

        assert len(df) == original_len
        return df, means

    def fit(self, x, y):
        """Fit all individual models.

        Stores the imputation means and the engineered column layout so that
        ``predict`` can reproduce the same feature matrix.
        """
        x, means = self.wrangle(x)
        self.means = means
        self.columns = list(x.columns)
        print("Fitting models.")
        for model in self.models:
            model.fit(x, y)

    def predict(self, x):
        """Predict on all individual models and average their results."""
        x, _ = self.wrangle(x, self.means)

        # One-hot encoding depends on the categories present in `x`. Align to
        # the training layout: categories absent here become all-zero columns,
        # categories unseen during fit are dropped. getattr keeps instances
        # pickled before `columns` existed working.
        columns = getattr(self, 'columns', None)
        if columns is not None:
            x = x.reindex(columns = columns, fill_value = 0)

        preds = [model.predict(x) for model in self.models]
        return np.mean(preds, axis = 0)

In [None]:
import importlib
import utils
importlib.reload(utils)
from utils import *

In [None]:
def fit_model(X_raw, y_raw):
    """Model training function: given training data (X_raw, y_raw), train this pricing model.

    The model is a ``Bag`` averaging a random forest and a gradient-boosted
    ensemble of decision stumps; ``Bag.fit`` performs the feature engineering.

    Parameters
    ----------
    X_raw : Pandas dataframe, with the columns described in the data dictionary.
        Each row is a different contract. This data has not been processed.
    y_raw : a Numpy array, with the value of the claims, in the same order as contracts in X_raw.
        A one dimensional array, with values either 0 (most entries) or >0.

    Returns
    -------
    self: this instance of the fitted model. This can be anything, as long as it is compatible
        with your prediction methods.

    """

    models = [
        RandomForestRegressor(n_estimators = 100,
                              max_depth = 8,
                              max_features = 'log2',
                              min_samples_split = 200,
                              random_state = 2021),

        # `loss` is deliberately left at its default, which is least squares
        # in every scikit-learn release. The explicit spelling 'ls' was
        # deprecated in 1.0 and removed in 1.2, while its replacement
        # 'squared_error' is rejected by releases older than 1.0.
        # random_state is fixed so repeated fits give the same model.
        GradientBoostingRegressor(n_estimators = 65,
                                  learning_rate = 0.5,
                                  max_depth = 1,
                                  random_state = 2021),
    ]

    bag = Bag(models)
    bag.fit(X_raw, y_raw)

    return bag

In [None]:
# Train the bag of models on the full training set.
trained_model = fit_model(X_train, y_train)

In [None]:
def save_model(model_path, model = None):
    """Pickle a fitted model to ``model_path``.

    Some models, such as xgboost or keras models, don't pickle very reliably.
    Please use the saving functions provided by those packages instead.

    Parameters
    ----------
    model_path : path of the file to write (overwritten if it exists).
    model : the object to serialise. When omitted, the notebook-level
        ``trained_model`` variable is saved, which keeps existing
        ``save_model(path)`` calls working.
    """
    if model is None:
        # Backward-compatible fallback to the notebook global.
        model = trained_model
    with open(model_path, 'wb') as target_file:
        pickle.dump(model, target_file)

In [None]:
# Write the fitted model to Config.MODEL_OUTPUT_PATH.
save_model(Config.MODEL_OUTPUT_PATH)

In [None]:
def load_model(model_path):
    """Read back an object previously pickled at ``model_path``.

    Some models, such as xgboost or keras models, don't pickle very reliably.
    Please use the loading functions provided by those packages instead.

    Only load files you created yourself: unpickling can execute arbitrary code.
    """
    with open(model_path, mode = 'rb') as model_file:
        restored = pickle.load(model_file)
    return restored

In [None]:
# Rebind trained_model to the object read back from disk, so the cells below use the pickled copy.
trained_model = load_model(Config.MODEL_OUTPUT_PATH)

In [None]:
def predict_expected_claim(model, X_raw):
    """Model prediction function: predicts the expected claim based on the pricing model.

    Estimates, for each contract in X_raw, the expected claim amount (typically
    the probability of a claim multiplied by the expected cost of a claim if
    one occurs).

    This is the function used in the RMSE leaderboard, so the output should be
    as close as possible to the expected cost of a contract.

    Parameters
    ----------
    model: a Python object that describes your model. This can be anything, as long
        as it is consistent with what `fit` outputs.
    X_raw : Pandas dataframe, with the columns described in the data dictionary.
        Each row is a different contract. This data has not been processed.

    Returns
    -------
    avg_claims: a one-dimensional Numpy array of the same length as X_raw, with one
        expected claim per contract (in same order). Negative model outputs are
        raised to 0, so no returned value is below zero.
    """

    expected = model.predict(X_raw)

    # Floor negative predictions at zero, in place on the predicted array.
    below_zero = expected < 0
    expected[below_zero] = 0

    return expected

In [None]:
# Smoke test: expected claims for the training contracts, displayed as the cell output.
predict_expected_claim(trained_model, X_train)

In [None]:
def predict_premium(model, X_raw, base = 110, risk = 1.5):
    """Model prediction function: predicts premiums based on the pricing model.

    This function outputs the prices that will be offered to the contracts in X_raw.
    The premium is the expected claim from predict_expected_claim scaled by
    `risk`, with `base` as the lowest price ever offered.

    This is the function used in the average profit leaderboard. Prices output here will
    be used in competition with other models, so feel free to use a pricing strategy.

    Parameters
    ----------
    model: a Python object that describes your model. This can be anything, as long
        as it is consistent with what `fit` outputs.
    X_raw : Pandas dataframe, with the columns described in the data dictionary.
        Each row is a different contract. This data has not been processed.
    base : minimum price to offer; any scaled claim below it is raised to it.
    risk : multiplier applied to the predicted claims.

    Returns
    -------
    prices: a one-dimensional Numpy array of the same length as X_raw, with one
        price per contract (in same order). These prices must be POSITIVE (>0).
    """
    claims = predict_expected_claim(model, X_raw)

    # The multiplication builds a new array, so `claims` itself is not modified.
    prices = claims * risk
    prices[prices < base] = base

    return prices

In [None]:
# Premiums offered on the training contracts; used by the profit check below.
prices = predict_premium(trained_model, X_train)

In [None]:
# Compare total premiums charged on the training data with total claims paid.
print('Income:', prices.sum())
print('Losses:', y_train.sum())

# Market rule 1: income on the training data must not fall below the losses.
if not (prices.sum() < y_train.sum()):
    print('Your model passes the non-negative training profit test!')
else:
    print('Your model loses money on the training data! It does not satisfy market rule 1: Non-negative training profit.')
    print('This model will be disqualified from the weekly profit leaderboard, but can be submitted for educational purposes to the RMSE leaderboard.')

In [None]:
%aicrowd_submit