# Machine Learning - Group 64

## Team Members

* Martin Tischler - 
* Matias Johansen Vian - 494807
* Nicolas Roger Bon - 

<h2>LGBMRegressor (LightGBM)</h2>

LightGBM is a gradient boosting framework that uses tree-based learning algorithms; `LGBMRegressor` is its scikit-learn-compatible regression estimator. The model is recommended for large data sets of more than 10,000 entries.

In [21]:
import numpy
import pandas as pd 
import statsmodels.tsa.arima.model as arima
import lightgbm as lgb
import os
# import neptunecontrib.monitoring.skopt as sk_utils
from neptune.new.integrations.lightgbm import NeptuneCallback, create_booster_summary
# import skopt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


Loading datasets:

In [22]:
# Resolve the data directory once, relative to the notebook's working directory,
# instead of rebuilding the same path by string concatenation for every file.
DATA_DIR = os.path.join(os.path.abspath(''), 'data')

# The trailing comments list the columns each file is expected to provide.
test_data = pd.read_csv(os.path.join(DATA_DIR, 'beer_test.csv'), parse_dates=['Date'])      # id,       Date,       ts_id,      isPromo
train_data = pd.read_csv(os.path.join(DATA_DIR, 'beer_train.csv'), parse_dates=['Date'])    # id,       Date,       ts_id,      isPromo,    Sales
stores = pd.read_csv(os.path.join(DATA_DIR, 'id_store_sku.csv'))      # ts_id,    Store,      SKU
features = pd.read_csv(os.path.join(DATA_DIR, 'sku_features.csv'))    # SKU,      Segment,    Pack,       Product,    Brand,  Volume

Generating feature sets:

In [23]:
def generate_features(df, sku_features, id_map):
    '''
    Build the model feature frame for a set of sales rows.

    The input frame is left untouched; the merges below return a new frame
    and all derived columns are added to that copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales rows. Must contain 'ts_id', 'isPromo' and a datetime 'Date'
        column.
    sku_features : pandas.DataFrame
        SKU lookup table, joined on 'SKU'. Must provide 'Segment' and 'Brand'.
    id_map : pandas.DataFrame
        Lookup table joined on 'ts_id'. Must provide 'Store' and 'SKU'.

    Returns
    -------
    pandas.DataFrame
        The merged frame with calendar columns ('day_of_month', 'day_of_week',
        'month', 'year', 'week') and promo interaction columns ('*_promo').
    '''

    # Add metadata (left joins keep every row of df, even without a match)
    df = pd.merge(df, id_map, how='left', on='ts_id')
    df = pd.merge(df, sku_features, how='left', on='SKU')

    # Time features
    dates = df['Date'].dt
    df['day_of_month'] = dates.day
    df['day_of_week'] = dates.weekday
    df['month'] = dates.month
    df['year'] = dates.year
    # Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
    # isocalendar().week yields the same ISO week number (nullable UInt32).
    df['week'] = dates.isocalendar().week

    # Enlarge promo features
    # Since we know that promo is important, cross it with each entity and
    # calendar key by concatenating the two values as strings.
    promo = df['isPromo'].astype(str)

    df['ts_promo'] = df['ts_id'].astype(str) + promo
    df['store_promo'] = df['Store'].astype(str) + promo
    df['segment_promo'] = df['Segment'].astype(str) + promo
    df['brand_promo'] = df['Brand'].astype(str) + promo
    df['sku_promo'] = df['SKU'].astype(str) + promo

    df['dom_promo'] = df['day_of_month'].astype(str) + promo
    df['dow_promo'] = df['day_of_week'].astype(str) + promo

    return df

Regression function:

In [24]:
def regression(train_features, test_features):
    '''
    Fit a LightGBM regressor on the training features and predict the test set.

    Parameters
    ----------
    train_features : pandas.DataFrame
        Training frame. Must contain the target column 'Sales'. The columns
        'id', 'Date' and 'Sales' are excluded from the model inputs.
    test_features : pandas.DataFrame
        Frame to predict for. Must contain every model-input column of
        train_features.

    Returns
    -------
    numpy.ndarray
        One predicted 'Sales' value per row of test_features.

    Raises
    ------
    ValueError
        If test_features lacks a column the model is trained on.
    '''

    # Row identifier, raw timestamp and target are not model inputs. 'id' is
    # excluded from BOTH frames so fit and predict see the same feature set.
    non_feature_columns = ['id', 'Date', 'Sales']

    X_train = train_features.drop(columns=non_feature_columns, errors='ignore')
    # Take the target from the frame that was passed in rather than from a
    # notebook-level global, so rows and target always stay aligned.
    y_train = train_features['Sales']

    missing = [c for c in X_train.columns if c not in test_features.columns]
    if missing:
        raise ValueError(
            'test_features is missing training columns: {}'.format(missing))
    # Select by the training column list so the column order matches as well.
    X_test = test_features[list(X_train.columns)]

    clf = lgb.LGBMRegressor(num_leaves=8, max_depth=4,
                        random_state=42,
                        # verbose=-1 replaces the former silent=True flag
                        verbose=-1,
                        metric='rmse',
                        n_jobs=-1,
                        # Previously n_estimators=1000 was passed together with
                        # its alias num_iterations=300; the alias took effect,
                        # so 300 keeps the model that was actually trained.
                        n_estimators=300,
                        colsample_bytree=0.95,
                        # NOTE(review): per the LightGBM docs row subsampling
                        # only applies when subsample_freq > 0; with the
                        # default this presumably has no effect - confirm.
                        subsample=0.6,
                        learning_rate=0.05)

    clf.fit(X_train, y_train)

    prediction = clf.predict(X_test)

    return numpy.array(prediction)

Running:

In [25]:
# Build the model inputs for both splits from the raw frames and lookup tables.
train_features = generate_features(train_data, features, stores)
test_features = generate_features(test_data, features, stores)

# Every test column except the raw timestamp and the row id is cast to a
# categorical dtype; the same columns are cast in the training frame.
categorical_columns = test_features.columns.drop(['Date', 'id'])
category_dtypes = dict.fromkeys(categorical_columns, 'category')
test_features = test_features.astype(category_dtypes)
train_features = train_features.astype(category_dtypes)

predictions = regression(train_features, test_features)

print(predictions)

# TODO: Works in regular Python script, fix so it works here as well.

[0.097253   0.83780343 0.96660902 ... 0.15087151 0.69001251 0.17233502]
