In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.feature_selection import SelectFromModel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score

In [2]:
# Work on a 30% random sample of the training file. The fixed random_state
# keeps the sample (and every score derived from it) identical across
# kernel restarts.
train_data = pd.read_csv('train.csv').sample(frac=0.3, random_state=42)
#pred_data = pd.read_csv('promotion_schedule.csv')

## Clean & Split Data

In [3]:
# dummy the categories
# Only the categorical columns that are still present get encoded, so
# re-running this cell after the dummies exist is a no-op instead of a KeyError.
categorical_cols = [col for col in ('j', 'i') if col in train_data.columns]
if categorical_cols:
    train_data = pd.get_dummies(train_data, columns=categorical_cols)

In [4]:
# Separate the target column ('price') from the features; train_data itself
# is left untouched.
y = train_data['price'].copy()
X = train_data.drop(columns='price')

In [5]:
def time_split(X, y, test_start_time):
    '''
    Splits data based on a single point in time.

    Rows whose 't' value is strictly below test_start_time form the training
    set; rows with 't' >= test_start_time form the test set. The share of
    rows that ended up in the test set is printed as a percentage.

    Inputs:
        X - dataframe of features; must contain a column named 't'
        y - target values, indexable with a boolean mask aligned to X
        test_start_time - first value of 't' that belongs to the test set
    Returns:
        X_train, X_test, y_train, y_test
    '''
    train_index = X['t'] < test_start_time
    test_index = X['t'] >= test_start_time
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    n_test = len(y_test)
    n_total = n_test + len(y_train)
    # Guard against an empty input, and format the percentage directly rather
    # than multiplying a rounded float (which prints e.g. 7.000000000000001).
    test_share = 100 * n_test / n_total if n_total else 0.0
    print('Data used to test: {:.1f} %'.format(test_share))
    return X_train, X_test, y_train, y_test

In [6]:
# split data
split_by_time = True
TEST_START_TIME = 39  # rows with t >= 39 form the hold-out set
if split_by_time:
    X_train, X_test, y_train, y_test = time_split(X, y, TEST_START_TIME)
else:
    # Seeded so the random hold-out is the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Data used to test: 20.5 %


## Model

In [10]:
def test_acc(model, X_train, X_test, y_train, y_test):
    '''
    Determine negative mean absolute error for test data
    '''
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return -abs(np.array(pred) - np.array(y_test)).mean()

In [11]:
def class_crossval_plot(X_train, X_test, y_train, y_test, models, splits=5,
                        scoring='neg_mean_absolute_error', cv=None):
    """
    Cross-validate several models, print their scores and draw a violin plot.

    For each model the cross-validation scores on the training data are
    computed and their mean / standard deviation printed. The model is then
    fitted on the whole training set and scored on the test set via test_acc.
    Finally one violin per model is drawn on a new figure. Nothing is returned.

    Inputs:
        X_train, y_train - features / target used for cross validation
        X_test, y_test - features / target used for the printed test score
        models - list of sklearn models to test
        splits - number of folds used when cv is not given
        scoring - measure of best fit passed to cross_val_score
        cv - optional cross-validation splitter (for example TimeSeriesSplit)
             passed to cross_val_score; defaults to KFold(n_splits=splits)
    """
    if cv is None:
        cv = KFold(n_splits=splits)

    results = []
    names = []
    print('Mod - Avg - Std Dev')
    print('---   ---   -------')
    for model in models:
        name = type(model).__name__
        cv_results = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
        results.append(cv_results)
        names.append(name)
        # Six decimals for the spread: same text the old '{:4f}' produced,
        # with the precision spelled out.
        print('{}: {:.4f} ({:.6f})'.format(name, cv_results.mean(), cv_results.std()))
        # test_acc always reports negative mean absolute error, whatever `scoring` is.
        print('Test acc: {:.4f}'.format(test_acc(model, X_train, X_test, y_train, y_test)))
        print()

    if not results:
        # Nothing to draw for an empty model list.
        return

    fig = plt.figure(figsize=(16, 10))
    fig.suptitle('Cross Validation Comparison of Regression Models')
    ax = fig.add_subplot(111)
    sb.violinplot(data=results, orient='v', ax=ax)
    # Fix the tick positions before labelling them so labels cannot drift.
    ax.set_xticks(range(len(names)))
    ax.set_xticklabels(names, rotation=50, ha='right')
    ax.set_xlabel('Model')
    ax.grid(alpha=0.4)
    # Lay out after drawing so the rotated labels fit; leave room for the title.
    fig.tight_layout(rect=[0, 0, 1, 0.96])

In [None]:
# Initial Cross Validation
# Estimators with internal randomness get a fixed random_state so the
# comparison is repeatable from run to run.
models = [
    # LinearRegression(),
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(n_jobs=-1),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    MLPRegressor(random_state=42),
]

class_crossval_plot(X_train, X_test, y_train, y_test, models)

Mod - Avg - Std Dev
---   ---   -------
Ridge: -0.0307 (0.000616)
Test acc: -0.0313

Lasso: -0.6459 (0.006741)
Test acc: -0.6415

ElasticNet: -0.6459 (0.006741)
Test acc: -0.6415

KNeighborsRegressor: -0.0346 (0.002015)
Test acc: -0.1378

DecisionTreeRegressor: -0.0003 (0.000269)
Test acc: -0.0100

RandomForestRegressor: -0.0006 (0.000233)
Test acc: -0.0088

GradientBoostingRegressor: -0.1461 (0.001870)
Test acc: -0.1512



In [None]:
# Class name of an estimator instance (the label class_crossval_plot prints per model)
type(LinearRegression()).__name__

In [None]:
# Plot price over time, one line per category of 'j'.
# After the get_dummies cell the raw 'j' column no longer exists, so each
# category is recovered from its one-hot 'j_*' column; the raw column is used
# when it is still present.
# NOTE(review): assumes no unrelated column name starts with 'j_' - confirm.
if 'j' in train_data.columns:
    j_masks = [train_data['j'] == value for value in train_data['j'].unique()]
else:
    j_dummy_cols = [col for col in train_data.columns if str(col).startswith('j_')]
    j_masks = [train_data[col] == 1 for col in j_dummy_cols]

fig, ax = plt.subplots()
for mask in j_masks:
    # Rows were shuffled by .sample(), so order by time before drawing a line.
    subset = train_data[mask].sort_values('t')
    ax.plot(subset['t'], subset['price'])
ax.set_xlabel('t')
ax.set_ylabel('price')
ax.set_title('Price over time by j');