In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from collections import Counter
from sklearn.metrics import silhouette_score

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.feature_selection import SelectFromModel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score

In [2]:
# Training data: the preview below shows columns i, j, t, price and advertised
# (plus an 'Unnamed: 0' column that looks like a saved row index — TODO confirm).
train_data = pd.read_csv('train.csv')
# Promotion schedule; later cells read its 'j' and 'advertised' columns to build
# the rows for the week being predicted.
pred_data = pd.read_csv('promotion_schedule.csv')

In [3]:
# Preview the first rows of the raw training data.
train_data.head()

Unnamed: 0,i,j,t,price,advertised
0,4,7,0,2.137451,0
1,6,1,0,0.863341,0
2,8,6,0,0.799155,0
3,8,25,0,3.023893,0
4,9,6,0,0.799155,0


In [15]:
# NOTE(review): train_data_update is only created further down in this notebook, so
# this check raises NameError on a fresh Restart & Run All. It was presumably run
# out of order — confirm, then move it below the zero-fill cell or remove it.
len(train_data) == len(train_data_update)

True

## Clean, Add & Split Data

In [48]:
# create past ad lookup
# One row per (j, t) pair for j in 0..39 and t in 0..48. 'ad' is the advertised
# flag of the first train_data row with that (j, t), or 0 when the pair never
# occurs in train_data. Rows are ordered by j, then t.
# Built in one vectorised pass: growing a frame with DataFrame.append inside a
# loop is quadratic and the method no longer exists in pandas >= 2.0.
ad_grid = pd.MultiIndex.from_product([range(40), range(49)], names=['j', 't'])
ad_lookup = (
    train_data.groupby(['j', 't'])['advertised']
    .first()
    .reindex(ad_grid, fill_value=0)
    .astype(int)
    .rename('ad')
    .reset_index()
)

In [None]:
# add 0 values
# Every (i, j, t) combination (i in 0..1999, j in 0..39, t in 0..48) that has no
# row in train_data is added with price 0 and the advertised flag taken from
# ad_lookup for that (j, t). The original rows come first, followed by the
# added rows ordered by i, j, t.
# This is done with one anti-join rather than 3.92 million boolean-mask scans
# and row-by-row DataFrame.append calls (append was removed in pandas 2.0).
full_grid = pd.MultiIndex.from_product(
    [range(2000), range(40), range(49)], names=['i', 'j', 't']
).to_frame(index=False)

observed = train_data[['i', 'j', 't']].drop_duplicates()
missing = full_grid.merge(observed, on=['i', 'j', 't'], how='left', indicator=True)
missing = missing.loc[missing['_merge'] == 'left_only', ['i', 'j', 't']]

# Normalise the lookup to integer keys (a frame grown with append can end up
# with float/object columns) and keep a single flag per (j, t).
ad_flags = (
    ad_lookup.astype({'j': int, 't': int, 'ad': int})
    .drop_duplicates(subset=['j', 't'])
    .rename(columns={'ad': 'advertised'})
)
missing = missing.merge(ad_flags, on=['j', 't'], how='left')
missing['price'] = 0

# Columns that exist only in train_data (e.g. 'Unnamed: 0') are NaN for the
# added rows, exactly as they were when rows were appended one at a time.
train_data_update = pd.concat([train_data, missing], ignore_index=True, sort=False)

In [None]:
# Persist the zero-filled frame to disk (no visible cell in this notebook reads it back).
train_data_update.to_pickle('train_data_update.pkl')

In [None]:
# dummy the categories
# One-hot encode the j and i identifiers. 'Unnamed: 0' (visible in the
# train_data preview; it appears to be a saved row index) is dropped first: it
# is NaN for the zero-filled rows and the week-50 prediction frame has no such
# column, so keeping it would break fitting/predicting. errors='ignore' makes
# this a no-op when the column is not present.
foo = pd.get_dummies(
    train_data_update.drop(columns=['Unnamed: 0'], errors='ignore'),
    columns=['j', 'i'],
)

In [None]:
# split into X and y
# pop() removes the 'price' column from foo in place and returns it as the target.
y = foo.pop('price')
# X is the same object as foo, now holding only the feature columns.
X = foo
# Drop the temporary name; the data stays alive through X.
del foo

In [None]:
def time_split(X, y, test_start_time):
    '''
    Splits data based on a single point in time.

    Inputs:
        X - dataframe of features; must contain a 't' column
        y - targets aligned with X (indexable by the same boolean mask)
        test_start_time - rows with t < test_start_time go to train,
                          rows with t >= test_start_time go to test
    Returns:
        X_train, X_test, y_train, y_test
    Also prints the percentage of rows that ended up in the test set.
    '''
    test_index = X.t >= test_start_time
    train_index = X.t < test_start_time
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    total = len(y_test) + len(y_train)
    # Scale to percent before rounding: rounding the fraction and then
    # multiplying by 100 prints float noise such as 28.999999999999996.
    # An empty split reports 0.0 instead of raising ZeroDivisionError.
    pct_test = round(100 * len(y_test) / total, 1) if total else 0.0
    print('Data used to test: {} %'.format(pct_test))
    return X_train, X_test, y_train, y_test

In [None]:
# split data
# Toggle between a chronological hold-out and a random hold-out.
split_by_time = True
if split_by_time:
    # Rows with t >= 39 become the test set (see time_split).
    X_train, X_test, y_train, y_test = time_split(X, y, 39)
else:
    # Random 80/20 split; no random_state is set, so this branch is not reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Cluster Customers

In [None]:
def kmeans(X_km, clusters, random_state=None):
    '''
    Fit one KMeans model per candidate cluster count and collect fit diagnostics.

    Inputs:
        X_km - feature matrix to cluster (one row per sample)
        clusters - iterable of cluster counts to try
        random_state - optional seed forwarded to KMeans so a sweep can be reproduced
    Returns:
        SSE_arr - within-cluster sum of squared distances, one entry per cluster count
        ss_arr - silhouette scores, one entry per cluster count greater than 1
    '''
    SSE_arr = []
    ss_arr = []
    for n_clusters in clusters:
        # n_jobs is deliberately not passed: KMeans dropped that argument in
        # scikit-learn 1.0 and passing it raises TypeError there.
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = model.fit_predict(X_km)

        # inertia_ is the sum of squared distances of samples to their closest
        # cluster centre, i.e. the SSE that was previously summed by hand.
        SSE_arr.append(model.inertia_)

        # The silhouette score is undefined for a single cluster.
        if n_clusters > 1:
            ss_arr.append(silhouette_score(X_km, labels))
    return SSE_arr, ss_arr

In [None]:
def elbow_plot(SSE_arr, clusters):
    '''
    Draw the elbow plot: SSE (y axis) against the number of clusters (x axis).
    Inputs:
        SSE_arr - SSE values, one per entry of clusters
        clusters - cluster counts; also used as the x tick positions
    '''
    _, ax = plt.subplots(figsize=(12, 8))
    ax.plot(clusters, SSE_arr)
    ax.set_title('Elbow Plot')
    ax.set_xticks(clusters)
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Sum of Squares Error (SSE)')
    ax.grid(alpha=0.3)

In [None]:
def silhouette_plot(ss_arr, clusters):
    '''
    Draw silhouette score (y axis) against the number of clusters (x axis).
    Inputs:
        ss_arr - silhouette scores, one per entry of clusters
        clusters - cluster counts; also used as the x tick positions
    '''
    _, ax = plt.subplots(figsize=(12, 8))
    ax.plot(clusters, ss_arr)
    ax.set_title('Silhouette Scores')
    ax.set_xticks(clusters)
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Silhouette Score')
    ax.grid(alpha=0.3)

In [None]:
# Zero-initialised 2000 x 40 float table: one row per i, one column 'j_<j>' per j.
columns = list(map('j_{}'.format, range(40)))
customer_df = pd.DataFrame(0.0, index=range(2000), columns=columns)

In [None]:
# Fill customer_df with the total price per (i, j): row i, column 'j_<j>'.
# The previous row loop used chained indexing (customer_df.iloc[i][col] += price),
# which only writes through when iloc returns a view and silently does nothing
# under pandas Copy-on-Write. It also doubled the totals whenever the cell was
# re-run. Aggregating once and rebuilding the table makes the cell idempotent.
spend = train_data.groupby(['i', 'j'])['price'].sum().unstack(fill_value=0.0)
spend.columns = ['j_{}'.format(int(j)) for j in spend.columns]
# Reindex onto the existing layout so i / j values with no rows stay at 0.
customer_df = spend.reindex(
    index=customer_df.index, columns=customer_df.columns, fill_value=0.0
).astype(float)

In [None]:
# Sweep cluster counts 1..99 (np.arange excludes num_clusts itself).
num_clusts = 100
clusters = np.arange(1, num_clusts)
# Silhouette scores start at 2 because kmeans() only computes them for counts > 1.
sil_clusters = np.arange(2, num_clusts)
SSE_arr, ss_arr = kmeans(customer_df, clusters)
elbow_plot(SSE_arr, clusters)
silhouette_plot(ss_arr, sil_clusters)

In [None]:
# Final clustering with 7 clusters (presumably chosen from the plots above — TODO confirm).
# The estimator gets its own name: binding it to `kmeans` would overwrite the
# kmeans() helper defined earlier, so re-running the sweep cell would fail.
# A fixed random_state keeps the cluster labels stable between runs.
customer_kmeans = KMeans(n_clusters=7, random_state=0)
pred = customer_kmeans.fit_predict(customer_df)

In [None]:
# Project customer_df onto its first two principal components; `pca` holds the
# transformed coordinates (not the fitted PCA object).
pca = PCA(2).fit_transform(customer_df)
# Scatter the two components, coloured by the cluster label in `pred`.
plt.scatter(pca[:,0], pca[:,1], c=pred);

In [None]:
# Number of rows assigned to each cluster label.
Counter(pred)

## Model

In [None]:
def test_acc(model, X_train, X_test, y_train, y_test):
    '''
    Determine negative mean absolute error for test data
    '''
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return -abs(np.array(pred) - np.array(y_test)).mean()

In [None]:
def class_crossval_plot(X_train, X_test, y_train, y_test, models, splits=6, scoring='neg_mean_absolute_error'):
    """
    Cross-validate several models, print their scores and draw a violin plot
    of the per-fold cross validation scores
    Inputs:
        X_train - training features (used for cross validation and for the final fit)
        X_test - held-out features passed to test_acc
        y_train - training targets
        y_test - held-out targets passed to test_acc
        models - list of sklearn models to test
        splits - number of KFold folds
        scoring - measure of best fit for models to use
    """
    results = []
    names = []
    # The splitter does not depend on the model, so build it once.
    kfold = KFold(n_splits=splits)
    print('Mod - Avg - Std Dev')
    print('---   ---   -------')
    for model in models:
        name = model.__class__.__name__
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=-1)
        results.append(cv_results)
        names.append(name)
        # Both numbers use 4 decimals ('{:4f}' was a width, not a precision).
        print('{}: {:.4f} ({:.4f})'.format(name, cv_results.mean(), cv_results.std()))
        # test_acc refits the model on the full training set and returns negative MAE.
        print('Test acc: {:.4f}'.format(test_acc(model, X_train, X_test, y_train, y_test)))
        print()

    fig = plt.figure(figsize=(16, 10))
    fig.suptitle('Cross Validation Comparison of Regression Models')
    ax = fig.add_subplot(111)
    sb.violinplot(data=results, orient='v', ax=ax)
    ax.set_xticklabels(names, rotation=50, ha='right')
    ax.set_xlabel('Model')
    ax.grid(alpha=0.4)
    # Lay out after the axes exist (calling it on an empty figure has no effect);
    # the rect leaves room for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.96])

In [None]:
# Initial Cross Validation
# Only the two tree ensembles are active; the commented-out lines are other
# candidate regressors that are currently skipped.
models = []
# models.append(Ridge())
# models.append(KNeighborsRegressor(n_jobs=-1))
# models.append(DecisionTreeRegressor())
models.append(RandomForestRegressor(n_jobs=3))
models.append(ExtraTreesRegressor(n_jobs=3))
# models.append(GradientBoostingRegressor())
# models.append(MLPRegressor())

# Prints CV mean/std and the held-out score per model, then draws the violin plot.
class_crossval_plot(X_train, X_test, y_train, y_test, models)

In [None]:
def clust_grid(model, params, X_train, y_train):
    """
    Run a grid search for a single model
    Inputs:
        model - sklearn estimator to tune (ie Lasso())
        params - parameter grid handed to GridSearchCV
        X_train - features used to fit the search
        y_train - targets used to fit the search
    Returns:
        tuple (best_params, best_score): the best parameter dict found by the
        search and its cross validation score
    """
    search = GridSearchCV(model, param_grid=params, n_jobs=-1, verbose=1).fit(X_train, y_train)
    return search.best_params_, search.best_score_

In [None]:
# grid search RF
rf = RandomForestRegressor()
# max_features=1.0 means "consider every feature", which is what 'auto' meant
# for forest regressors; 'auto' itself is rejected by scikit-learn >= 1.3.
params = {'n_estimators': [10, 30], 'max_features': [1.0, 0.3], 'min_samples_split': [2, 4], 'n_jobs': [-1]}
best_params, best_score = clust_grid(rf, params, X_train, y_train)
print(best_params)
print(best_score)

In [None]:
# grid search ET
et = ExtraTreesRegressor()
# max_features=1.0 means "consider every feature", which is what 'auto' meant
# for forest regressors; 'auto' itself is rejected by scikit-learn >= 1.3.
params = {'n_estimators': [10, 30], 'max_features': [1.0, 0.3], 'n_jobs': [-1]}
best_params, best_score = clust_grid(et, params, X_train, y_train)
print(best_params)
print(best_score)

## Final Model & Predictions

In [None]:
# Index the promotion schedule by j. Guarded so the cell can be re-run: once
# 'j' is the index it is no longer a column and set_index('j') would raise KeyError.
if 'j' in pred_data.columns:
    pred_data = pred_data.set_index('j')

In [None]:
# Preview the promotion schedule, now indexed by j.
pred_data.head()

In [None]:
# Build the prediction rows: for every i in 0..1999, one row per entry of the
# promotion schedule (pred_data is indexed by j), carrying that entry's
# advertised flag. Rows are ordered by i, then by schedule order.
# The records are collected first and the frame is built once: appending
# 80,000 rows one at a time is quadratic, DataFrame.append is gone in
# pandas >= 2.0, and building from ints keeps every column integer typed.
t = 49 # first week is 0, so 49 represents week 50
week_50 = pd.DataFrame(
    [
        {'i': i, 'j': int(j), 't': t, 'advertised': int(advertised)}
        for i in range(0, 2000)
        for j, advertised in pred_data['advertised'].items()
    ],
    columns=['i', 'j', 't', 'advertised'],
)

In [None]:
# One-hot encode j and i the same way as the training frame.
# NOTE(review): the resulting columns must match the training matrix X in
# name and order for predict() to be valid — confirm (e.g. compare with X.columns).
dummy_week_50 = pd.get_dummies(week_50, columns=['j', 'i'])

In [None]:
# Preview the encoded prediction frame.
dummy_week_50.head()

In [None]:
# Final models, fitted on all available data (X, y) rather than the train split.
# max_features=1.0 means "consider every feature", which is what 'auto' meant
# for forest regressors; 'auto' itself is rejected by scikit-learn >= 1.3.
et_model = ExtraTreesRegressor(n_estimators=30, max_features=1.0, n_jobs=-1)
rf_model = RandomForestRegressor(n_estimators=30, min_samples_split=4, max_features=1.0, n_jobs=-1)

et_model.fit(X, y)
rf_model.fit(X, y)

In [None]:
# Blend the two models: et_weight is the share given to ExtraTrees, the
# remainder goes to RandomForest (0.5 = plain average).
et_weight = 0.5
et_pred = et_model.predict(dummy_week_50)
rf_pred = rf_model.predict(dummy_week_50)
final_pred = et_weight * et_pred + (1 - et_weight) * rf_pred

In [None]:
# Write one "i,j,prediction" line per row of week_50. final_pred is positionally
# aligned with week_50 (it was predicted from the encoded week_50 rows), so the
# columns are zipped instead of looked up by index label.
# iterrows() is avoided: it is slow and upcasts a mixed-dtype row to float,
# which would turn the ids into "0.0". The ids are written as plain integers.
# The loop variable is not called `pred`, so the cluster labels are left intact.
with open('final_predictions.csv', 'w') as f:
    f.write('i,j,prediction\n')
    for i, j, prediction in zip(week_50['i'], week_50['j'], final_pred):
        f.write('{},{},{}\n'.format(int(i), int(j), prediction))

In [None]:
# Price over t for the rows with i == 0, one line per j (in order of first appearance).
foo = train_data[train_data.i == 0]
for j, bar in foo.groupby('j', sort=False):
    plt.plot(bar.t, bar.price, label=j)
plt.legend()
plt.grid(alpha=0.4)

In [None]:
# Preview the raw training data again.
train_data.head()