In [99]:
## import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [100]:
# read in data
# 'Created' is parsed as a datetime while loading; parse_dates=True would only
# try to parse the index, which here is the default integer index
train = pd.read_csv('training_set.csv', encoding='unicode_escape', parse_dates=['Created'])
#test = pd.read_csv('holdout_set.csv',encoding = 'unicode_escape', parse_dates=True)

In [101]:
# make sure the post timestamp is a datetime column, then show the column types
train = train.assign(Created=pd.to_datetime(train['Created']))
train.dtypes



Engagements                      int64
Followers at Posting             int64
Created                 datetime64[ns]
Type                            object
Description                     object
dtype: object

In [102]:
# create date based fields
# day of week through the vectorised .dt accessor (Monday=0 ... Sunday=6)
train['weekday'] = train['Created'].dt.weekday
# NOTE(review): the >= 4 cutoff flags Friday as well as Saturday/Sunday;
# confirm that Friday is meant to count as "weekend"
train['weekend'] = np.where(train['weekday'] >= 4, 'Y', 'N')
train['hour'] = train['Created'].dt.hour
# bucket the posting hour: 2-11 Morning, 12-15 Afternoon, 16-19 Evening,
# every other hour (20-23 and 0-1) Night
train['timeOfDay'] = np.select(
    [
        train['hour'].between(2, 11),
        train['hour'].between(12, 15),
        train['hour'].between(16, 19),
    ],
    ['Morning', 'Afternoon', 'Evening'],
    default='Night',
)
train['mon'] = train['Created'].dt.month
# months 7-9 are flagged as offseason
train['offseason'] = np.where(train['mon'].between(7, 9), 'Y', 'N')

In [6]:
# preview the first rows, including the date based fields
train.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description,weekday,weekend,hour,timeOfDay,mon,offseason
0,502093,36984682,2019-05-21 23:30:51,Video,The @raptors bench trio of @sergeibaka @norman...,1,N,23,Night,5,N
1,603380,36984682,2019-05-21 22:53:33,Video,@kyle_lowry7 pulls from deep for the @raptors ...,1,N,22,Night,5,N
2,603380,36984682,2019-05-21 22:19:58,Video,@k_mid22 with some english on the @bucks dime!,1,N,22,Night,5,N
3,725100,36984682,2019-05-21 22:02:41,Video,Kawhi punches it home with the left on TNT!,1,N,22,Night,5,N
4,661446,36984682,2019-05-21 20:47:49,Video,@giannis_an34 goes baseline early to rock the ...,1,N,20,Night,5,N


In [103]:
# lower-cased copy of the caption, used by the text based features
train = train.assign(DescLower=train['Description'].str.lower())

In [104]:
## create caption based fields
# Captions can be missing: the length is filled with 0, and every str.contains
# call passes na=False so a missing caption gives a 0 flag instead of NaN.
train['CapLength'] = train['Description'].str.len().fillna(0) # length of caption
# list of most popular players based on jersey sales from 2018 and 2019, including instagram specific names
popPlayerRegex = 'lebron|kingjames|curry|stephencurry|giannis|kyrie|embiid|harden|westbrook|russwest44|durant|easymoneysniper|porzingis|kporzee|simmons|bensimmons|dwyane|kawhi'
train['popPlayer'] = train['DescLower'].str.contains(popPlayerRegex, regex=True, na=False).astype(int)
# list of most popular teams based on jersey sales from 2018 and 2019
popTeamRegex = 'lakers|warriors|celtics|knicks|sixers|bucks|thunder|rockets|raptors|cavs'
train['popTeam'] = train['DescLower'].str.contains(popTeamRegex, regex=True, na=False).astype(int)
# post tags another insta handle ('@' is matched as a literal character)
train['at'] = train['DescLower'].str.contains('@', regex=False, na=False).astype(int)
# post contains a hashtag ('#' is matched as a literal character)
train['hash'] = train['DescLower'].str.contains('#', regex=False, na=False).astype(int)
# tries to capture games on national tv
natlTvRegex = 'tnt|espn|abc|nbatv'
train['natlTV'] = train['DescLower'].str.contains(natlTvRegex, regex=True, na=False).astype(int)
# posts about an event such as the all-star game, playoffs or draft
train['event'] = train['DescLower'].str.contains('allstar|all-star|finals|playoff|draft|slamdunk', regex=True, na=False).astype(int)

In [105]:
# preview the first rows, including the caption based fields
train.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description,weekday,weekend,hour,timeOfDay,mon,offseason,DescLower,CapLength,popPlayer,popTeam,at,hash,natlTV,event
0,502093,36984682,2019-05-21 23:30:51,Video,The @raptors bench trio of @sergeibaka @norman...,1,N,23,Night,5,N,the @raptors bench trio of @sergeibaka @norman...,95.0,0,1,1,0,0,0
1,603380,36984682,2019-05-21 22:53:33,Video,@kyle_lowry7 pulls from deep for the @raptors ...,1,N,22,Night,5,N,@kyle_lowry7 pulls from deep for the @raptors ...,64.0,0,1,1,0,1,0
2,603380,36984682,2019-05-21 22:19:58,Video,@k_mid22 with some english on the @bucks dime!,1,N,22,Night,5,N,@k_mid22 with some english on the @bucks dime!,46.0,0,1,1,0,0,0
3,725100,36984682,2019-05-21 22:02:41,Video,Kawhi punches it home with the left on TNT!,1,N,22,Night,5,N,kawhi punches it home with the left on tnt!,43.0,1,0,0,0,1,0
4,661446,36984682,2019-05-21 20:47:49,Video,@giannis_an34 goes baseline early to rock the ...,1,N,20,Night,5,N,@giannis_an34 goes baseline early to rock the ...,57.0,1,0,1,0,1,0


## modeling

In [106]:
# variables used in the model, including the response variable (Engagements, listed first)
model_columns = [
    'Engagements',
    'Type', 'weekend', 'offseason', 'timeOfDay',
    'CapLength',
    'popPlayer', 'popTeam', 'at', 'hash', 'natlTV', 'event',
]
train_features = train[model_columns]

In [107]:
# preview the columns selected for modeling
train_features.head()

Unnamed: 0,Engagements,Type,weekend,offseason,timeOfDay,CapLength,popPlayer,popTeam,at,hash,natlTV,event
0,502093,Video,N,N,Night,95.0,0,1,1,0,0,0
1,603380,Video,N,N,Night,64.0,0,1,1,0,1,0
2,603380,Video,N,N,Night,46.0,0,1,1,0,0,0
3,725100,Video,N,N,Night,43.0,1,0,0,0,1,0
4,661446,Video,N,N,Night,57.0,1,0,1,0,1,0


In [108]:
# one hot encode categorical variables and create dataframe used in modeling

# columns expanded into indicator columns (the 0/1 caption flags are encoded as well)
cat_columns = [
    'Type', 'weekend', 'offseason', 'timeOfDay',
    'popPlayer', 'popTeam', 'at', 'hash', 'natlTV', 'event',
]
train_features_model = pd.get_dummies(train_features, columns=cat_columns)

train_features_model.head()

Unnamed: 0,Engagements,CapLength,Type_Album,Type_Photo,Type_Video,weekend_N,weekend_Y,offseason_N,offseason_Y,timeOfDay_Afternoon,...,popTeam_0,popTeam_1,at_0,at_1,hash_0,hash_1,natlTV_0,natlTV_1,event_0,event_1
0,502093,95.0,0,0,1,1,0,1,0,0,...,0,1,0,1,1,0,1,0,1,0
1,603380,64.0,0,0,1,1,0,1,0,0,...,0,1,0,1,1,0,0,1,1,0
2,603380,46.0,0,0,1,1,0,1,0,0,...,0,1,0,1,1,0,1,0,1,0
3,725100,43.0,0,0,1,1,0,1,0,0,...,1,0,1,0,1,0,0,1,1,0
4,661446,57.0,0,0,1,1,0,1,0,0,...,1,0,0,1,1,0,0,1,1,0


In [110]:
## create features and response data frames
# the response is selected and dropped by name, so the split does not depend
# on 'Engagements' being the first column of the modeling frame
response = train_features_model['Engagements']
features = train_features_model.drop(columns='Engagements')

## cross-validation helpers

In [111]:
# get CV indices
def getIndices(K, data=None, random_state=None):
    """Return shuffled K-fold train/test row positions.

    Parameters
    ----------
    K : int
        Number of folds.
    data : array-like, optional
        Object whose rows are split. Defaults to the notebook-level
        ``train_features_model`` frame, which keeps existing ``getIndices(K)``
        calls working unchanged.
    random_state : int or RandomState, optional
        Passed to ``KFold``. With the default ``None`` the shuffle draws from
        numpy's global random state, exactly as before.

    Returns
    -------
    (train_ind, test_ind) : tuple of lists
        Two lists of length K; element ``k`` of each holds the integer row
        positions of the training / test part of fold ``k``.
    """
    if data is None:
        data = train_features_model
    kf = KFold(n_splits=K, shuffle=True, random_state=random_state)
    train_ind = []
    test_ind = []

    for train_index, test_index in kf.split(data):
        train_ind.append(train_index)
        test_ind.append(test_index)
    return train_ind, test_ind

In [112]:
## MAPE function
## calculated for each "column" of predicted y values matrix corresponding to a different combo of params from CV
def calc_mape(y_pred,y_true):
    """Mean absolute percentage error, in percent, for each column of predictions.

    Parameters
    ----------
    y_pred : array-like, shape (n_samples, n_models) or (n_samples,)
        Predicted values, one column per model. A 1-D input is treated as a
        single column.
    y_true : array-like, shape (n_samples,)
        Actual values. A zero entry makes the percentage error for that row
        infinite or NaN, since the error is divided by the actual value.

    Returns
    -------
    list of float
        One MAPE value per column of ``y_pred``.

    Raises
    ------
    ValueError
        If ``y_true`` is not 1-D, ``y_pred`` has more than two dimensions, or
        the number of rows differs between the two.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_pred.ndim == 1:
        # a single vector of predictions is one "model" column
        y_pred = y_pred.reshape(-1, 1)
    if y_true.ndim != 1 or y_pred.ndim != 2 or y_pred.shape[0] != y_true.shape[0]:
        raise ValueError(
            "y_true must be 1-D and y_pred must have one row per element of y_true; "
            "got shapes " + str(y_pred.shape) + " and " + str(y_true.shape)
        )
    # absolute percentage error of every prediction, same layout as y_pred
    abs_pct_err = np.abs((y_true[:, None] - y_pred) / y_true[:, None])
    return [float(np.mean(abs_pct_err[:, j])) * 100 for j in range(abs_pct_err.shape[1])]


### boosted tree tuning

In [123]:
## hyper-parameter grid
## coarse search first: lambda = .001, .01, .1 / trees = 50, 100, 200 / max_depth = 2..8;
## the values below are the refined ranges chosen after those initial results
lambdas = [.085,.09,.095]
n_tree = [105,110,115]
max_depth = [3,4]

np.random.seed(132)
## cross-validation setup: K folds, repeated Nrep times
K = 10
Nrep = 5

## every (lambda, trees, depth) combination, ordered lambda -> trees -> depth
param_grid = [(lam, trees, depth) for lam in lambdas for trees in n_tree for depth in max_depth]
## number of models to be tested
n_models = len(param_grid)

## MAPE per repetition and model, the parameter combo of each model, and the
## out-of-fold predictions (one column per model)
MAPE = np.zeros((Nrep, n_models))
models = np.array(param_grid, dtype=float)
y_pred_bs = np.zeros((train_features_model.shape[0], n_models))
## actual engagements
y_act = response.tolist()

## each repetition draws fresh folds, fits every model on every fold and fills
## one row of the Nrep x n_models MAPE matrix
for rep in range(Nrep):
    train_folds, test_folds = getIndices(K)
    for train_idx, test_idx in zip(train_folds, test_folds):
        train_rows = train_idx.tolist()
        test_rows = test_idx.tolist()
        for col, (lam, trees, depth) in enumerate(param_grid):
            booster = GradientBoostingRegressor(learning_rate=lam, n_estimators=trees, max_depth=depth)
            booster.fit(features.iloc[train_rows], response.iloc[train_rows])
            y_pred_bs[test_rows, col] = booster.predict(features.iloc[test_rows])
    MAPE[rep, :] = calc_mape(y_pred_bs, y_act)


In [124]:
## average CV MAPE per model tested (mean over the repetitions)
mape_total = MAPE.mean(axis=0)
# best (lowest) MAPE score
best_mape = min(mape_total)
## parameters of the model with the lowest MAPE
best_row = models[np.argmin(mape_total)]
best_l = best_row[0]
best_t = int(best_row[1])
best_m = int(best_row[2])
# print the best model
summary_parts = [
    "lambda=" + str(best_l),
    "ntrees=" + str(best_t),
    "max_depth=" + str(best_m),
    "MAPE=" + str(best_mape),
]
print("Best Model:  " + ", ".join(summary_parts))

Best Model:  lambda=0.095, ntrees=105, max_depth=4, MAPE=9.324757584058904


In [80]:
## fit the best boosted tree on a train/test split of the data

In [125]:
# hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible
# NOTE(review): these rows also took part in the CV used to pick the
# hyper-parameters, so the test MAPE is not a fully independent estimate
X_train, X_test, y_train, y_test = train_test_split(
    features,
    response,
    test_size=0.2,
    random_state=132,
)

In [126]:
# refit a boosted tree with the CV-selected hyper-parameters on the training split
best_params = {'learning_rate': best_l, 'n_estimators': best_t, 'max_depth': best_m}
b = GradientBoostingRegressor(**best_params)
b.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.095, loss='ls', max_depth=4,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=105,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [127]:
# predict engagements for the held-out test rows with the fitted boosted tree
pred = b.predict(X_test)

In [128]:
# MAPE (in percent) of the boosted tree on the held-out test rows
abs_pct_err = np.abs((y_test - pred) / y_test)
np.mean(abs_pct_err) * 100

9.25709170404691

In [129]:
# feature importance of the fitted boosted tree, one row per model feature
df = pd.DataFrame({
    'features': list(features.columns),
    'impt': list(b.feature_importances_),
})
df

Unnamed: 0,features,impt
0,CapLength,0.339958
1,Type_Album,0.005159
2,Type_Photo,0.011027
3,Type_Video,0.263596
4,weekend_N,0.019686
5,weekend_Y,0.015754
6,offseason_N,0.028462
7,offseason_Y,0.024007
8,timeOfDay_Afternoon,0.014368
9,timeOfDay_Evening,0.009077


### linear regression

In [121]:
# repeated K-fold CV for a plain linear regression
# seed numpy's global RNG so the fold shuffling, and hence the CV MAPE, is
# reproducible regardless of which cells ran before this one
np.random.seed(132)
K = 10
Nrep = 5
n_models = 1

# MAPE per repetition and the out-of-fold predictions of the single model
MAPE  = np.zeros((Nrep,n_models))
y_pred_lr = np.zeros((train_features_model.shape[0],n_models))
y_act = response.tolist()

for nr in range(Nrep):
    train_folds, test_folds = getIndices(K)
    for train_idx, test_idx in zip(train_folds, test_folds):
        lr = LinearRegression()
        lr.fit(features.iloc[train_idx], response.iloc[train_idx])
        # only one model is evaluated, so its predictions go into column 0
        y_pred_lr[test_idx, 0] = lr.predict(features.iloc[test_idx])
    MAPE[nr,:] = calc_mape(y_pred_lr,y_act)

In [122]:
# average CV MAPE over all repetitions
np.mean(MAPE)

10.205976026950305