The goal of this notebook is to combine everything previously gathered about the Instagram engagements with other information in one place, so that it can be turned into a deliverable product.

This means joining the actual Instagram dataset with the extra research performed on the hashtags and @-mentions found in each description.

# Step 0 : Import dependencies and visualization tools

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style('darkgrid')
sns.set_palette('coolwarm')
%matplotlib inline

# Step 1 : Load in Instagram Dataset, Derive Preliminary Features that require no outside data. 

In [2]:
# Read the raw training posts and replace every missing cell with an empty string.
instas = pd.read_csv(
    'Business Analytics/training_set.csv',
    encoding='unicode_escape',
).fillna('')

In [16]:
import re

# Simple per-caption text statistics.
instas['Description_Len'] = instas['Description'].map(len)
instas['num@s'] = instas['Description'].map(lambda text: text.count('@'))
instas['num#s'] = instas['Description'].map(lambda text: text.count('#'))
instas['num?s'] = instas['Description'].map(lambda text: text.count('?'))  # rough proxy for emojis
instas['numCAPs'] = instas['Description'].map(lambda text: len(re.findall(r'[A-Z]', text)))

def find_dunks(z):
    """Return 1 if the lower-cased text contains any dunk keyword as a substring, else 0.

    Matching is plain substring containment, so a keyword embedded in a longer
    word (e.g. 'jam' inside 'James') also counts as a hit.
    """
    lowered = z.lower()
    keywords = ('slam', 'jam', 'dunk', 'putback', 'rim', 'windmill', 'flush', 'oop')
    return int(any(keyword in lowered for keyword in keywords))

def find_buzzer_beaters(z):
    """Return 1 if the lower-cased text contains any buzzer-beater keyword as a substring, else 0.

    Matching is plain substring containment, so a keyword embedded in a longer
    word (e.g. 'beat' inside 'unbeatable') also counts as a hit.
    """
    lowered = z.lower()
    keywords = ('buzz', 'beat', 'clock', 'winner')
    return int(any(keyword in lowered for keyword in keywords))

# Keyword flags derived from the caption text (crude substring heuristics).
instas['Buzzer Beater?'] = instas['Description'].apply(lambda z: find_buzzer_beaters(z))  # rough proxy for buzzer beaters
instas['Dunk?'] = instas['Description'].apply(lambda z: find_dunks(z))  # rough proxy for dunks


import datetime

# 'Created' is split on spaces: token 0 is the date, token 1 the time and
# token 2 the timezone.
instas['Timezone'] = instas['Created'].str.split(' ',expand =True)[2]
instas['Date'] = instas['Created'].str.split(' ',expand = True)[0]
instas['Month'] = instas['Date'].str.split('-',expand = True)[1]

instas['Time'] = instas['Created'].str.split(' ',expand = True)[1]
instas['Hour'] = instas['Time'].str.split(':',expand=True)[0]

# Keep Date as a regular datetime64 column. The previous
# `.astype(datetime.datetime)` is not a valid pandas dtype (recent pandas
# versions raise "dtype not understood"), and it was only needed to call
# .weekday() element by element.
instas['Date'] = pd.to_datetime(instas['Date'])

# Monday == 0 ... Sunday == 6, same convention as datetime.weekday().
instas['Weekday'] = instas['Date'].dt.weekday

instas['Year'] = instas['Created'].apply(lambda z: z.split('-')[0])

In [4]:
#utility functions before data processing.
def find_dunks(z):
    """Flag text that mentions a dunk.

    The text is lower-cased and checked for each keyword by substring
    containment; returns 1 on the first hit and 0 when nothing matches.
    """
    text = z.lower()
    hits = [word for word in ('slam', 'jam', 'dunk', 'putback', 'rim', 'windmill', 'flush', 'oop') if word in text]
    return 1 if hits else 0

def find_buzzer_beaters(z):
    """Flag text that mentions a buzzer beater.

    The text is lower-cased and checked for each keyword by substring
    containment; returns 1 on the first hit and 0 when nothing matches.
    """
    text = z.lower()
    hits = [word for word in ('buzz', 'beat', 'clock', 'winner') if word in text]
    return 1 if hits else 0


def process_data(file,training=False):
    """Load a CSV of Instagram posts and turn it into a model-ready feature frame.

    Only features derivable from the file itself are built here (caption text
    statistics, keyword flags and posting-time parts); exogenous features such
    as tagged-profile or hashtag statistics are not added by this function.

    Parameters
    ----------

    file : str or file-like
        CSV file of the Instagram posts. Must contain the columns
        'Description', 'Created' and 'Followers at Posting', plus
        'Engagements' when ``training`` is True. 'Created' is expected to
        hold three space-separated tokens (date, time, timezone) with a
        '-'-separated date whose first field is the year.

    training : bool
        Whether or not the input file is the training set. If True, the
        target ``y`` is returned in addition to ``X``.


    Returns
    --------

    X : DataFrame
        One-hot encoded feature frame. Derived columns:

        * Description_Len : number of characters in the caption
        * num@s / num#s / num?s : counts of '@', '#' and '?' in the caption
          ('?' is used as a rough proxy for emojis)
        * numCAPs : number of ASCII capital letters in the caption
        * Buzzer Beater? / Dunk? : 0/1 keyword flags
        * Timezone, Month, Hour, Year : string parts of 'Created'
          (one-hot encoded)
        * Weekday : integer day of week, Monday == 0

        The raw 'Description', 'Created' and 'Followers at Posting' columns
        are removed, as is 'Engagements' whenever it is present.

    y : Series, only when ``training`` is True
        The 'Engagements' column, i.e. the values to train on.

    Notes
    -----
    The dummy columns depend on the values present in ``file``, so frames
    built from different files may need to be re-aligned to a common set of
    columns before being passed to the same model.
    """

    import re

    import pandas as pd

    instas = pd.read_csv(file,encoding = 'unicode_escape').fillna('')

    # Caption text statistics.
    instas['Description_Len'] = instas['Description'].apply(len)
    instas['num@s'] = instas['Description'].apply(lambda z: z.count('@'))
    instas['num#s'] = instas['Description'].apply(lambda z: z.count('#'))
    instas['num?s'] = instas['Description'].apply(lambda z: z.count('?'))  # rough proxy for emojis
    instas['numCAPs'] = instas['Description'].apply(lambda z: len(re.findall(r'[A-Z]',z)))

    # Keyword flags (crude substring heuristics).
    instas['Buzzer Beater?'] = instas['Description'].apply(find_buzzer_beaters)
    instas['Dunk?'] = instas['Description'].apply(find_dunks)

    # Split 'Created' once: token 0 is the date, 1 the time, 2 the timezone.
    created_parts = instas['Created'].str.split(' ',expand=True)
    date_text = created_parts[0]
    time_text = created_parts[1]

    # The raw date and time strings are deliberately NOT stored as columns:
    # one-hot encoding them would create one dummy column per distinct
    # timestamp instead of a usable feature.
    instas['Timezone'] = created_parts[2]
    instas['Month'] = date_text.str.split('-',expand=True)[1]
    instas['Hour'] = time_text.str.split(':',expand=True)[0]

    # Vectorised day-of-week (Monday == 0). This replaces the former
    # `.astype(datetime.datetime)` round-trip, which is not a valid pandas
    # dtype.
    instas['Weekday'] = pd.to_datetime(date_text).dt.weekday
    instas['Year'] = instas['Created'].apply(lambda z: z.split('-')[0])

    non_feature_columns = ['Description','Created','Followers at Posting']

    if training:
        # Raises KeyError if the target column is missing from a training file.
        y = instas['Engagements']

    # Never let the target leak into the features, including the case of a
    # holdout file that ships an (empty) 'Engagements' column.
    if 'Engagements' in instas.columns:
        non_feature_columns.append('Engagements')

    X = pd.get_dummies(instas.drop(columns=non_feature_columns))

    if training:
        return X,y
    #otherwise,
    return X
    

In [5]:
# Build the feature frame for the holdout posts (no target is returned).
X_pred = process_data(file='Business Analytics/holdout_set.csv', training=False)

# Step 2 : Load In Exogenous Features Such as Hashtag Popularity and Profile Clout

In [6]:
# TODO: load the hashtag-popularity and profile-clout features here.

In [23]:
def normalize(df):
    """Min-max scale every column of ``df`` into the range [0, 1].

    Parameters
    ----------
    df : DataFrame
        Frame whose columns support subtraction and division (numeric data).

    Returns
    -------
    DataFrame
        A scaled copy; ``df`` itself is not modified. A column whose minimum
        equals its maximum is mapped to 0.0 rather than being divided by
        zero, which would turn the whole column into NaN. Missing values
        stay missing.
    """
    result = df.copy()
    for feature_name in df.columns:
        column = df[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value
        if value_range == 0:
            # Constant column: every non-missing entry equals the minimum, so
            # the shifted values are already the desired zeros.
            result[feature_name] = (column - min_value).astype(float)
        else:
            result[feature_name] = (column - min_value) / value_range
    return result

# Target: the 'Engagements' value of each post.
y = instas['Engagements']

# Features: everything except the target, the raw text/timestamp columns and
# the follower count; remaining string columns are one-hot encoded.
X = pd.get_dummies(
    instas.drop(
        columns=['Date', 'Time', 'Engagements', 'Description', 'Created', 'Followers at Posting']
    ).copy()
)

In [24]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Hold out 40% of the posts for evaluation. The split is seeded so that the
# scores printed below can be reproduced on a fresh Restart & Run All;
# without a seed every run evaluates on a different random subset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Gradient-boosted regressor using the DART booster with 500 trees.
model = xgb.XGBRegressor(n_estimators=500,booster='dart')

print("Model used: ", model)
model.fit(X_train,y_train)
# NOTE(review): .score() is expected to be R^2 (scikit-learn regressor convention) - confirm.
print("Training score: ", model.score(X_train,y_train))
print("Original Score", model.score(X_test,y_test))

def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` computed over the
        entries whose true value is non-zero. Entries with a zero target are
        skipped because their percentage error is undefined (dividing by
        them would make the whole score inf/nan). Returns NaN when no entry
        has a non-zero target.
    """
    import numpy as np
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    # Broadcast the denominator to the error's shape so the mask below lines
    # up even when one of the inputs is a scalar.
    scale = np.broadcast_to(np.abs(y_true), abs_error.shape)
    valid = scale != 0
    if not valid.any():
        return np.nan
    return np.mean(abs_error[valid] / scale[valid]) * 100

# Compare the held-out targets with the model's predictions for the same rows.
y_true, y_pred = y_test, model.predict(X_test)

print("MAPE Original Score ", mean_absolute_percentage_error(y_true, y_pred))


Model used:  XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Training score:  0.9314384207415729
Original Score 0.9066032298119966
MAPE Original Score  6.44393466427359


In [25]:
X

Unnamed: 0,Description_Len,num@s,num#s,num?s,numCAPs,Buzzer Beater?,Dunk?,Weekday,Type_Album,Type_Photo,...,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Year_2017,Year_2018,Year_2019
0,95,4,0,0,2,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
1,64,2,0,0,3,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
2,46,2,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
3,43,0,0,0,4,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
4,57,1,0,0,3,0,1,1,0,0,...,0,0,0,1,0,0,0,0,0,1
5,104,2,0,0,8,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
6,106,1,2,0,11,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
7,88,2,0,2,4,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
8,75,2,0,0,10,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
9,121,2,1,0,5,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
