In [1]:
import pandas as pd
# Show up to 80 characters of each cell when displaying frames (captions are long).
# The fully-qualified key is used because pandas resolves shorthand keys by pattern
# matching, which can break if a future option with a similar name is added.
pd.set_option("display.max_colwidth", 80)

### Merging multiple CSVs

In [2]:
import glob
# Read every per-brand export matching "*ig.csv" and stack them into one frame,
# tagging each row with the brand taken from the file name ("<brand>_ig.csv").
# Frames are collected in a list and concatenated once: growing a DataFrame with
# concat inside the loop copies all previous rows on every iteration.
# The file list is sorted because glob order depends on the filesystem, which
# would otherwise make the row order (and any seeded split) machine-dependent.
frames = []
for file in sorted(glob.glob("*ig.csv")):
    brand = file.split("_ig.csv")[0]
    if brand == "samsung":
        # The samsung export is relabelled; presumably to match the account
        # handle used in the brand metadata -- confirm against brand_ig_data.csv.
        brand = "samsungmobile"
    temp_df = pd.read_csv(file)
    temp_df["brand"] = brand
    frames.append(temp_df)
# pd.concat raises on an empty list, so fall back to an empty frame when no file matched.
df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [3]:
def handle_numbers(string):
    """Convert a human-formatted count to an int.

    Accepts plain numbers (``1234``, ``"1,234"``, ``"3.0"``) and abbreviated
    counts with a ``k`` (thousand) or ``m`` (million) suffix, in either case
    (``"12k"``, ``"1.5K"``, ``"2m"``, ``"1.25m"``).

    Parameters
    ----------
    string : object
        Value to parse; it is passed through ``str`` first, so ints and floats
        are accepted as well as strings.

    Returns
    -------
    int
        The parsed count. Plain decimal values are truncated toward zero;
        suffixed values are rounded to the nearest integer after scaling.

    Raises
    ------
    ValueError
        If the text is not numeric (for example ``""`` or ``"nan"``).
    """
    text = str(string).replace(",", "").strip().lower()
    multipliers = {"k": 1000, "m": 1000000}
    if text and text[-1] in multipliers:
        # Scale numerically rather than by pasting zeros onto the string: the
        # string approach turned "12k" into 1200 and mis-scaled values with more
        # than one decimal place such as "1.25k".
        # round() absorbs float noise such as 2.3 * 1000 == 2299.9999999999995.
        return int(round(float(text[:-1]) * multipliers[text[-1]]))
    return int(float(text))
        

In [4]:
# Keep only the rows whose media type is "image"; every other media type is dropped.
df = df.loc[df["media"] == "image"]

In [5]:
# Parse the like counts into ints with handle_numbers.
# assign() returns a new frame, so nothing is written into `df`, which at this
# point is the result of a boolean filter (assigning a column on such a slice
# is what triggers pandas' SettingWithCopyWarning).
df = df.assign(likes=df["likes"].apply(handle_numbers))

In [6]:
# Drop posts that have no caption; the caption text is the input to the vectorizer.
df = df.dropna(subset=["caption"])

### Feature Engineering

In [7]:
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

In [8]:
def get_text(df):
    """Return the ``caption`` column of ``df`` as a Series.

    Used as the function behind a FunctionTransformer so the text branch of the
    feature union receives only the caption text.
    """
    return df["caption"]

In [9]:
# Wrap get_text as a transformer; validate=False passes the DataFrame through
# unchanged instead of converting it to a numeric array first.
get_text_ft = FunctionTransformer(func=get_text, validate=False)

Get page metadata

In [10]:
# Load the per-brand account metadata from the working directory.
meta_df = pd.read_csv(filepath_or_buffer="brand_ig_data.csv")

In [11]:
# Keep only the join key (brand_ig) and the three account-level count columns.
meta_columns = ["brand_ig", "n_posts", "n_followers", "n_following"]
meta_df = meta_df[meta_columns]

In [12]:
# Rename "brand" to "brand_ig" so it matches the key column of meta_df for the merge.
# The result is rebound instead of using inplace=True: `df` comes from boolean
# filtering, and an in-place edit of such a frame is a hidden-state hazard on re-run.
# index=str is kept from the original call; it converts the index labels to strings.
df = df.rename(index=str, columns={"brand": "brand_ig"})

In [13]:
# Left join: every post is kept and gains its brand's metadata columns;
# posts whose brand_ig has no row in meta_df get NaN in those columns.
final_df = df.merge(meta_df, how="left", on="brand_ig")

In [14]:
# Convert the brand's post count to an int using the shared count parser.
final_df["n_posts"] = final_df["n_posts"].map(handle_numbers)

In [15]:
# Convert the brand's follower count to an int using the shared count parser.
final_df["n_followers"] = final_df["n_followers"].map(handle_numbers)

In [16]:
# The models are trained on log(likes), and log(0) is undefined, so posts with
# zero likes are removed before modelling.
final_df = final_df[final_df["likes"].ne(0)]

In [17]:
def get_brand_meta(df):
    """Return the brand-level numeric features of ``df``.

    The result is a two-column DataFrame with ``n_posts`` first and
    ``n_followers`` second, in that order.
    """
    meta_columns = ["n_posts", "n_followers"]
    return df[meta_columns]

In [18]:
# Wrap get_brand_meta as a transformer; validate=False hands it the DataFrame as-is.
get_brand_meta_ft = FunctionTransformer(func=get_brand_meta, validate=False)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over the captions:
#   min_df=4    -> ignore terms that occur in fewer than 4 captions
#   max_df=0.3  -> ignore terms that occur in more than 30% of captions
#   stop_words  -> drop scikit-learn's built-in English stop words
vect = CountVectorizer(
    min_df=4,
    max_df=0.3,
    stop_words='english',
)

In [20]:
# Feature matrix = caption word counts followed by the brand metadata columns.
# The metadata branch is listed last, so its two columns (n_posts, n_followers)
# are the final two columns of the combined matrix.
union = make_union(
    make_pipeline(get_text_ft, vect),
    get_brand_meta_ft,
)

In [21]:
from sklearn.model_selection import train_test_split
# Split rows into train/test with a fixed seed. The full frame is passed as X
# (the union picks out the columns it needs); the target is the raw like count.
X_train, X_test, y_train, y_test = train_test_split(
    final_df,
    final_df["likes"],
    random_state=1,
)

In [22]:
# Fit the feature union (including the vectorizer vocabulary) on the training
# rows only and build the training feature matrix.
X_train_meta_dtm = union.fit_transform(X=X_train)

In [23]:
# Build the test feature matrix with the already-fitted union (no refitting).
X_test_meta_dtm = union.transform(X=X_test)

### Model Selection

In [24]:
import sklearn.linear_model
import sklearn.datasets
import sklearn.svm
from sklearn.metrics import r2_score
import sklearn.feature_extraction.text
import sklearn.utils.sparsefuncs

In [25]:
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` element-wise after
    converting both inputs to NumPy arrays. Entries of ``y_true`` equal to zero
    produce a division by zero (inf/nan under NumPy semantics).
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

Standard linear regression (ordinary least squares)

In [26]:
# Baseline: ordinary least squares on the combined features, with the
# natural log of the like count as the target.
regression = sklearn.linear_model.LinearRegression()
regression.fit(X=X_train_meta_dtm, y=np.log(y_train))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
# Predictions are in log-likes space, matching the training target.
yhat_lr = regression.predict(X=X_test_meta_dtm)

In [28]:
# Both metrics are computed on log(likes), not on raw like counts.
print("r2: %.3f" % r2_score(y_true=np.log(y_test), y_pred=yhat_lr))
print("mape: %.3f" % mean_absolute_percentage_error(y_true=np.log(y_test), y_pred=yhat_lr))

r2: 0.697
mape: 20.260


Lasso (L1-penalized regression)

In [29]:
# L1-penalised linear model; max_iter is set to 3000 iterations of the solver.
lasso = sklearn.linear_model.Lasso(
    max_iter=3000,
    alpha=0.1,
)

In [30]:
# Same features and log-likes target as the other models, so scores are comparable.
lasso.fit(X=X_train_meta_dtm, y=np.log(y_train))

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=3000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [31]:
# Predictions are in log-likes space, matching the training target.
yhat_lasso = lasso.predict(X=X_test_meta_dtm)

In [32]:
# Both metrics are computed on log(likes), not on raw like counts.
print("r2: %.3f" % r2_score(y_true=np.log(y_test), y_pred=yhat_lasso))
print("mape: %.3f" % mean_absolute_percentage_error(y_true=np.log(y_test), y_pred=yhat_lasso))

r2: 0.333
mape: 33.447


AdaBoost

In [33]:
from sklearn.ensemble import AdaBoostRegressor

In [34]:
# AdaBoost with 200 boosting rounds and the library's default base learner.
# The original passed base_estimator=None, which only restates the default;
# it is omitted because that keyword was renamed and later removed in newer
# scikit-learn releases, so passing it makes the cell fail there.
ada = AdaBoostRegressor(
    n_estimators=200,
    learning_rate=1,
    loss="linear",
    random_state=1,
)

In [35]:
# Same features and log-likes target as the other models, so scores are comparable.
ada.fit(X=X_train_meta_dtm, y=np.log(y_train))

AdaBoostRegressor(base_estimator=None, learning_rate=1, loss='linear',
         n_estimators=200, random_state=1)

In [36]:
# Predictions are in log-likes space, matching the training target.
yhat_ada = ada.predict(X=X_test_meta_dtm)

In [37]:
# Both metrics are computed on log(likes), not on raw like counts.
print("r2: %.3f" % r2_score(y_true=np.log(y_test), y_pred=yhat_ada))
print("mape: %.3f" % mean_absolute_percentage_error(y_true=np.log(y_test), y_pred=yhat_ada))

r2: 0.667
mape: 21.684


Gradient Boosted Trees

In [38]:
from sklearn.ensemble import GradientBoostingRegressor

In [39]:
# Gradient-boosted trees with the library's default squared-error loss.
# Three keywords from the original call are omitted: loss="ls",
# min_impurity_split=None and presort="auto". Each only restated the default,
# and each was deprecated and then removed in later scikit-learn releases, so
# passing them makes the cell fail there. Every remaining value is unchanged.
gbr = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    criterion="friedman_mse",
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_depth=3,
    min_impurity_decrease=0.0,
    init=None,
    random_state=1,
    max_features=None,
    alpha=0.9,
    verbose=0,
    max_leaf_nodes=None,
    warm_start=False,
)

In [40]:
# Same features and log-likes target as the other models, so scores are comparable.
gbr.fit(X=X_train_meta_dtm, y=np.log(y_train))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=1,
             subsample=1.0, verbose=0, warm_start=False)

In [41]:
# Predictions are in log-likes space, matching the training target.
yhat_gbr = gbr.predict(X=X_test_meta_dtm)

In [42]:
# Both metrics are computed on log(likes), not on raw like counts.
print("r2: %.3f" % r2_score(y_true=np.log(y_test), y_pred=yhat_gbr))
print("mape: %.3f" % mean_absolute_percentage_error(y_true=np.log(y_test), y_pred=yhat_gbr))

r2: 0.797
mape: 15.978


### Model Interpretation

In [43]:
# Minimum feature importance for a feature to be kept in the interpretation cells.
threshold = 1e-3

In [44]:
# Share of total importance carried by the features at or above the threshold,
# i.e. how much is retained when the long tail of tiny importances is ignored.
np.sum(gbr.feature_importances_[gbr.feature_importances_ >= threshold])

0.97270708960411267

In [45]:
# Boolean mask over all model features (word columns followed by the two
# metadata columns): True where the importance reaches the threshold.
impt_feats = np.greater_equal(gbr.feature_importances_, threshold)

In [46]:
# Column indices of the important word features; [:-2] drops the two trailing
# metadata columns so the indices refer to the vectorizer vocabulary only.
impt_ind = np.flatnonzero(impt_feats[:-2])

In [51]:
# Look up the vocabulary terms behind the important word columns
# (the mask is trimmed by two to exclude the trailing metadata columns).
np.asarray(vect.get_feature_names())[impt_feats[:-2]]

array(['777tour', 'a1', 'acer', 'acerwin10', 'available', 'bea1', 'bio',
       'capture', 'capturedonp9', 'ces', 'ces2014', 'cute', 'debut',
       'domore', 'dual', 'elifee6', 'experience', 'film', 'find5',
       'flipkart', 'flyme', 'fullvision', 'g2', 'gioneee7',
       'gioneestargima', 'guys', 'gwatchr', 'hewlettpackard', 'howiflex',
       'hp', 'htcgoesfullfrontal', 'htcone', 'htcrihanna', 'htcu11',
       'huaweimate10', 'huaweip10', 'huaweip8', 'ifa15', 'introducing',
       'lenovo', 'lgg6', 'lgv30', 'lgv30sthinq', 'lounge', 'makesmiles',
       'meizu', 'mi', 'moto360', 'motox', 'motoxmade', 'mymotox',
       'mysuperg', 'new', 'nokia', 'nyfw', 'oneplus5t', 'oppofind5',
       'oppon1', 'optimus', 'osnap', 'p7max', 'prefer', 'rai', 'redmi',
       'redminote3', 's11', 'selfieexpert', 'selfieflash', 'selfiestan',
       'share', 'shotononeplus', 'sony', 'sonycamera', 'sonynex', 'tablet',
       'taken', 'thank', 'u11', 'v20', 'vivomobile',
       'weekenderwithmicromax', 'x

In [52]:
# NOTE(review): `vect` is the same object used inside `union`, which was already
# fitted on X_train; this refit on the same captions looks redundant, and the
# cell was executed out of order -- confirm whether it can be removed.
vect.fit(X_train["caption"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=None, min_df=4,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [49]:
# Number of features at or above the threshold; this count includes any
# metadata columns that pass, not just words.
np.count_nonzero(impt_feats)

86

In [53]:
# Start an empty frame; the "word" and "importance" columns are added in the following cells.
importance_df = pd.DataFrame()

In [54]:
# One row per important word: vocabulary terms selected by the word part of the
# mask (the two trailing metadata columns are excluded with [:-2]).
vocabulary_terms = np.asarray(vect.get_feature_names())
importance_df["word"] = vocabulary_terms[impt_feats[:-2]]

In [56]:
# Importance of each important word, aligned row-for-row with the "word" column.
# The two trailing metadata columns are sliced off *before* masking. The original
# masked first and then dropped the last two entries, which removes the metadata
# rows only when both metadata features pass the threshold; if either fell below
# it, real word importances would be dropped and the column length would no
# longer match the words selected with impt_feats[:-2].
importance_df["importance"] = gbr.feature_importances_[:-2][impt_feats[:-2]]

In [58]:
# Most important words first.
importance_df = importance_df.sort_values("importance", ascending=False)

In [59]:
# Importances of the two metadata features, read from the fitted model instead of
# being pasted in as literals (the pasted numbers go stale whenever the data or the
# model changes). The metadata columns are the last two columns of the feature
# matrix, in the order produced by get_brand_meta: n_posts, then n_followers.
# This must run while `gbr` is still the model fitted on the full feature matrix,
# i.e. before it is refit on the selected features further down.
meta_importance_df = pd.DataFrame(
    [
        ["n_followers", gbr.feature_importances_[-1]],
        ["n_posts", gbr.feature_importances_[-2]],
    ],
    columns=["word", "importance"],
)

In [66]:
# Metadata rows first, then the words in descending importance; show the first 20
# rows with a fresh 0..n-1 index.
pd.concat(
    [meta_importance_df, importance_df],
    axis=0,
    ignore_index=True,
).head(20)

Unnamed: 0,word,importance
0,n_followers,0.240859
1,n_posts,0.080333
2,meizu,0.028165
3,gioneee7,0.022344
4,gioneestargima,0.019738
5,shotononeplus,0.018512
6,mi,0.017792
7,xiaomipics,0.017215
8,htcgoesfullfrontal,0.017102
9,fullvision,0.016946


### Select only useful features and use all data in final fitting

In [67]:
from sklearn.feature_selection import SelectFromModel

In [68]:
# Reuse the already-fitted GBR (prefit=True) to choose which columns to keep.
# NOTE(review): no threshold is passed, so the selector's own default rule is
# used, not the notebook's `threshold` variable -- confirm this is intended.
model = SelectFromModel(estimator=gbr, prefit=True)
X_train_new, X_test_new = (
    model.transform(matrix) for matrix in (X_train_meta_dtm, X_test_meta_dtm)
)

In [69]:
# Recombine train and test rows (densified) so the final model is fitted on all
# available data; targets are concatenated in the same train-then-test order.
X_final = np.concatenate((X_train_new.toarray(), X_test_new.toarray()), axis=0)
y_final = np.hstack((y_train.values, y_test.values))

In [70]:
# Refit the same `gbr` object on the reduced feature set using all rows.
# NOTE(review): this overwrites the model that `model` (SelectFromModel with
# prefit=True) wraps, so calling model.transform again after this cell would use
# the refitted estimator's importances -- re-run the earlier cells first.
gbr.fit(X=X_final, y=np.log(y_final))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=1,
             subsample=1.0, verbose=0, warm_start=False)

In [71]:
# Predict on the test rows with the refitted model (log-likes space).
yhat_gbr_2 = gbr.predict(X=X_test_new)

In [72]:
# NOTE(review): at this point `gbr` has been fitted on X_final, which contains
# the test rows, so these are in-sample scores rather than held-out performance.
print("r2: %.3f" % r2_score(y_true=np.log(y_test), y_pred=yhat_gbr_2))
print("mape: %.3f" % mean_absolute_percentage_error(y_true=np.log(y_test), y_pred=yhat_gbr_2))

r2: 0.800
mape: 15.765
