In [1]:
import pandas as pd
pd.set_option('max_colwidth', 80)

### Merging multiple csvs

In [2]:
import glob
df = pd.DataFrame()
for file in glob.glob("*ig.csv"):
    brand = file.split("_ig.csv")[0]
    temp_df = pd.read_csv(file)
    if brand == "samsung":
        brand = "samsungmobile"
    temp_df["brand"] = brand
    df = pd.concat([df,temp_df], axis=0)

In [3]:
def handle_numbers(string):
    string = str(string).replace(",","")
    if "k" in string:
        if "." in string:
            return int(string.replace("k","00").replace(".",""))
        else:
            return int(string.replace("k","00"))
    if "m" in string:
        if "." in string:
            return int(string.replace("m","00000").replace(".",""))
        else:
            return int(string.replace("m","000000"))
    return int(float(string))
        

In [4]:
# only choose images
df = df[df.media=="image"]

In [5]:
df.likes = df.likes.apply(handle_numbers)

In [6]:
df = df[df.caption.notnull()]

### Feature Engineering

In [7]:
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

In [8]:
def get_text(df):
    return df.caption

In [9]:
get_text_ft = FunctionTransformer(get_text, validate=False)

Get page meta data

In [10]:
meta_df = pd.read_csv("brand_ig_data.csv")

In [11]:
meta_df = meta_df.loc[:, ["brand_ig","n_posts", "n_followers","n_following"]]

In [12]:
df.rename(index=str, columns={"brand": "brand_ig"}, inplace=True)

In [13]:
final_df = pd.merge(df,meta_df,how="left",on="brand_ig")

In [14]:
final_df["n_posts"] = final_df.n_posts.apply(handle_numbers)

In [15]:
final_df["n_followers"] = final_df.n_followers.apply(handle_numbers)

In [16]:
# Remove posts with zero likes (to take a log transform and calculate MAPE)
final_df = final_df[final_df.likes != 0]

In [17]:
def get_brand_meta(df):
    return df.loc[:,["n_posts", "n_followers"]]

In [18]:
get_brand_meta_ft = FunctionTransformer(get_brand_meta, validate=False)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english', max_df=0.3, min_df=4)

In [20]:
union = make_union(make_pipeline(get_text_ft, vect), get_brand_meta_ft)

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_df, final_df.likes, random_state=1)

In [22]:
X_train_meta_dtm = union.fit_transform(X_train)

In [23]:
X_test_meta_dtm = union.transform(X_test)

### Model Selection

In [24]:
import sklearn.linear_model
import sklearn.datasets
import sklearn.svm
from sklearn.metrics import r2_score
import sklearn.feature_extraction.text
import sklearn.utils.sparsefuncs

In [25]:
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred)/ y_true)) * 100

Standard LR

In [26]:
regression = sklearn.linear_model.LinearRegression()
regression.fit(X_train_meta_dtm, np.log(y_train))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
yhat_lr = regression.predict(X_test_meta_dtm)

In [28]:
print("r2: %.3f"%(r2_score(np.log(y_test),yhat_lr)))
print("mape: %.3f"%mean_absolute_percentage_error(np.log(y_test),yhat_lr))

r2: 0.667
mape: 20.414


lasso, L1 penalized regression

In [29]:
lasso = sklearn.linear_model.Lasso(alpha = 0.1, max_iter=3000)

In [30]:
lasso.fit(X_train_meta_dtm,np.log(y_train))

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=3000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [31]:
yhat_lasso = lasso.predict(X_test_meta_dtm)

In [32]:
print("r2: %.3f"%(r2_score(np.log(y_test),yhat_lasso)))
print("mape: %.3f"%mean_absolute_percentage_error(np.log(y_test),yhat_lasso))

r2: 0.322
mape: 32.316


Adaboost

In [33]:
from sklearn.ensemble import AdaBoostRegressor

In [34]:
ada = AdaBoostRegressor(base_estimator=None, n_estimators=200, learning_rate=1, loss="linear", random_state=1)

In [35]:
ada.fit(X_train_meta_dtm, np.log(y_train))

AdaBoostRegressor(base_estimator=None, learning_rate=1, loss='linear',
         n_estimators=200, random_state=1)

In [36]:
yhat_ada = ada.predict(X_test_meta_dtm)

In [37]:
print("r2: %.3f"%(r2_score(np.log(y_test),yhat_ada)))
print("mape: %.3f"%mean_absolute_percentage_error(np.log(y_test),yhat_ada))

r2: 0.639
mape: 21.122


Gradient Boosted Trees

In [38]:
from sklearn.ensemble import GradientBoostingRegressor

In [39]:
gbr = GradientBoostingRegressor(loss="ls", learning_rate=0.1, n_estimators=100, subsample=1.0, criterion="friedman_mse", min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=1, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort="auto")

In [40]:
gbr.fit(X_train_meta_dtm, np.log(y_train))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=1,
             subsample=1.0, verbose=0, warm_start=False)

In [41]:
yhat_gbr = gbr.predict(X_test_meta_dtm)

In [42]:
print("r2: %.3f"%(r2_score(np.log(y_test),yhat_gbr)))
print("mape: %.3f"%mean_absolute_percentage_error(np.log(y_test),yhat_gbr))

r2: 0.777
mape: 15.889


### Model Interpretation

In [43]:
threshold = 1.0e-03

In [44]:
# Ignore the long tail
gbr.feature_importances_[gbr.feature_importances_ >= threshold].sum()

0.9757797822387613

In [45]:
impt_feats = gbr.feature_importances_ >= threshold

In [46]:
impt_ind = np.where(impt_feats[:-2])[0]

In [47]:
np.array(vect.get_feature_names())[impt_feats[:-2]]

array(['777tour', 'acer', 'acerwin10', 'available', 'awesome',
       'badgalriri', 'bea1', 'bio', 'capture', 'capturedonp9', 'ces',
       'ces2014', 'coming', 'cute', 'digs', 'domore', 'elifee6',
       'featured', 'film', 'find5', 'flipkart', 'flyme', 'fullvision',
       'g2', 'gioneea1plus', 'gioneee7', 'gioneeshutterbugs',
       'gioneestargima', 'gioneesunburn', 'giveme5',
       'guildawardswithgionee', 'guys', 'gwatchr', 'hewlettpackard', 'hp',
       'htcgoesfullfrontal', 'htcone', 'htcrihanna', 'huaweimate10',
       'huaweip10', 'huaweip8', 'ifa15', 'introducing', 'keepasking',
       'lenovo', 'lgdailygram', 'lgg6', 'lgv30', 'lgv30sthinq', 'lounge',
       'makesmiles', 'meizu', 'mi', 'moto360', 'motox', 'motoxmade',
       'music', 'mymotox', 'mysuperg', 'new', 'nyc', 'oneplus5t',
       'oppofind5', 'oppon1', 'optimus', 'p7max', 'perfectshot',
       'pursuit', 'raghu', 'rai', 'redminote3', 'repost', 'selfieexpert',
       'selfieflash', 'selfiestan', 'share', 'shotonon

In [48]:
vect.fit(X_train.caption)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=None, min_df=4,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [49]:
gbr.feature_importances_[impt_feats].size

94

In [50]:
importance_df = pd.DataFrame()

In [51]:
importance_df["word"] = np.array(vect.get_feature_names())[impt_feats[:-2]]

In [52]:
importance_df["importance"] = gbr.feature_importances_[impt_feats][:-2]

In [53]:
importance_df = importance_df.sort_values(by="importance",ascending=False)

In [54]:
meta_importance_df = pd.DataFrame([["n_followers",0.24085896],["n_posts",0.08033263,]],columns=["word","importance"])

In [55]:
pd.concat([meta_importance_df,importance_df],axis = 0).reset_index(drop=True).head(20)

Unnamed: 0,word,importance
0,n_followers,0.240859
1,n_posts,0.080333
2,meizu,0.032434
3,gioneee7,0.023701
4,mi,0.020847
5,gioneestargima,0.020108
6,xiaomipics,0.019608
7,shotononeplus,0.018508
8,htcgoesfullfrontal,0.017214
9,moto360,0.016142


### Select only useful features and use all data in final fitting

In [56]:
from sklearn.feature_selection import SelectFromModel

In [57]:
model = SelectFromModel(gbr, prefit=True)
X_train_new = model.transform(X_train_meta_dtm)
X_test_new = model.transform(X_test_meta_dtm)

In [58]:
X_final = np.vstack([X_train_new.toarray(),X_test_new.toarray()])
y_final = np.concatenate([y_train.values, y_test.values])

In [59]:
gbr.fit(X_final, np.log(y_final))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=1,
             subsample=1.0, verbose=0, warm_start=False)

In [60]:
yhat_gbr_2 = gbr.predict(X_test_new)

In [61]:
print("r2: %.3f"%(r2_score(np.log(y_test),yhat_gbr_2)))
print("mape: %.3f"%mean_absolute_percentage_error(np.log(y_test),yhat_gbr_2))

r2: 0.780
mape: 15.812


### Pickle file for later use

In [62]:
import pickle

In [67]:
with open("instagram_model.pkl","wb") as f:
    pickle.dump(gbr, f)

### Unpickling

In [68]:
with open("instagram_model.pkl","rb") as f:
    gbr_insta = pickle.load(f)

In [69]:
gbr_insta.predict(X_test_new)

array([4.00077137, 4.02276018, 6.927449  , ..., 7.31613214, 9.26360863,
       9.25397389])

### Facebook Data

In [70]:
fb_df = pd.read_csv("fb_final.csv")

In [86]:
fb_df.page_followers = fb_df.page_followers.apply(lambda x: int(x.replace(",","")))

In [87]:
fb_df.page_likes = fb_df.page_likes.apply(lambda x: int(x.replace(",","")))

In [95]:
#only dropping 2 rows
fb_df = fb_df[fb_df.post_likes != 0]

In [71]:
fb_df

Unnamed: 0,brand,post_comments,created_time,post_likes,post_message,post_shares,post_type,page_likes,page_followers
0,Alcatel,0,2016-03-28T06:42:37+0000,2,"Sometimes you gotta stop to smell the flowers… and snap a pic. Take a break,...",1,photo,8742189,8742177
1,Alcatel,2,2016-03-17T06:24:16+0000,15,Enjoy super fast LTE CAT 6 speed and a long lasting 3600mAh battery with pow...,6,photo,8742189,8742177
2,Alcatel,1,2016-03-09T04:23:38+0000,41,"Dustproof, shockproof, waterproof smartwatch, ALCATEL GO Watch is ready for ...",2,photo,8742189,8742177
3,Alcatel,0,2016-02-19T08:26:34+0000,70,Check out the greatness of ALCATEL ONETOUCH Flash2 13megapixels camera at ma...,4,photo,8742189,8742177
4,Alcatel,0,2016-01-29T08:54:45+0000,2,ALCATEL ONETOUCH Recognized Twice in the 2015-2016 Global Top Brands.,1,link,8742189,8742177
5,Alcatel,3,2016-01-28T06:16:25+0000,22,Enter this code MONKEY18 at checkout to enjoy the discount. Get your ALCATEL...,5,photo,8742189,8742177
6,Alcatel,1,2016-01-26T10:28:39+0000,6,No excuse to miss Flash 2. We are giving $18 discount for each purchase. Hur...,1,photo,8742189,8742177
7,Alcatel,0,2016-01-19T09:52:15+0000,8,Just WATCH.\n\n#alcatelsg #smartwatch #fitnesstracker #nofilter #affordable ...,1,photo,8742189,8742177
8,Alcatel,0,2016-01-19T04:13:59+0000,10,"10 Reasons to love ALCATEL FLASH2!\n\nThank you, William Tan",2,link,8742189,8742177
9,Alcatel,0,2016-01-13T09:03:38+0000,11,Congratulations to all winners and thanks for participating.\n\n#alcatelsg ...,1,photo,8742189,8742177


In [72]:
def get_text_fb(df):
    return df.post_message

In [73]:
get_text_fb_ft = FunctionTransformer(get_text_fb, validate=False)

Get page meta data

In [155]:
def get_brand_meta_fb(df):
    return df.loc[:,["page_likes", "page_followers"]]

In [156]:
get_brand_meta_fb_ft = FunctionTransformer(get_brand_meta_fb, validate=False)

In [157]:
from sklearn.feature_extraction.text import CountVectorizer
vect_fb = CountVectorizer(stop_words='english', max_df=0.3, min_df=4)

In [158]:
union_fb = make_union(make_pipeline(get_text_fb_ft, vect_fb), get_brand_meta_fb_ft)

In [159]:
from sklearn.model_selection import train_test_split
X_train_fb, X_test_fb, y_train_fb, y_test_fb = train_test_split(fb_df, fb_df.post_likes, random_state=1)

In [160]:
X_train_meta_dtm_fb = union_fb.fit_transform(X_train_fb)

In [161]:
X_test_meta_dtm_fb = union_fb.transform(X_test_fb)

In [162]:
gbr_fb = GradientBoostingRegressor(loss="ls", learning_rate=0.1, n_estimators=100, subsample=1.0, criterion="friedman_mse", min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=1, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort="auto")

In [163]:
gbr_fb.fit(X_train_meta_dtm_fb, y_train_fb)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=1,
             subsample=1.0, verbose=0, warm_start=False)

In [164]:
yhat_gbr_fb = gbr_fb.predict(X_test_meta_dtm_fb)

In [165]:
print("r2: %.3f"%(r2_score(y_test_fb,yhat_gbr_fb)))
print("mape: %.3f"%mean_absolute_percentage_error(y_test_fb,yhat_gbr_fb))

r2: 0.244
mape: 2625.239


In [166]:
fb_df.post_likes.var()

329202269.5944702

In [167]:
final_df.likes.var()

126766738.09358193

In [168]:
fb_threshold = 1.0e-03

In [169]:
# Ignore the long tail
gbr_fb.feature_importances_[gbr_fb.feature_importances_ >= fb_threshold].sum()

0.9853994108953903

In [174]:
impt_feats_fb = gbr_fb.feature_importances_ >= threshold

In [175]:
impt_ind_fb = np.where(impt_feats_fb[:-2])[0]

In [176]:
np.array(vect_fb.get_feature_names())[impt_feats_fb[:-2]]

array(['1km4ufn', '26th', '2co90vz', '2qyxppy', '2ulvhbz', '2wn5fap',
       '4444', '4599', '4g', 'according', 'advanced', 'answers',
       'arrived', 'artist', 'attached', 'awesome', 'bad', 'big', 'bigger',
       'bit', 'brand', 'clue', 'colors', 'complete', 'confident',
       'daydream', 'deals', 'display', 'dowhatyoucant', 'equipped',
       'eraonflipkart', 'experience', 'exquisite', 'flagship', 'flying',
       'fullvision', 'galaxynote8', 'galaxys7', 'galaxys7edge',
       'galaxys8', 'generation', 'gold', 'grab', 'hand', 'headset',
       'holds', 'ideas', 'immersive', 'india', 'initiative', 'k10',
       'latest', 'lets', 'lg', 'lgg6', 'lgq6', 'lgv30', 'lgv30sthinq',
       'lgxseries', 'lightweight', 'like', 'looks', 'love', 'ly',
       'matter', 'mimix2s', 'movie', 'multimedia', 'multiple', 'mwc',
       'mwc2017', 'mwcshanghai', 'new', 'newly', 'night', 'onwards',
       'peak', 'phone', 'photos', 'pictures', 'point', 'possibilities',
       'processors', 'purchase', 'q

In [177]:
vect_fb.fit(X_train_fb.post_message)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=None, min_df=4,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [178]:
gbr_fb.feature_importances_[impt_feats_fb].size

117

In [179]:
gbr_fb.feature_importances_

array([0.00026316, 0.        , 0.        , ..., 0.00476728, 0.0155235 ,
       0.01935506])

In [180]:
importance_df_fb = pd.DataFrame()

In [181]:
importance_df_fb["word"] = np.array(vect_fb.get_feature_names())[impt_feats_fb[:-2]]

In [182]:
importance_df_fb["importance"] = gbr_fb.feature_importances_[impt_feats_fb][:-2]

In [183]:
importance_df_fb = importance_df_fb.sort_values(by="importance",ascending=False)

In [184]:
meta_importance_df_fb = pd.DataFrame([["page_likes",0.00352126],["page_followers",0.01468663]],columns=["word","importance"])

In [130]:
pd.concat([meta_importance_df_fb,importance_df_fb],axis = 0).reset_index(drop=True).head(20)

Unnamed: 0,word,importance
0,page_likes,0.015524
1,page_followers,0.019355
2,s8smartswitch,0.082005
3,arrived,0.057717
4,galaxys8,0.031428
5,lgq6,0.027342
6,x10,0.024414
7,lgv30sthinq,0.024231
8,lgg6,0.023181
9,lg,0.023173


In [153]:
pd.concat([meta_importance_df_fb,importance_df_fb],axis = 0).reset_index(drop=True).head(20)

Unnamed: 0,word,importance
0,page_likes,0.003521
1,page_followers,0.014687
2,post_shares,0.179242
3,s8smartswitch,0.082005
4,arrived,0.057717
5,galaxys8,0.031428
6,lgq6,0.027342
7,x10,0.024414
8,lgv30sthinq,0.024231
9,lgg6,0.023181


In [185]:
with open("facebook_model.pkl","wb") as f:
    pickle.dump(gbr_fb, f)