In [None]:
import pandas as pd
pd.set_option('max_colwidth', 80)

### Merging multiple CSVs

In [None]:
import glob
# Stack every per-brand export named "<brand>_ig.csv" into a single frame,
# tagging each row with the brand taken from the file name.
ig_suffix = "_ig.csv"
brand_frames = []
# sorted(): glob returns matches in arbitrary, OS-dependent order, and the row
# order determines which rows the seeded train/test split picks later on.
# The pattern requires the full "_ig.csv" suffix so the brand can always be
# recovered by stripping it.
for file in sorted(glob.glob("*" + ig_suffix)):
    brand = file[: -len(ig_suffix)]
    if brand == "samsung":
        # NOTE(review): presumably renamed to match the brand_ig key used in
        # brand_ig_data.csv -- confirm against that file.
        brand = "samsungmobile"
    temp_df = pd.read_csv(file)
    temp_df["brand"] = brand
    brand_frames.append(temp_df)
# Concatenate once instead of re-copying a growing frame on every iteration.
# With no matching files the result stays an empty DataFrame, as before.
df = pd.concat(brand_frames, axis=0) if brand_frames else pd.DataFrame()

In [None]:
def handle_numbers(string):
    """Convert a scraped count such as "1,234", "5k" or "1.2m" to an int.

    Parameters
    ----------
    string : str or number
        The raw value. It is stringified first, so values that are already
        numeric pass through unchanged (apart from float truncation).

    Returns
    -------
    int
        The count. A trailing "k" multiplies by 1,000 and a trailing "m" by
        1,000,000 (case-insensitive). Values without a suffix are truncated
        towards zero, e.g. "12.7" -> 12.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed (including "nan" and "").
    """
    multipliers = {"k": 1000, "m": 1000000}
    text = str(string).replace(",", "").strip().lower()
    if text and text[-1] in multipliers:
        # Scale the parsed number instead of splicing zeros into the string:
        # the string trick only worked for exactly one decimal digit and
        # turned "5k" into 500.
        # round() rather than int(): a float product such as 1.1 * 1000 can
        # land a hair off the intended whole number.
        return int(round(float(text[:-1]) * multipliers[text[-1]]))
    return int(float(text))
        

In [None]:
# only choose images
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells (e.g. df.likes = ...) do not trigger pandas'
# SettingWithCopyWarning.
df = df[df.media=="image"].copy()

In [None]:
df.likes = df.likes.apply(handle_numbers)

In [None]:
df = df[df.caption.notnull()]

### Feature Engineering

In [None]:
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

In [None]:
def get_text(df):
    """Pick out the caption column of ``df``.

    Wrapped in a FunctionTransformer so the text branch of the feature union
    receives only the captions.
    """
    captions = df.caption
    return captions

In [None]:
get_text_ft = FunctionTransformer(get_text, validate=False)

Get page metadata

In [None]:
meta_df = pd.read_csv("brand_ig_data.csv")

In [None]:
meta_df = meta_df.loc[:, ["brand_ig","n_posts", "n_followers","n_following"]]

In [None]:
df.rename(index=str, columns={"brand": "brand_ig"}, inplace=True)

In [None]:
final_df = pd.merge(df,meta_df,how="left",on="brand_ig")

In [None]:
final_df["n_posts"] = final_df.n_posts.apply(handle_numbers)

In [None]:
final_df["n_followers"] = final_df.n_followers.apply(handle_numbers)

In [None]:
# Remove posts with zero likes (to take a log transform and calculate MAPE)
final_df = final_df[final_df.likes != 0]

In [None]:
def get_brand_meta(df):
    """Return the brand-level numeric columns of ``df``.

    The result has all rows of ``df`` and exactly two columns, in this order:
    ``n_posts`` then ``n_followers``.
    """
    meta_columns = ["n_posts", "n_followers"]
    return df.loc[:, meta_columns]

In [None]:
get_brand_meta_ft = FunctionTransformer(get_brand_meta, validate=False)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english', max_df=0.3, min_df=4)

In [None]:
union = make_union(make_pipeline(get_text_ft, vect), get_brand_meta_ft)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_df, final_df.likes, random_state=1)

In [None]:
X_train_meta_dtm = union.fit_transform(X_train)

In [None]:
X_test_meta_dtm = union.transform(X_test)

### Model Selection

In [None]:
import sklearn.linear_model
import sklearn.datasets
import sklearn.svm
from sklearn.metrics import r2_score
import sklearn.feature_extraction.text
import sklearn.utils.sparsefuncs

In [None]:
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error between two equally shaped sequences.

    Each error is taken relative to the true value, the absolute values are
    averaged, and the result is scaled to a percentage. There is no guard
    against zeros in ``y_true``; they go straight into the division.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

Standard linear regression

In [None]:
regression = sklearn.linear_model.LinearRegression()
regression.fit(X_train_meta_dtm, np.log(y_train))

In [None]:
yhat_lr = regression.predict(X_test_meta_dtm)

In [None]:
print("r2: %.3f"%(r2_score(np.log(y_test),yhat_lr)))
print("mape: %.3f"%mean_absolute_percentage_error(np.log(y_test),yhat_lr))

Lasso (L1-penalized regression)

In [None]:
lasso = sklearn.linear_model.Lasso(alpha = 0.1, max_iter=3000)

In [None]:
lasso.fit(X_train_meta_dtm,np.log(y_train))

In [None]:
yhat_lasso = lasso.predict(X_test_meta_dtm)

In [None]:
print("r2: %.3f"%(r2_score(np.log(y_test),yhat_lasso)))
print("mape: %.3f"%mean_absolute_percentage_error(np.log(y_test),yhat_lasso))

AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
ada = AdaBoostRegressor(base_estimator=None, n_estimators=200, learning_rate=1, loss="linear", random_state=1)

In [None]:
ada.fit(X_train_meta_dtm, np.log(y_train))

In [None]:
yhat_ada = ada.predict(X_test_meta_dtm)

In [None]:
print("r2: %.3f"%(r2_score(np.log(y_test),yhat_ada)))
print("mape: %.3f"%mean_absolute_percentage_error(np.log(y_test),yhat_ada))

Gradient Boosted Trees

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
gbr = GradientBoostingRegressor(loss="ls", learning_rate=0.1, n_estimators=100, subsample=1.0, criterion="friedman_mse", min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=1, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort="auto")

In [None]:
gbr.fit(X_train_meta_dtm, np.log(y_train))

In [None]:
yhat_gbr = gbr.predict(X_test_meta_dtm)

In [None]:
print("r2: %.3f"%(r2_score(np.log(y_test),yhat_gbr)))
print("mape: %.3f"%mean_absolute_percentage_error(np.log(y_test),yhat_gbr))

### Model Interpretation

In [None]:
threshold = 1.0e-03

In [None]:
# Ignore the long tail
gbr.feature_importances_[gbr.feature_importances_ >= threshold].sum()

In [None]:
impt_feats = gbr.feature_importances_ >= threshold

In [None]:
impt_ind = np.where(impt_feats[:-2])[0]

In [None]:
np.array(vect.get_feature_names())[impt_feats[:-2]]

In [None]:
vect.fit(X_train.caption)

In [None]:
gbr.feature_importances_[impt_feats].size

In [None]:
importance_df = pd.DataFrame()

In [None]:
importance_df["word"] = np.array(vect.get_feature_names())[impt_feats[:-2]]

In [None]:
# The last two model features are the brand meta columns (the union appends
# them after the vocabulary). Drop them BEFORE masking so each importance lines
# up with the word selected by impt_feats[:-2]; masking first and then slicing
# only works when both meta features happen to clear the threshold.
importance_df["importance"] = gbr.feature_importances_[:-2][impt_feats[:-2]]

In [None]:
importance_df = importance_df.sort_values(by="importance",ascending=False)

In [None]:
# Read the meta-feature importances from the fitted model instead of pasting
# numbers from an earlier run. The union appends the get_brand_meta columns
# after the vocabulary, in the order n_posts, n_followers.
n_posts_importance, n_followers_importance = gbr.feature_importances_[-2:]
meta_importance_df = pd.DataFrame([["n_followers", n_followers_importance], ["n_posts", n_posts_importance]], columns=["word","importance"])

In [None]:
pd.concat([meta_importance_df,importance_df],axis = 0).reset_index(drop=True).head(20)

### Select only useful features and use all data in final fitting

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
# Keep only the features the fitted GBR considers important. This cell has to
# run: the following cells build X_final from X_train_new / X_test_new and
# would otherwise fail with a NameError on a fresh kernel.
# NOTE(review): with prefit=True the selector wraps the same `gbr` object that
# is refit further down, so do not call model.transform again after that refit.
model = SelectFromModel(gbr, prefit=True)
X_train_new = model.transform(X_train_meta_dtm)
X_test_new = model.transform(X_test_meta_dtm)

In [None]:
X_final = np.vstack([X_train_new.toarray(),X_test_new.toarray()])
y_final = np.concatenate([y_train.values, y_test.values])

In [None]:
gbr.fit(X_final, np.log(y_final))

In [None]:
yhat_gbr_2 = gbr.predict(X_test_new)

In [None]:
print("r2: %.3f"%(r2_score(np.log(y_test),yhat_gbr_2)))
print("mape: %.3f"%mean_absolute_percentage_error(np.log(y_test),yhat_gbr_2))

### Creating a new entry

In [None]:
final_df.iloc[600]

In [None]:
 test_caption = final_df.iloc[600].caption

In [None]:
def construct_test_df(test_caption, brand_posts, brand_followers):
    """Build a one-row frame shaped like the model input for a new post.

    The columns are ``caption``, ``n_posts`` and ``n_followers``, filled from
    the three arguments in that order.
    """
    column_names = ["caption", "n_posts", "n_followers"]
    single_row = [test_caption, brand_posts, brand_followers]
    return pd.DataFrame([single_row], columns=column_names)

In [None]:
construct_test_df(test_caption,152,4700000)

In [None]:
test_caption = 'Moments are the best souvenirs you can collect. #GalaxyS9plus 📷: @inescostamonteiro'

In [None]:
test1 = union.transform(construct_test_df(test_caption,274,4200000))

In [None]:
# gbr is fit on np.log(likes), so the value shown here is on the log scale.
# NOTE(review): gbr was refit above on X_final (built from X_train_new /
# X_test_new), while test1 comes straight from union.transform -- confirm both
# have the same number of columns.
gbr.predict(test1)



### Pickle file for later use

In [1]:
import pickle

In [None]:
with open("instagram_model.pkl","wb") as f:
    pickle.dump(gbr, f)

### Unpickling

In [None]:
with open("instagram_model.pkl","rb") as f:
    gbr_insta = pickle.load(f)

In [None]:
gbr_insta.predict(X_test_new)

### Facebook Data

In [None]:
fb_df = pd.read_csv("fb_final.csv")

In [None]:
# Parse the comma-formatted counts with the same helper used for the Instagram
# columns. handle_numbers stringifies its input first, so re-running this cell
# on the already-converted int column no longer fails.
fb_df["page_followers"] = fb_df.page_followers.apply(handle_numbers)

In [None]:
# Parse the comma-formatted counts with the same helper used for the Instagram
# columns. handle_numbers stringifies its input first, so re-running this cell
# on the already-converted int column no longer fails.
fb_df["page_likes"] = fb_df.page_likes.apply(handle_numbers)

In [None]:
#only dropping 2 rows
fb_df = fb_df[fb_df.post_likes != 0]

In [None]:
fb_df

In [None]:
def get_text_fb(df):
    """Pick out the post_message column of ``df``.

    Wrapped in a FunctionTransformer so the text branch of the Facebook
    feature union receives only the post messages.
    """
    messages = df.post_message
    return messages

In [None]:
get_text_fb_ft = FunctionTransformer(get_text_fb, validate=False)

Get page metadata

In [None]:
def get_brand_meta_fb(df):
    """Return the page-level numeric columns of ``df``.

    The result has all rows of ``df`` and exactly two columns, in this order:
    ``page_likes`` then ``page_followers``.
    """
    page_columns = ["page_likes", "page_followers"]
    return df.loc[:, page_columns]

In [None]:
get_brand_meta_fb_ft = FunctionTransformer(get_brand_meta_fb, validate=False)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vect_fb = CountVectorizer(stop_words='english', max_df=0.3, min_df=4)

In [None]:
union_fb = make_union(make_pipeline(get_text_fb_ft, vect_fb), get_brand_meta_fb_ft)

In [None]:
from sklearn.model_selection import train_test_split
X_train_fb, X_test_fb, y_train_fb, y_test_fb = train_test_split(fb_df, fb_df.post_likes, random_state=1)

In [None]:
X_train_meta_dtm_fb = union_fb.fit_transform(X_train_fb)

In [None]:
X_test_meta_dtm_fb = union_fb.transform(X_test_fb)

In [None]:
gbr_fb = GradientBoostingRegressor(loss="ls", learning_rate=0.1, n_estimators=100, subsample=1.0, criterion="friedman_mse", min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=1, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort="auto")

In [None]:
gbr_fb.fit(X_train_meta_dtm_fb, y_train_fb)

In [None]:
yhat_gbr_fb = gbr_fb.predict(X_test_meta_dtm_fb)

In [None]:
print("r2: %.3f"%(r2_score(y_test_fb,yhat_gbr_fb)))
print("mape: %.3f"%mean_absolute_percentage_error(y_test_fb,yhat_gbr_fb))

In [None]:
fb_df.post_likes.var()

In [None]:
final_df.likes.var()

In [None]:
fb_threshold = 1.0e-03

In [None]:
# Ignore the long tail
gbr_fb.feature_importances_[gbr_fb.feature_importances_ >= fb_threshold].sum()

In [None]:
# Use the Facebook cutoff (fb_threshold), not the Instagram `threshold`; the
# two are equal today, but this mask must follow fb_threshold if it is tuned.
impt_feats_fb = gbr_fb.feature_importances_ >= fb_threshold

In [None]:
impt_ind_fb = np.where(impt_feats_fb[:-2])[0]

In [None]:
np.array(vect_fb.get_feature_names())[impt_feats_fb[:-2]]

In [None]:
vect_fb.fit(X_train_fb.post_message)

In [None]:
gbr_fb.feature_importances_[impt_feats_fb].size

In [None]:
gbr_fb.feature_importances_

In [None]:
importance_df_fb = pd.DataFrame()

In [None]:
importance_df_fb["word"] = np.array(vect_fb.get_feature_names())[impt_feats_fb[:-2]]

In [None]:
# The last two model features are the page meta columns (the union appends
# them after the vocabulary). Drop them BEFORE masking so each importance lines
# up with the word selected by impt_feats_fb[:-2]; masking first and then
# slicing only works when both meta features happen to clear the threshold.
importance_df_fb["importance"] = gbr_fb.feature_importances_[:-2][impt_feats_fb[:-2]]

In [None]:
importance_df_fb = importance_df_fb.sort_values(by="importance",ascending=False)

In [None]:
# Read the meta-feature importances from the fitted model instead of pasting
# numbers from an earlier run. The union appends the get_brand_meta_fb columns
# after the vocabulary, in the order page_likes, page_followers.
page_likes_importance, page_followers_importance = gbr_fb.feature_importances_[-2:]
meta_importance_df_fb = pd.DataFrame([["page_likes", page_likes_importance], ["page_followers", page_followers_importance]], columns=["word","importance"])

In [None]:
pd.concat([meta_importance_df_fb,importance_df_fb],axis = 0).reset_index(drop=True).head(20)

In [None]:
pd.concat([meta_importance_df_fb,importance_df_fb],axis = 0).reset_index(drop=True).head(20)

In [None]:
with open("facebook_model.pkl","wb") as f:
    pickle.dump(gbr_fb, f)