# Facebook Cosmetics Page Interactions

In [1]:
%matplotlib notebook

import os
import json
import warnings
import time
import pickle
import requests
import zipfile


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

# Get data
Only run this section if you haven't downloaded the data yet.

In [102]:
# Download location of the zipped dataset (Facebook_metrics.zip) fetched by fetch_data()
URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/00368/Facebook_metrics.zip"

def fetch_data(fname='Facebook_metrics.zip', force=False):
    """
    Helper method to retrieve the ML Repository dataset.

    Parameters
    ----------
    fname : str
        Path the downloaded archive is written to.
    force : bool
        Download again even when ``fname`` already exists on disk.

    Returns
    -------
    str
        Absolute path of the archive on disk.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    """
    outpath = os.path.abspath(fname)

    # Skip the network round-trip when an earlier run already saved the archive.
    if os.path.isfile(outpath) and not force:
        return outpath

    response = requests.get(URL, timeout=60)
    # Fail here instead of saving an HTML error page under a .zip name,
    # which would only surface later as a confusing BadZipFile error.
    response.raise_for_status()

    # Write under a temporary name first so an interrupted write never leaves
    # a truncated file that a later run would mistake for the full archive.
    partial = outpath + '.part'
    with open(partial, 'wb') as f:
        f.write(response.content)
    os.replace(partial, outpath)

    return outpath

# Obtain the dataset archive; DATA holds its absolute local path and is
# consumed by the unzip step that follows.
DATA = fetch_data()

In [103]:
# Unpack every member of the downloaded archive into the current working directory
with zipfile.ZipFile(DATA, mode='r') as archive:
    archive.extractall()

# Data exploration
If you already have the data, start here.

In [7]:
# The extracted file is dataset_Facebook.csv and is semicolon-delimited.
# The separator is passed by keyword: every read_csv argument after the path
# is keyword-only in pandas 2.0+, so the positional form raises TypeError there.
df = pd.read_csv('dataset_Facebook.csv', sep=';')

In [124]:
# List the column labels of the loaded frame
df.columns

Index(['Page total likes', 'Type', 'Category', 'Post Month', 'Post Weekday',
       'Post Hour', 'Paid', 'Lifetime Post Total Reach',
       'Lifetime Post Total Impressions', 'Lifetime Engaged Users',
       'Lifetime Post Consumers', 'Lifetime Post Consumptions',
       'Lifetime Post Impressions by people who have liked your Page',
       'Lifetime Post reach by people who like your Page',
       'Lifetime People who have liked your Page and engaged with your post',
       'comment', 'like', 'share', 'Total Interactions'],
      dtype='object')

In [125]:
# (number of rows, number of columns)
df.shape

(500, 19)

In [126]:
# Preview the first rows to see what each column holds
df.head()

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
0,139441,Photo,2,12,4,3,0.0,2752,5091,178,109,159,3078,1640,119,4,79.0,17.0,100
1,139441,Status,2,12,3,10,0.0,10460,19057,1457,1361,1674,11710,6112,1108,5,130.0,29.0,164
2,139441,Photo,3,12,3,3,0.0,2413,4373,177,113,154,2812,1503,132,0,66.0,14.0,80
3,139441,Photo,2,12,2,10,1.0,50128,87991,2211,790,1119,61027,32048,1386,58,1572.0,147.0,1777
4,139441,Photo,2,12,2,3,0.0,7244,13594,671,410,580,6228,3200,396,19,325.0,49.0,393


In [127]:
# Count the rows for each distinct value of 'Type'
df.groupby('Type')['Type'].count()

Type
Link       22
Photo     426
Status     45
Video       7
Name: Type, dtype: int64

In [18]:
# Frequency table: how many rows carry each distinct 'comment' value.
# This is not itself a null check; the commented-out line below is the
# null count for the 'Paid' column.
df.groupby('comment')['comment'].count()
#df['Paid'].isna().sum()

comment
0      106
1       62
2       71
3       36
4       44
5       20
6       26
7       20
8        8
9       15
10      11
11       8
12       7
13       4
14       3
15       2
16       5
17       4
18       6
19       3
20       5
21       1
22       2
23       1
24       2
25       3
26       2
29       1
30       2
33       2
36       2
37       1
38       1
41       1
42       1
45       2
47       1
51       1
56       1
58       1
60       1
64       1
103      1
144      1
146      1
372      1
Name: comment, dtype: int64

# Fill Nulls and One-Hot Encoding

In [8]:
# Replace missing 'Paid' entries with 0
df['Paid'] = df['Paid'].fillna(0)

In [9]:
# One-hot encode the categorical 'Type' column.
# The guard makes the cell safe to re-run: once 'Type' has been replaced by
# its indicator columns there is nothing left to encode, and an unguarded
# second run would raise KeyError on the missing column.
if 'Type' in df.columns:
    # dtype is pinned so the indicators are 0/1 integers on every pandas
    # version (the default changed from uint8 to bool in pandas 2.0).
    one_hot = pd.get_dummies(df['Type'], dtype='uint8')
    # Swap the original column for its indicator columns (appended on the right)
    df = df.drop('Type', axis=1).join(one_hot)
df.head(2)

Unnamed: 0,Page total likes,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,...,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions,Link,Photo,Status,Video
0,139441,2,12,4,3,0.0,2752,5091,178,109,...,1640,119,4,79.0,17.0,100,0,1,0,0
1,139441,2,12,3,10,0.0,10460,19057,1457,1361,...,6112,1108,5,130.0,29.0,164,0,0,1,0


# Model

In [24]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

#use later
from sklearn.model_selection import cross_val_score

# Ridge Regression

In [34]:
# Predictors: the one-hot 'Type' indicators plus the posting metadata.
# All other columns are excluded for now.
feature_cols = ['Link', 'Photo', 'Status', 'Video', 'Category', 'Post Month', 'Post Weekday', 'Post Hour', 'Paid']
X = df[feature_cols]

# Target: the 'comment' column
y = df['comment']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a ridge regressor on the training split
rr = Ridge(alpha=0.01)
rr.fit(X_train, y_train)

# Training-set RMSE and R^2
pred_train_rr = rr.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, pred_train_rr))
train_r2 = r2_score(y_train, pred_train_rr)
print(train_rmse)
print(train_r2)


23.17676635502205
0.013937177726504713


In [29]:
# Test-set RMSE and R^2 for the fitted ridge model
pred_test_rr = rr.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, pred_test_rr))
test_r2 = r2_score(y_test, pred_test_rr)
print(test_rmse)
print(test_r2)

7.611782471497953
-0.12071616130099794
