In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the audience-engagement records from the CSV (path is relative to the working directory).
data_df = pd.read_csv(filepath_or_buffer='audience_engagement_data.csv')
# Show the first three rows as a quick check that the columns parsed as expected.
data_df.head(n=3)

Unnamed: 0,user_id,gender,age,location,likes,comments,shares,clicks,views,time_spent,device,referral_source,followers
0,1,Female,25-34,New York,100,20,10,50,500,30.0,Mobile,Instagram,1000.0
1,2,Male,35-44,Los Angeles,200,30,15,100,1000,45.0,Desktop,Google,2000.0
2,3,Female,18-24,Chicago,50,10,5,20,200,20.0,Mobile,Facebook,500.0


### PreProcessing

In [3]:
# Print each column's dtype and non-null count; a count below the number of
# entries means that column contains missing values.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   user_id          149 non-null    int64  
 1   gender           149 non-null    object 
 2   age              149 non-null    object 
 3   location         149 non-null    object 
 4   likes            149 non-null    int64  
 5   comments         149 non-null    int64  
 6   shares           149 non-null    int64  
 7   clicks           149 non-null    int64  
 8   views            149 non-null    int64  
 9   time_spent       148 non-null    float64
 10  device           148 non-null    object 
 11  referral_source  148 non-null    object 
 12  followers        148 non-null    float64
dtypes: float64(2), int64(6), object(5)
memory usage: 15.3+ KB


In [4]:
# Count the missing entries per column. This cell only reports them; the rows
# themselves are left untouched here.
missing_values = data_df.isna().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 user_id            0
gender             0
age                0
location           0
likes              0
comments           0
shares             0
clicks             0
views              0
time_spent         1
device             1
referral_source    1
followers          1
dtype: int64


In [5]:
# Drop, in place, every row that has at least one missing value.
data_df.dropna(axis=0, how='any', inplace=True)

In [6]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
# Nothing is removed in this cell; it only reports the number.
data_df.duplicated().sum()


8

In [7]:
# Remove duplicate records in place, keeping the first occurrence of each row.
data_df.drop_duplicates(keep='first', inplace=True)

In [8]:
# Predictors used later: 'gender', 'age', 'location', 'views' and 'time_spent'.
# Target (label): 'engagement_rate', defined as 'likes' / 'followers'.

# Feature engineering: derive the target column as the likes-to-followers ratio.
data_df['engagement_rate'] = data_df['likes'].div(data_df['followers'])
data_df.head(3)

Unnamed: 0,user_id,gender,age,location,likes,comments,shares,clicks,views,time_spent,device,referral_source,followers,engagement_rate
0,1,Female,25-34,New York,100,20,10,50,500,30.0,Mobile,Instagram,1000.0,0.1
1,2,Male,35-44,Los Angeles,200,30,15,100,1000,45.0,Desktop,Google,2000.0,0.1
2,3,Female,18-24,Chicago,50,10,5,20,200,20.0,Mobile,Facebook,500.0,0.1


In [9]:
# Separate features and label.
# Columns are picked by name instead of by position so the selection keeps
# working if the CSV gains, loses or reorders columns. The names and their order
# match what the positional indices [1, 2, 3, 8, 9] and 13 selected before.
features = data_df[['gender', 'age', 'location', 'views', 'time_spent']]
label = data_df['engagement_rate']

In [10]:
# Preview the selected predictor columns.
features.head()

Unnamed: 0,gender,age,location,views,time_spent
0,Female,25-34,New York,500,30.0
1,Male,35-44,Los Angeles,1000,45.0
2,Female,18-24,Chicago,200,20.0
3,Male,45-54,San Francisco,800,40.0
4,Female,25-34,Houston,400,25.0


In [11]:
# One-hot encode the string-valued columns so every predictor is numeric and
# usable by the regression model; numeric columns are carried over as they are.
finalFeatures = pd.get_dummies(data=features)

In [12]:
# Preview the encoded design matrix (numeric columns followed by the dummy columns).
finalFeatures.head()

Unnamed: 0,views,time_spent,gender_Female,gender_Male,age_18-24,age_25-34,age_35-44,age_45-54,location_Albuquerque,location_Anchorage,...,location_Shreveport,location_Sioux Falls,location_Spokane,location_Springfield,location_Tallahassee,location_Tampa,location_Tucson,location_Tulsa,location_Washington D.C.,location_Wichita
0,500,30.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000,45.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,200,20.0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,800,40.0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,400,25.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Try several split seeds and report those whose held-out score clears the threshold.
import warnings
warnings.filterwarnings("ignore")  # silences every warning from here on in this session

# Train/test splitter and the linear-regression estimator.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

CL = 0.9  # minimum acceptable score on the test split

# NOTE(review): picking a seed by its test score uses the test split for model
# selection - confirm this is intended.
for rs in range(1, 11):
    # Fresh 80/20 split for this candidate seed.
    X_train, X_test, y_train, y_test = train_test_split(
        finalFeatures, label, test_size=0.2, random_state=rs
    )

    model = LinearRegression().fit(X_train, y_train)

    trainScore = model.score(X_train, y_train)
    testScore = model.score(X_test, y_test)

    # Report only seeds where the test score is at least the train score and the threshold.
    if testScore >= trainScore and testScore >= CL:
        print("Test Score is {} and Train Score is {} in RS value {}".format(testScore,trainScore,rs))

Test Score is 1.0 and Train Score is 1.0 in RS value 1
Test Score is 1.0 and Train Score is 1.0 in RS value 2
Test Score is 1.0 and Train Score is 1.0 in RS value 3
Test Score is 1.0 and Train Score is 1.0 in RS value 4
Test Score is 1.0 and Train Score is 1.0 in RS value 5
Test Score is 1.0 and Train Score is 1.0 in RS value 6
Test Score is 1.0 and Train Score is 1.0 in RS value 7
Test Score is 1.0 and Train Score is 1.0 in RS value 8
Test Score is 1.0 and Train Score is 1.0 in RS value 9
Test Score is 1.0 and Train Score is 1.0 in RS value 10


In [14]:
# Every seed tried above reported a score of 1.0, so seed 10 is used for the final split.
X_train, X_test, y_train, y_test = train_test_split(
    finalFeatures, label, test_size=0.2, random_state=10
)

# Fit the final linear-regression model on the training portion.
model = LinearRegression().fit(X_train, y_train)

trainScore = model.score(X_train, y_train)
testScore = model.score(X_test, y_test)

print("Test Score is {} and Train Score is {}".format(testScore,trainScore))

Test Score is 1.0 and Train Score is 1.0


In [15]:
# Re-display the first rows of the encoded feature matrix for reference.
finalFeatures.head(3)

Unnamed: 0,views,time_spent,gender_Female,gender_Male,age_18-24,age_25-34,age_35-44,age_45-54,location_Albuquerque,location_Anchorage,...,location_Shreveport,location_Sioux Falls,location_Spokane,location_Springfield,location_Tallahassee,location_Tampa,location_Tucson,location_Tulsa,location_Washington D.C.,location_Wichita
0,500,30.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000,45.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,200,20.0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Predict the engagement rate for the held-out test rows and display the raw array.
y_pred = model.predict(X_test)
y_pred

array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
       0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
       0.1, 0.1])

In [17]:
from sklearn.metrics import mean_squared_error

# Evaluate the model: mean squared error between held-out labels and predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0


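The model looks OK, so the next step is to deploy it. Note, however, that every test prediction above is exactly 0.1 and the MSE is 0.0, which suggests `engagement_rate` is constant in this dataset; confirm that the target actually varies before relying on these scores.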

In [21]:
# Pack the trained model and the category encoder for deployment.
import pickle
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder before saving it: an unfitted encoder carries no category
# information and cannot transform anything after it is reloaded. It is fitted on
# the three categorical predictors in the same order as the dummy columns of
# `finalFeatures` (gender, age, location).
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# `sparse=False` is kept to match the rest of this notebook.
ohe = OneHotEncoder(sparse=False)
ohe.fit(features[['gender', 'age', 'location']])

# `with` guarantees the files are flushed and closed even if pickling fails.
with open('modelEnagementPredictor.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('stateEncoderDecoder.pkl', 'wb') as encoder_file:
    pickle.dump(ohe, encoder_file)


In [22]:
# Unpack the saved model and encoder.
# The files are opened with `with` so the handles are closed right after loading.
# Security: pickle.load can execute arbitrary code - only load files you created or trust.
with open('modelEnagementPredictor.pkl', 'rb') as model_file:
    modelEnagement = pickle.load(model_file)
with open('stateEncoderDecoder.pkl', 'rb') as encoder_file:
    ohe = pickle.load(encoder_file)

In [23]:
#Deploy test
#Input predictors: 'gender', 'age', 'location', 'views', 'time_spent'

def _prompt_number(prompt, cast):
    """Ask for a value until `cast` accepts it.

    prompt: text shown to the user.
    cast:   callable such as int or float that converts the typed string and
            raises ValueError when the text is not a valid number.
    Returns the converted value.
    """
    while True:
        try:
            return cast(input(prompt))
        except ValueError:
            # Previously the error was only printed and the cell finished, leaving
            # the variable undefined (or stale) for the cells that follow.
            print("Invalid Input Try again")


# Free-text categorical inputs; they must match categories seen during training.
gender = input("Enter Gender(Male/Female): ")
age = input("Enter Age: ")
location = input("Enter Location: ")

# Numeric inputs are re-requested until they parse.
views = _prompt_number("Enter Views: ", int)
timeSpent = _prompt_number("Enter Time Spent:", float)


Enter Gender(Male/Female): Female
Enter Age: 18-24
Enter Location: Chicago
Enter Views: 4000
Enter Time Spent:30


In [24]:
#Encoding using OHE
# Creates a fresh, unfitted encoder and rebinds `ohe`, replacing the object loaded
# from the pickle file; the cells below re-fit it once per categorical column.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)


In [25]:
# Encode the entered location: fit the encoder on the training 'location' column
# (position 2 of `features`), then turn the single user value into its one-hot row.
fLocation = ohe.fit_transform(features.iloc[:, [2]])
encodedLoc = ohe.transform(np.array([[location]]))
encodedLoc

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [26]:
# Encode the entered gender: re-fit the encoder on the training 'gender' column
# (position 0 of `features`), then turn the single user value into its one-hot row.
fGender = ohe.fit_transform(features.iloc[:, [0]])
encodedGen = ohe.transform(np.array([[gender]]))
encodedGen

array([[1., 0.]])

In [27]:
# Encode the entered age band: re-fit the encoder on the training 'age' column
# (position 1 of `features`), then turn the single user value into its one-hot row.
fAge = ohe.fit_transform(features.iloc[:, [1]])
encodedAge = ohe.transform(np.array([[age]]))
encodedAge

array([[1., 0., 0., 0.]])

In [28]:
# Build the single-row model input in the same column order as `finalFeatures`:
# views, time_spent, gender dummies, age dummies, location dummies.
finalPredictors = np.concatenate(
    [
        np.array([[views, timeSpent]]),
        encodedGen,
        encodedAge,
        encodedLoc,
    ],
    axis=1,
)
finalPredictors

array([[4.e+03, 3.e+01, 1.e+00, 0.e+00, 1.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 1.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00,
        0.e+00]])

In [34]:
# Run the reloaded model on the assembled input row and print the predicted
# engagement rate scaled to a percentage.
predictEngagement = modelEnagement.predict(finalPredictors)
print("Predicted Enagement is {}%".format(predictEngagement[0]*100))

Predicted Enagement is 10.0%
