### Feature Engineering

#### Import Libraries

In [3]:
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import OneHotEncoder, StandardScaler


#### Load dataset

In [13]:
# Read the pre-processed CSV into the working frame `df`.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a saved
# index being re-read as data -- consider index_col=0 (or dropping it) before modelling.
DATA_PATH = 'data/processed/pre_processed_data.csv'
df = pd.read_csv(DATA_PATH)

# Report (rows, columns), then render the first rows as a rich table.
n_rows_cols = df.shape
print(f"Dataset Shape : {n_rows_cols}")
display(df.head())


Dataset Shape : (5000, 17)


Unnamed: 0,Post_Date,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Month,DayOfWeek,Is_Weekend,Likes_Per_View,Shares_Per_View,Comments_Per_View,Engagement_Rate
0,2022-01-13,TikTok,#Challenge,Video,UK,4163464,339431,53135,19346,High,1,3,0,0.081526,0.012762,0.004647,9.893493
1,2022-05-13,Instagram,#Education,Shorts,India,4155940,215240,65860,27239,Medium,5,4,0,0.051791,0.015847,0.006554,7.419236
2,2022-01-07,Twitter,#Challenge,Video,Brazil,3666211,327143,39423,36223,Medium,1,4,0,0.089232,0.010753,0.00988,10.98652
3,2022-12-05,YouTube,#Education,Shorts,Australia,917951,127125,11687,36806,Low,12,0,0,0.138488,0.012732,0.040096,19.131522
4,2023-03-23,TikTok,#Dance,Post,Brazil,64866,171361,69581,6376,Medium,3,3,0,2.641769,1.072688,0.098295,381.275244


#### Date Features

In [14]:
# Convert Post_Date to pandas datetimes so the `.dt` accessor can be used on it.
# When the column is absent the conversion is skipped without raising.
has_post_date = 'Post_Date' in df.columns
if has_post_date:
    parsed_dates = pd.to_datetime(df['Post_Date'])
    df['Post_Date'] = parsed_dates
    print("\nConverted Post_Date to datetime objects.")


Converted Post_Date to datetime objects.


In [17]:
# Calendar features derived from Post_Date (requires a datetime64 column, i.e. the
# conversion cell must have run first).
df['Month'] = df['Post_Date'].dt.month
df['DayOfWeek'] = df['Post_Date'].dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
# Weekend flag computed as a single vectorised comparison instead of a per-row lambda:
# Saturday (5) and Sunday (6) -> 1, every other day -> 0. The explicit int64 cast keeps
# the same dtype the row-wise version produced.
df['Is_Weekend'] = (df['DayOfWeek'] >= 5).astype('int64')
print("Added Date Features: Month, DayOfWeek, Is_Weekend")

Added Date Features: Month, DayOfWeek, Is_Weekend


#### Interaction Features


In [18]:
# Per-view interaction ratios.
# Views is expected to be > 0, but a zero would turn the plain division into +/-inf
# (or NaN for 0/0), which silently corrupts later quantile binning and scaling.
# Non-positive view counts are therefore masked to NaN so the affected rows get NaN
# ratios that are easy to detect and handle. Rows with Views > 0 are unaffected.
valid_views = df['Views'].where(df['Views'] > 0)

df['Likes_Per_View'] = df['Likes'] / valid_views
df['Shares_Per_View'] = df['Shares'] / valid_views
df['Comments_Per_View'] = df['Comments'] / valid_views
# Total interactions per view, expressed as a percentage.
df['Engagement_Rate'] = ((df['Likes'] + df['Shares'] + df['Comments']) / valid_views) * 100
print("Added Interaction Features: Likes_Per_View, Shares_Per_View, Comments_Per_View, Engagement_Rate")


Added Interaction Features: Likes_Per_View, Shares_Per_View, Comments_Per_View, Engagement_Rate


#### Categorical Encoding (One-Hot Encoding)

In [19]:
# Nominal columns have no natural order, so each is expanded into indicator columns
# rather than integer codes. drop_first=True omits one level per variable, leaving it
# as the implicit baseline.
categorical_cols = ['Platform', 'Hashtag', 'Content_Type', 'Region']
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)
encoded_shape = df_encoded.shape
print(f"\nDataframe shape after One-Hot Encoding: {encoded_shape}")



Dataframe shape after One-Hot Encoding: (5000, 37)


#### Encode Target Variable for Classification (Engagement_Level)

In [20]:

# Build the classification target from Engagement_Rate terciles rather than from the
# original Engagement_Level labels: 0 = lowest third, 1 = middle third, 2 = top third.
# pd.qcut returns a categorical column whose categories are the integer labels.
# NOTE(review): Engagement_Rate (and the per-view ratios it is built from) is still a
# column of df_encoded -- if it is kept as a model feature it determines this target
# exactly; confirm it is excluded downstream.
tercile_labels = [0, 1, 2]
engagement_terciles = pd.qcut(df_encoded['Engagement_Rate'], q=3, labels=tercile_labels)
df_encoded['Engagement_Level_Encoded'] = engagement_terciles
print("Redefined Engagement_Level based on Engagement_Rate quantiles (0: Low, 1: Medium, 2: High).")

Redefined Engagement_Level based on Engagement_Rate quantiles (0: Low, 1: Medium, 2: High).
