<a href="https://colab.research.google.com/github/mohitDhami87/YouTube_Shorts_Performance_Prediction_Case_Study/blob/main/YouTube_Shorts_Performance_Prediction_CaseStudy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **YouTube Shorts Performance Prediction Case Study**


---



The core challenge of this case study is to leverage Supervised Machine Learning to predict
the potential performance (specifically, the Engagement Rate tertile: Low, Medium, or High) of a
YouTube Short based on its intrinsic features (title, duration, category) and its publishing
behavior (upload hour). The final goal is to develop a reliable predictive model and deliver
actionable content strategy recommendations to maximize viral potential and channel
growth.


---



In [None]:
# Gdrive dataset link
# https://drive.google.com/file/d/1e4_e0JfuOdBPQodG_jaOvCJnEYMK4YNs/view?usp=drive_link

In [2]:
# Importing Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Loading dataset

dataset_url_path = 'https://drive.google.com/uc?export=download&id=1e4_e0JfuOdBPQodG_jaOvCJnEYMK4YNs'
df = pd.read_csv(dataset_url_path)
df.head()

Unnamed: 0,video_id,title,duration_sec,hashtags_count,views,likes,comments,shares,upload_hour,category
0,vid_1000,Short Video #0,43,9,198775,21933,3228,400,8,Tech
1,vid_1001,Short Video #1,56,2,290336,20063,3719,1942,16,Comedy
2,vid_1002,Short Video #2,33,6,264206,37032,3228,1817,7,Food
3,vid_1003,Short Video #3,19,9,85076,27269,2371,980,1,Lifestyle
4,vid_1004,Short Video #4,47,8,90780,8041,2891,1109,23,Tech


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   video_id        300 non-null    object
 1   title           300 non-null    object
 2   duration_sec    300 non-null    int64 
 3   hashtags_count  300 non-null    int64 
 4   views           300 non-null    int64 
 5   likes           300 non-null    int64 
 6   comments        300 non-null    int64 
 7   shares          300 non-null    int64 
 8   upload_hour     300 non-null    int64 
 9   category        300 non-null    object
dtypes: int64(7), object(3)
memory usage: 23.6+ KB


In [5]:
df.describe()

Unnamed: 0,duration_sec,hashtags_count,views,likes,comments,shares,upload_hour
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,31.73,4.343333,245058.166667,23063.126667,2638.296667,999.576667,11.543333
std,16.042912,2.956562,141338.982093,14597.45822,1416.827287,564.410105,6.814414
min,5.0,0.0,1404.0,109.0,40.0,2.0,0.0
25%,18.0,2.0,129620.5,10342.0,1445.75,516.25,6.0
50%,32.0,4.0,255962.0,21779.5,2863.5,988.0,12.0
75%,45.0,7.0,356805.0,36706.5,3761.0,1463.75,17.0
max,59.0,9.0,499401.0,49923.0,4971.0,1998.0,23.0


In [10]:
df.isnull().any()

Unnamed: 0,0
video_id,False
title,False
duration_sec,False
hashtags_count,False
views,False
likes,False
comments,False
shares,False
upload_hour,False
category,False


In [11]:
# Checking shape of dataframe
print(f'Number of rows in training dataset: {df.shape[0]}')
print(f'Number of columns in training dataset: {df.shape[1]}')

Number of rows in training dataset: 300
Number of columns in training dataset: 10


In [12]:
categorical_cols = df.select_dtypes(include='object').columns
categorical_cols

Index(['video_id', 'title', 'category'], dtype='object')

In [13]:
def classify_columns(data):
    # Initialize lists for categorical and numerical columns
    cat_col = []
    num_col = []

    # Iterate through columns in the DataFrame
    for column in data.columns:
        # Count the number of unique values in the column
        unique_values = data[column].nunique()

        # If there are fewer than 3 unique values, consider it a categorical column
        if unique_values < 3:
            cat_col.append(column)
        # Otherwise, consider it a numerical column
        else:
            num_col.append(column)

    # Return the lists of categorical and numerical columns
    return cat_col, num_col


In [14]:
cat_col, num_col = classify_columns(df)
print(f'Categorical columns in training dataset: {cat_col}')
print(f'Numerical columns in training dataset: {num_col}')

Categorical columns in training dataset: []
Numerical columns in training dataset: ['video_id', 'title', 'duration_sec', 'hashtags_count', 'views', 'likes', 'comments', 'shares', 'upload_hour', 'category']


In [15]:
# Perform correlation analysis to identify relationships between features
correlation = df.corr(numeric_only=True)
correlation["likes"].sort_values(ascending=False)

Unnamed: 0,likes
likes,1.0
views,0.035115
hashtags_count,0.011884
upload_hour,-0.014855
comments,-0.034672
shares,-0.03739
duration_sec,-0.052211


In [16]:
likes_count = df['likes'].value_counts()
likes_count

Unnamed: 0_level_0,count
likes,Unnamed: 1_level_1
6471,2
27634,1
19316,1
43436,1
1442,1
...,...
10367,1
45640,1
38618,1
8252,1
