## Import Libraries & Read Data

In [18]:
import ast
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Path of the raw video export, relative to the notebook's working directory.
DATA_PATH = 'videos.csv'
df = pd.read_csv(DATA_PATH)

## First Look At Data

In [19]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    500 non-null    int64 
 1   videoId       500 non-null    object
 2   publishedAt   500 non-null    object
 3   title         500 non-null    object
 4   tags          500 non-null    object
 5   duration      500 non-null    object
 6   viewCount     500 non-null    int64 
 7   likeCount     500 non-null    int64 
 8   commentCount  500 non-null    int64 
dtypes: int64(4), object(5)
memory usage: 35.3+ KB
None


Unnamed: 0.1,Unnamed: 0,videoId,publishedAt,title,tags,duration,viewCount,likeCount,commentCount
0,0,20_wpq65YB4,2023-06-29T15:01:15Z,James Maddison MASTERCLASS vs Nottingham Fores...,"['James Maddison', 'Maddison', 'masterclass', ...",PT6M2S,27837,949,70
1,1,2nimbJKUqAE,2023-06-29T09:31:36Z,Eberechi Eze with a SENSATIONAL solo goal!,"['Eberechi Eze', 'Eze', 'solo goal', 'solo goa...",PT12S,197633,7495,49
2,2,x3juwrll0n8,2023-06-28T20:04:48Z,James Maddison first time finish vs Spurs,"['James Maddison', 'Maddison', 'Spurs', 'Totte...",PT28S,887069,30172,79
3,3,7O3FzXkm0UY,2023-06-28T19:09:21Z,Arsenal GK denies Kai Havertz ❗️ #shorts,"['Arsenal', 'Kai Havertz', 'Chelsea', 'Havertz...",PT24S,537480,16988,134
4,4,Cj78dlDLgdE,2023-06-28T17:19:57Z,STUNNING Edouard Mendy save! #shorts,"['Edouard Mendy', 'Edouard Mendy save', 'Edoua...",PT8S,202941,7143,40


## Data Cleaning:
### Correcting Data

In [20]:
def _parse_tags(raw):
    """Convert one stringified tag list (e.g. "['a', 'b']") into a list of str.

    The cell is parsed as a Python literal, so tags that repr() wrapped in
    double quotes (because they contain an apostrophe) are split correctly
    and '[]' becomes an empty list. Values that are not strings (already
    parsed lists on a re-run, or missing values) are returned unchanged.
    Strings that are not valid literals fall back to a plain strip/split.
    """
    if not isinstance(raw, str):
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(tag) for tag in parsed]
    return raw.strip('[]').strip("''").split("', '")


def _duration_to_seconds(iso):
    """Convert a Series of ISO-8601 durations such as 'PT1H6M2S' to whole seconds.

    Each component (D, H, M, S) is optional; a missing component counts as 0.
    """
    def component(unit):
        # expand=False returns a Series even for a one-row frame.
        digits = iso.str.extract(rf'(\d+){unit}', expand=False)
        return pd.to_numeric(digits).fillna(0).astype('int64')

    return (
        component('D') * 86400
        + component('H') * 3600
        + component('M') * 60
        + component('S')
    )


# Drop unnecessary columns; errors='ignore' keeps the cell re-runnable.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Modify date data type.
df['publishedAt'] = pd.to_datetime(df['publishedAt'], format='%Y-%m-%dT%H:%M:%SZ')

# Modify tags column data type: string -> list of tags.
df['tags'] = df['tags'].map(_parse_tags)

# Modify duration to seconds (skipped when the column is already numeric).
if not pd.api.types.is_numeric_dtype(df['duration']):
    df['duration'] = _duration_to_seconds(df['duration'])

# Feature engineering
# NOTE: a viewCount of 0 yields inf/NaN in the two ratios below.
df['likeRatio'] = df['likeCount'] / df['viewCount']
df['commentRatio'] = df['commentCount'] / df['viewCount']

df['tagCount'] = df['tags'].str.len()
df['titleLength'] = df['title'].str.len()

# info() prints its own report and returns None, so it is not wrapped in print().
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   videoId       500 non-null    object        
 1   publishedAt   500 non-null    datetime64[ns]
 2   title         500 non-null    object        
 3   tags          500 non-null    object        
 4   duration      500 non-null    int64         
 5   viewCount     500 non-null    int64         
 6   likeCount     500 non-null    int64         
 7   commentCount  500 non-null    int64         
 8   likeRatio     500 non-null    float64       
 9   commentRatio  500 non-null    float64       
 10  tagCount      500 non-null    int64         
 11  titleLength   500 non-null    int64         
dtypes: datetime64[ns](1), float64(2), int64(6), object(3)
memory usage: 47.0+ KB
None


Unnamed: 0,videoId,publishedAt,title,tags,duration,viewCount,likeCount,commentCount,likeRatio,commentRatio,tagCount,titleLength
0,20_wpq65YB4,2023-06-29 15:01:15,James Maddison MASTERCLASS vs Nottingham Fores...,"[James Maddison, Maddison, masterclass, Totten...",362,27837,949,70,0.034091,0.002515,31,64
1,2nimbJKUqAE,2023-06-29 09:31:36,Eberechi Eze with a SENSATIONAL solo goal!,"[Eberechi Eze, Eze, solo goal, solo goal footb...",12,197633,7495,49,0.037924,0.000248,33,42
2,x3juwrll0n8,2023-06-28 20:04:48,James Maddison first time finish vs Spurs,"[James Maddison, Maddison, Spurs, Tottenham, T...",28,887069,30172,79,0.034013,8.9e-05,33,41
3,7O3FzXkm0UY,2023-06-28 19:09:21,Arsenal GK denies Kai Havertz ❗️ #shorts,"[Arsenal, Kai Havertz, Chelsea, Havertz, Kai H...",24,537480,16988,134,0.031607,0.000249,38,40
4,Cj78dlDLgdE,2023-06-28 17:19:57,STUNNING Edouard Mendy save! #shorts,"[Edouard Mendy, Edouard Mendy save, Edouard Me...",8,202941,7143,40,0.035197,0.000197,38,36


## EDA

In [None]:
# Distribution of each count-like column on a log(x + 1) scale.
# The video length lives in df['duration'] (seconds); the frame has no
# 'seconds' column, so indexing it raised KeyError.
hist_columns = ['duration', 'viewCount', 'likeCount', 'commentCount']

fig, axes = plt.subplots(2, 2, figsize=(12, 6))
for ax, column in zip(axes.flat, hist_columns):
    sns.histplot(np.log1p(df[column]), kde=True, bins=50, ax=ax)
    ax.set_xlabel(f'log({column} + 1)')

fig.tight_layout()
plt.show()

In [None]:
# log(likeCount + 1) against three candidate drivers, one panel each.
# The video length lives in df['duration'] (seconds); the frame has no
# 'seconds' column, so indexing it raised KeyError.
scatter_columns = ['viewCount', 'commentCount', 'duration']
log_likes = np.log1p(df['likeCount'])

fig, axes = plt.subplots(3, 1, figsize=(8, 12))
for ax, column in zip(axes, scatter_columns):
    ax.scatter(np.log1p(df[column]), log_likes, s=5)
    ax.set_xlabel(f'log({column} + 1)')
    ax.set_ylabel('log(likeCount + 1)')

fig.tight_layout()
# plt.show() renders the figure; the previous plt.plot() drew an empty line
# on the last axes instead.
plt.show()

In [None]:
# Like ratio over time. A line plot connects points in row order, so the rows
# are sorted by publication date first to avoid a zig-zag on unsorted data.
by_date = df.sort_values('publishedAt')

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(by_date['publishedAt'], by_date['likeRatio'])
ax.set_xlabel('publishedAt')
ax.set_ylabel('likeRatio')
plt.show()

In [None]:
from nltk.corpus import stopwords

# English stop words as a set for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand - confirm in the target environment.
eng_stopwords = {*stopwords.words('english')}

In [None]:
# Number of whitespace-separated tokens in each title.
title_tokens = df['title'].str.split()
df['titleWordCount'] = title_tokens.str.len()


In [None]:
# Per-tag statistics: mean like ratio and number of videos carrying the tag.
#
# The tags column holds lists, so it is exploded to one row per (video, tag)
# and aggregated in a single groupby pass. This avoids calling the string
# accessor's contains() on list cells and rescanning the whole column for
# every unique tag.
MIN_TAG_FREQ = 10  # keep only tags used on more than this many videos

tag_rows = (
    df[['tags', 'likeRatio']]
    .explode('tags')
    .rename(columns={'tags': 'tag'})
    .rename_axis('video')
    .reset_index()
    # A tag repeated inside one video's list must count that video only once.
    .drop_duplicates(subset=['video', 'tag'])
)

df2 = (
    tag_rows
    .groupby('tag', sort=False)['likeRatio']  # sort=False keeps first-seen order
    .agg(likeRatio='mean', freq='size')
    .reset_index()
)

df2 = df2[df2['freq'] > MIN_TAG_FREQ]

In [None]:
# The 20 tags with the highest mean like ratio.
df2.sort_values('likeRatio', ascending=False).head(20)

In [None]:
# The 20 most frequently used tags.
df2.sort_values('freq', ascending=False).head(20)