# Youtube dislike prediction in real time - working with a combination of data; practical guide

Dataset available at: https://www.kaggle.com/datasets/dmitrynikolaev/youtube-dislikes-dataset

## Loading data

In [1]:
!unzip -q archive.zip

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
dataset = pd.read_csv('youtube_dislike_dataset.csv')
dataset.head()

Unnamed: 0,video_id,title,channel_id,channel_title,published_at,view_count,likes,dislikes,comment_count,tags,description,comments
0,--0bCF-iK2E,Jadon Sancho Magical Skills & Goals,UC6UL29enLNe4mqwTfAyeNuw,Bundesliga,2021-07-01 10:00:00,1048888,19515,226,1319,football soccer ftbol alemn Bundesliga season ...,Enjoy the best skills and goals from Jadon San...,"Respect to Dortmund fans,must be sad losing hi..."
1,--14w5SOEUs,Migos - Avalanche (Official Video),UCGIelM2Dj3zza3xyV3pL3WQ,MigosVEVO,2021-06-10 16:00:00,15352638,359277,7479,18729,Migos Avalanche Quality Control Music/Motown R...,"Watch the the official video for Migos - ""Aval...",Migos just makes me want to live my live to th...
2,--40TEbZ9Is,Supporting Actress in a Comedy: 73rd Emmys,UClBKH8yZRcM4AsRjDVEdjMg,Television Academy,2021-09-20 01:03:32,925281,11212,401,831,,Hannah Waddingham wins the Emmy for Supporting...,Hannah's energy bursts through any screen. Wel...
3,--4tfbSyYDE,JO1'YOUNG (JO1 ver.)' PERFORMANCE VIDEO,UCsmXiDP8S40uBeJYxvyulmA,JO1,2021-03-03 10:00:17,2641597,39131,441,3745,PRODUCE101JAPAN JO1 TheSTAR STA...,JO1'YOUNG (JO1 ver.)' PERFORMANCE VIDEO\n\n---...,youngVer>< REN is really PERFECT. It's not ju...
4,--DKkzWVh-E,Why Retaining Walls Collapse,UCMOqf8ab-42UUQIdVoKwjlQ,Practical Engineering,2021-12-07 13:00:00,715724,32887,367,1067,retaining wall New Jersey highway Direct Conne...,One of the most important (and innocuous) part...,Keep up with all my projects here: https://pr...


In [4]:
file_US_ids = open("video_IDs/unique_ids_US.txt", "r")
US_ids = file_US_ids.read().splitlines()

In [5]:
dataset = dataset[dataset['video_id'].isin(US_ids)]

In [6]:
dataset.shape 

(15835, 12)

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15835 entries, 1 to 37419
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   video_id       15835 non-null  object
 1   title          15835 non-null  object
 2   channel_id     15835 non-null  object
 3   channel_title  15835 non-null  object
 4   published_at   15835 non-null  object
 5   view_count     15835 non-null  int64 
 6   likes          15835 non-null  int64 
 7   dislikes       15835 non-null  int64 
 8   comment_count  15835 non-null  int64 
 9   tags           15835 non-null  object
 10  description    15835 non-null  object
 11  comments       15818 non-null  object
dtypes: int64(4), object(8)
memory usage: 1.6+ MB


In [8]:
dataset = dataset[['view_count', 'published_at','likes', 'comment_count','tags','description','dislikes']]
dataset.head()

Unnamed: 0,view_count,published_at,likes,comment_count,tags,description,dislikes
1,15352638,2021-06-10 16:00:00,359277,18729,Migos Avalanche Quality Control Music/Motown R...,"Watch the the official video for Migos - ""Aval...",7479
2,925281,2021-09-20 01:03:32,11212,831,,Hannah Waddingham wins the Emmy for Supporting...,401
4,715724,2021-12-07 13:00:00,32887,1067,retaining wall New Jersey highway Direct Conne...,One of the most important (and innocuous) part...,367
5,36124750,2021-12-01 09:00:03,965069,59657,Kpop girl group 1theK Starshiptv starship MV...,IVE Twitter\n: https://twitter.com/IVEstarship...,16618
8,535044,2021-08-06 12:10:25,9207,1900,the breakfast club breakfast club power1051 ce...,Subscribe NOW to The Breakfast Club: http://ih...,384


## Checking and removing null values

In [9]:
dataset.isna().sum().sum()

0

In [10]:
dataset.replace(" ", np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [11]:
dataset.isna().sum()

view_count          0
published_at        0
likes               0
comment_count       0
tags             2265
description       268
dislikes            0
dtype: int64

In [12]:
dataset = dataset.dropna()
dataset.isna().sum().sum()

0

In [13]:
dataset.shape

(13536, 7)

## Creating time Feature

In [14]:
from datetime import datetime
def calTime(time):
  start = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
  end =   datetime.strptime('13/12/2021 00:00:00', '%d/%m/%Y %H:%M:%S') # assuming this is the date that this dataset was extracted
  return np.round((end - start).total_seconds() / 60, 2)

In [15]:
dataset['timesec'] = dataset['published_at'].apply(calTime)

## Cleaning text

In [16]:
import re
from string import punctuation 
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [17]:
contraction_map={
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd've": "how did have",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "might have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "shall'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "will't've": "will not have",
    "would've": "would have",
    "would't": "would not",
    "would't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you have all",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [18]:

lemmatizer = WordNetLemmatizer()
in_words = set(nltk.corpus.words.words())
def clean_text(text):
    text = str(text)
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub('"','', text)
    text = ' '.join([contraction_map[t] if t in contraction_map else t for t in text.split(" ")])
    text = re.sub(r"'s\b","",text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in in_words or not w.isalpha())

    text = [word for word in text.split( ) if word not in stopwords.words('english')]
    text = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)
    text = text.lower()
    return text

In [19]:
%time dataset['clean_description'] = dataset['description'].apply(clean_text)
%time dataset['clean_tags'] = dataset['tags'].apply(clean_text)

https://www.scribbleshowdown.com/
https://www.scribbleshowdown.com/" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


CPU times: user 1min 59s, sys: 14.4 s, total: 2min 13s
Wall time: 2min 16s
CPU times: user 28.5 s, sys: 3.16 s, total: 31.6 s
Wall time: 31.6 s


## Splitting data

In [20]:
dataset_train = dataset.iloc[:12500,:]
dataset_test = dataset.iloc[12501:,:]

In [21]:
X_train = dataset_train.loc[:, dataset.columns != 'dislikes']
Y_train = dataset_train['dislikes'].values

In [22]:
X_test = dataset_test.loc[:, dataset.columns != 'dislikes']
Y_test = dataset_test['dislikes'].values

In [23]:
X_train_numaric = X_train[['view_count', 'likes', 'comment_count', 'timesec']].values
X_train_tags = X_train['clean_tags'].values
X_train_desc = X_train['clean_description'].values

In [24]:
X_test_numaric = X_test[['view_count', 'likes', 'comment_count', 'timesec']].values
X_test_tags = X_test['clean_tags'].values
X_test_desc = X_test['clean_description'].values

## Text tokenization

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
def Tokenizer_func(train,test, max_words_length=0, max_seq_len=100):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train)
    
    max_words = 0
    if max_words_length > 0:
        max_words = max_words_length
    else:
        max_words = len(tokenizer.word_counts.items())

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train)

    train_sequences = tokenizer.texts_to_sequences(train)
    test_sequences = tokenizer.texts_to_sequences(test)
    
    train = pad_sequences(train_sequences,maxlen=max_seq_len, padding='post')
    test = pad_sequences(test_sequences,maxlen=max_seq_len, padding='post')
    
    voc = tokenizer.num_words +1
    return {'train': train, 'test': test, 'voc': voc, 'max_words':max_words, 'tokenizer': tokenizer}

For tags (text)

In [27]:
X_tags_processed = Tokenizer_func(X_train_tags,X_test_tags)

In [28]:
X_train_tags,X_test_tags,x_tags_voc,x_tags_max_words,x_tag_tok = X_tags_processed['train'], X_tags_processed['test'],X_tags_processed['voc'],X_tags_processed['max_words'],X_tags_processed['tokenizer']

For description (text)

In [29]:
X_desc_processed = Tokenizer_func(X_train_desc,X_test_desc)

In [30]:
X_train_desc, X_test_desc,x_desc_voc,x_desc_max_words,x_desc_tok = X_desc_processed['train'], X_desc_processed['test'],X_desc_processed['voc'],X_desc_processed['max_words'],X_desc_processed['tokenizer']

## Numeric data normalization

In [31]:
from sklearn.preprocessing import StandardScaler

In [32]:
Sc = StandardScaler()
X_train_numaric = Sc.fit_transform(X_train_numaric)
X_test_numaric = Sc.transform(X_test_numaric)

## Creating the Model and Training

In [33]:
from keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, concatenate,LayerNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

In [34]:
tagsInput = Input(shape=(None,), name='tags')
descInput = Input(shape=(None,), name='desc')
numaricInput = Input(shape=(4,), name='numaric')

tags = Embedding(input_dim=x_tags_voc,output_dim=8,input_length=x_tags_max_words)(tagsInput)
tags = LSTM(100,dropout=0.2, return_sequences=True)(tags)
tags = LayerNormalization()(tags)
tags = LSTM(100,dropout=0.4, return_sequences=False)(tags)
tags = LayerNormalization()(tags)

desc = Embedding(input_dim=x_desc_voc,output_dim=8,input_length=x_desc_max_words)(descInput)
desc = LSTM(100,dropout=0.2, return_sequences=True)(desc)
desc = LayerNormalization()(desc)
desc = LSTM(100,dropout=0.4, return_sequences=False)(desc)
desc = LayerNormalization()(desc)

combined = concatenate([tags, desc,numaricInput])
x = Dense(256,activation='relu')(combined)
x = Dense(128,activation='relu')(x)
x = Dense(32,activation='relu')(x)
x = Dense(1,use_bias=True,activation='linear')(x)
model = Model([tagsInput, descInput,numaricInput], x)

In [35]:
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001, decay=0.001 / 20), metrics=['mae'])

In [36]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 tags (InputLayer)              [(None, None)]       0           []                               
                                                                                                  
 desc (InputLayer)              [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 8)      71368       ['tags[0][0]']                   
                                                                                                  
 embedding_1 (Embedding)        (None, None, 8)      129976      ['desc[0][0]']                   
                                                                                              

In [37]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

Reshaping the text data since LSTM requires 3 dimensional data

In [38]:
X_train_tags = np.reshape(X_train_tags,(X_train_tags.shape[0],X_train_tags.shape[1],1))
X_train_desc = np.reshape(X_train_desc,(X_train_desc.shape[0],X_train_desc.shape[1],1))
X_train_tags.shape, X_train_desc.shape, X_train_numaric.shape

((12500, 100, 1), (12500, 100, 1), (12500, 4))

In [49]:
history = model.fit(
                    x=[X_train_tags, X_train_desc,X_train_numaric],
                    y=Y_train,
                    epochs=500, 
                    batch_size=25,
                    validation_split=0.2,
                    verbose=1,
                    callbacks=[es]
                  )

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 26: early stopping


## Evaluation matrix

In [52]:
pprydd = model.predict([X_test_tags, X_test_desc, X_test_numaric])
from sklearn import metrics
print("Mean Absolute Error (MAE) - Test data : ", metrics.mean_absolute_error(Y_test, pprydd))
print("Mean Squared Error (MSE) - Test data : ", metrics.mean_squared_error(Y_test, pprydd))
print("Root Mean Squared Error (RMSE) - Test data : ", np.sqrt(metrics.mean_squared_error(Y_test, pprydd)))
print("Co-efficient of determination (R2 Score): ", metrics.r2_score(Y_test, pprydd))

Mean Absolute Error (MAE) - Test data :  3305.416272267512
Mean Squared Error (MSE) - Test data :  226826599.84264407
Root Mean Squared Error (RMSE) - Test data :  15060.76358763539
Co-efficient of determination (R2 Score):  0.8716876178939388


## Real-Time

In [53]:
import googleapiclient.discovery

In [54]:
DEVELOPER_KEY = 'YOUR_API_KEY' 
youtube_client = googleapiclient.discovery.build('youtube', 'v3', developerKey=DEVELOPER_KEY)

In [55]:
def realtime(youtube,video_id):
  def calTimesss(time):
    start = datetime.strptime(time, '%Y-%m-%dT%H:%M:%S%z')
    end =   datetime.now()
    return np.round((end - start.replace(tzinfo=end.tzinfo)).total_seconds() / 60, 2)
  request = youtube.videos().list(part="snippet, statistics",id=video_id)
  response = request.execute()

  desc = response['items'][0]['snippet']['description']
  desc  =[clean_text(desc)]
  desc = x_desc_tok.texts_to_sequences(desc)
  desc = pad_sequences(desc,maxlen=100, padding='post')
  
  tags = response['items'][0]['snippet']['tags']
  tags=(" ".join(tags))
  tags  =[clean_text(tags)]
  tags = x_tag_tok.texts_to_sequences(tags)
  tags = pad_sequences(tags,maxlen=100, padding='post')

  publishedAt = response['items'][0]['snippet']['publishedAt']
  timesec = calTimesss(publishedAt)
  viewcount = response['items'][0]['statistics']['viewCount']
  likeCount = response['items'][0]['statistics']['likeCount']
  commentCount = response['items'][0]['statistics']['commentCount']
  numaricdata = [[viewcount, likeCount,commentCount,timesec]]
  numaricdata = Sc.transform(numaricdata)

  pryd = model.predict([tags, desc, numaricdata])

  return {"predicted": int(pryd[0][0]), "info": {
      "video_id": video_id,
      "likes": likeCount,
      "commentCount": commentCount,
      "viewCount": viewcount,
      "publishedAt": publishedAt,
      "dislike": int(pryd[0][0]),
  }}

In [56]:
video_id = "videoid"

In [57]:
realtime(youtube_client, video_id)



{'predicted': 1231,
 'info': {'video_id': '8oENQDbSgQg',
  'likes': '2136',
  'commentCount': '54',
  'viewCount': '82122',
  'publishedAt': '2022-10-23T20:35:36Z',
  'dislike': 1231}}