<a href="https://colab.research.google.com/github/parthh03/Youtube-Summarizer/blob/main/Youtube_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and import necessary libraries

In [2]:
!pip install youtube-transcript-api

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.1-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.1


In [3]:
import youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
import nltk
import re
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Extract YouTube video transcript using the video link

In [5]:
link = "https://www.youtube.com/watch?v=yWnacRo2VbA"
unique_id = link.split("=")[-1]
sub = YouTubeTranscriptApi.get_transcript(unique_id)
subtitle = " ".join([x['text'] for x in sub])

# Import the sentence tokenizer from NLTK

In [6]:
from nltk.tokenize import sent_tokenize

 # Replace newline characters with an empty string.

In [8]:
import nltk
nltk.download('punkt')

subtitle = subtitle.replace("n","")
sentences = sent_tokenize(subtitle)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Create a dictionary with sentence indices as keys and sentences as values

In [9]:
organized_sent = {k:v for v,k in enumerate(sentences)}

# Text Analysis with TF-IDF Vectorization

In [16]:
tf_idf = TfidfVectorizer(min_df=1,
                        max_df=1.0,  # Set max_df to 1.0 to effectively disable it
                        strip_accents='unicode',
                        max_features=None,
                        lowercase=True,
                        token_pattern=r'\w{1,}',
                        ngram_range=(1, 3),
                        use_idf=1,
                        smooth_idf=1,
                        sublinear_tf=1,
                        stop_words='english')

sentence_vectors = tf_idf.fit_transform(sentences)
sent_scores = np.array(sentence_vectors.sum(axis=1)).ravel()


# Extracting Top N Sentences based on TF-IDF Scores

In [19]:
import numpy as np
N = 3
top_n_sentences = [sentences[index] for index in np.argsort(sent_scores, axis=0)[::-1][:N]]
np.argsort(sent_scores, axis=0)[::-1][:N]

array([0])

In [20]:
# mapping the scored sentences with their indexes as in the subtitle
mapped_sentences = [(sentence,organized_sent[sentence]) for sentence in top_n_sentences]
# Ordering the top-n sentences in their original order
mapped_sentences = sorted(mapped_sentences, key = lambda x: x[1])
ordered_sentences = [element[0] for element in mapped_sentences]
# joining the ordered sentence
summary = " ".join(ordered_sentences)

# Install the transformers library and its modules

In [22]:
!pip install transformers
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
Col

# Initialize the tokenizer and model with the pre-trained 'facebook/bart-large-cnn'

In [23]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

# Encode the 'subtitle' text into a PyTorch tensor

In [24]:
input_tensor = tokenizer.encode( subtitle, return_tensors="pt", max_length=512)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Generate text using the input tensor and specified parameters

In [25]:
outputs_tensor = model.generate(input_tensor, max_length=160, min_length=120, length_penalty=2.0, num_beams=4, early_stopping=True)
outputs_tensor

tensor([[    2,     0,  9178,   203, 32871,   197,    47,   109,     7,  2217,
          5886,   103,   224,    47,   109,    75,   364,   196, 16749,  9641,
           643,   224,    47, 16112,   109,    24,   358, 17224,   459,   183,
            53,    71,  1994,  1023,     7,   103,     9,     5,   299,    23,
          9799,   809, 32798,   939,     5, 13561,  4193,  1506,    24,  8649,
         19420,  1023,     5, 35895,  2320,   939,     5,   232,  2329,  1551,
          1023,     5,   665,  2850, 39426,     4,  1234,    52,   364,   196,
             7,  1955,    66,   141,  7163, 32871,   888,    16,    13,  5886,
           872,  2329,     5,    25, 14682,   429,  2755,    47,   150, 32871,
            16,   372,    13,  1144,   474,  4600,  1899, 38639,  2329,  1639,
         16415,  2553,     9,    97,  6829,  2629,    24,    18, 19313,   419,
            25,  2375,    25,   144,    82,  3553,   967, 33116,    24,   606,
             7,  3774,  1023,  5886,    23,   513,  

# Decode the generated text and print it

In [26]:
print(tokenizer.decode(outputs_tensor[0]))

</s><s>how much cardio should you do to lose fat some say you do't eed ay whereas others say you gotta do it every sigle day but after speakig to some of the top atural bodybuilders i the idustry iterviewig the smartest experts i the world ad reviewig the latest sciece. First we eed to figure out how helpful cardio actually is for fat loss ad the aswer might surprise you while cardio is great for heart health logevity ad provides plety of other beefits it's ot early as effective as most people thik whe it comes to losig fat at least i the way most people do it.</s>
