# CS 688 Project: Sound to Text and Text Summarization

## Libraries

In [1]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import librosa
import editdistance

## Sound to Text

### Converting speech to text

In [2]:
# load model and tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
fname = "TED_Talk.wav"; duration = 15.6
total_duration, segment_length, start_at = librosa.get_duration(filename=fname), 20, 0
segment_start_times = range(start_at, int(total_duration), segment_length)

	This alias will be removed in version 1.0.
  total_duration, segment_length, start_at = librosa.get_duration(filename=fname), 20, 0


In [43]:
sentences = actual_transcript.split(".")
sentences = [s for s in sentences if s]  # remove empty strings
sentences = [s.split("!") for s in sentences]
sentences = [item for sublist in sentences for item in sublist]  # flatten list of lists
sentences = [s.split("?") for s in sentences]
sentences = [item for sublist in sentences for item in sublist]  # flatten list of lists
num_sentences = len(sentences)
print("Number of sentences in transcript:",num_sentences)

Number of sentences in transcript: 165


In [4]:
with open("TED_Talk_Transcript.txt", "r") as f:
    actual_transcript = f.read().strip()

actual_transcript = actual_transcript.replace('/', '').upper()

In [5]:
num_segments = len(segment_start_times)
print("Number of segments:", num_segments)

Number of segments: 47


In [6]:
combined_transcription = ""
for i, start_time in enumerate(segment_start_times):

    print(f"Processing segment {i+1}/{num_segments}")

    start_time = i * segment_length + start_at
    end_time = start_time + segment_length

    # Load "segment_length" seconds of the file, starting at "start_at" seconds
    speech, rate = librosa.load(fname, sr=16000, offset=start_time, duration=segment_length)

    # Tokenize the waveform
    input_values = tokenizer(speech, return_tensors='pt').input_values

    # Retrieve logits from the model
    logits = model(input_values).logits

    # Take argmax value and decode into transcription
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)

    # Combine the transcriptions
    combined_transcription += " " + transcription[0]

Processing segment 1/47
Processing segment 2/47
Processing segment 3/47
Processing segment 4/47
Processing segment 5/47
Processing segment 6/47
Processing segment 7/47
Processing segment 8/47
Processing segment 9/47
Processing segment 10/47
Processing segment 11/47
Processing segment 12/47
Processing segment 13/47
Processing segment 14/47
Processing segment 15/47
Processing segment 16/47
Processing segment 17/47
Processing segment 18/47
Processing segment 19/47
Processing segment 20/47
Processing segment 21/47
Processing segment 22/47
Processing segment 23/47
Processing segment 24/47
Processing segment 25/47
Processing segment 26/47
Processing segment 27/47
Processing segment 28/47
Processing segment 29/47
Processing segment 30/47
Processing segment 31/47
Processing segment 32/47
Processing segment 33/47
Processing segment 34/47
Processing segment 35/47
Processing segment 36/47
Processing segment 37/47
Processing segment 38/47
Processing segment 39/47
Processing segment 40/47
Processin

### Performance

In [7]:
print("Predicted transcribed audio:", combined_transcription)

Predicted transcribed audio:  SO ANYONE WHO'S BEEN PAYING ATTENTION FOR THE LAST FEW MONTHS HAS BEEN SEEING HEADLINES LIKE THIS ESPECIALLY IN EDUCATION THE THESIS HAS BEEN STUDENTS ARE GOING TO BE USING CHAT G P T IN OTHER FORMS OF A I TO CHEAT DU THEIR ASIGNMENTS THEY 'RE NOT GOING TO LEARN ANS GOING TO COMPLETELY UNDERMINE EDUCATION AS WE KNOW IT NOW WHAT I'M AN ARGUE TODAY IS NOT ONLY ARE THEIR WAYS TO MITIGATE ALL THE FACT IF WE PUT THE RIGHT GADRAILS WE DO THE RIGHT THINGS WE CAN MITIGATE IT BUT I THING GERE AT THE COSP OF USING A I FOR PROBABLY THE BIGGEST TRANSPOSITIVE TRANSFORMATION THAT EDUCATION HAS EVER SEEN AND THE WAY WE'RE GOING TO DO THAT IS BY GIVING EVERY STUDENT ON THE PLANET AN ARTIFICIALLY INTELLIGENT BUT AMAZING PERSONAL TUTOR AND ARE GOING TO GIVE EVERY TEACHER ON THE PLANET A AN AMAZING ARTIFICIALLY INTELLIGENT TEACHING ASSISTANT AND JUST TO APPRECIATE HOW BIG OF A DEAL IT WOULD BE TO GIVE EVERY ONE A PERSONAL TUTOR I SHOW YOU THIS CLIP FROM BENJAMIN BLOOMS NINET

In [8]:
print("Actual transcribed audio:", actual_transcript)

Actual transcribed audio: SO ANYONE WHO'S BEEN PAYING ATTENTION FOR THE LAST FEW MONTHS HAS BEEN SEEING HEADLINES LIKE THIS, ESPECIALLY IN EDUCATION. THE THESIS HAS BEEN: STUDENTS ARE GOING TO BE USING CHATGPT AND OTHER FORMS OF AI TO CHEAT, DO THEIR ASSIGNMENTS. THEY’RE NOT GOING TO LEARN. AND IT’S GOING TO COMPLETELY UNDERMINE EDUCATION AS WE KNOW IT. NOW, WHAT I'M GOING TO ARGUE TODAY IS NOT ONLY ARE THERE WAYS TO MITIGATE ALL OF THAT, IF WE PUT THE RIGHT GUARDRAILS, WE DO THE RIGHT THINGS, WE CAN MITIGATE IT. BUT I THINK WE'RE AT THE CUSP OF USING AI FOR PROBABLY THE BIGGEST POSITIVE TRANSFORMATION THAT EDUCATION HAS EVER SEEN. AND THE WAY WE'RE GOING TO DO THAT IS BY GIVING EVERY STUDENT ON THE PLANET AN ARTIFICIALLY INTELLIGENT BUT AMAZING PERSONAL TUTOR. AND WE'RE GOING TO GIVE EVERY TEACHER ON THE PLANET AN AMAZING, ARTIFICIALLY INTELLIGENT TEACHING ASSISTANT. AND JUST TO APPRECIATE HOW BIG OF A DEAL IT WOULD BE TO GIVE EVERYONE A PERSONAL TUTOR, I SHOW YOU THIS CLIP FROM BENJA

In [9]:
ground_truth = combined_transcription
hypothesis = actual_transcript

wer = editdistance.eval(ground_truth.split(), hypothesis.split()) / len(ground_truth.split())

# Calculate CER
cer = editdistance.eval(ground_truth, hypothesis) / len(ground_truth)

# Print the results
print(f"Word Error Rate (WER): {wer:.2f}")
print(f"Character Error Rate (CER): {cer:.2f}")

Word Error Rate (WER): 0.30
Character Error Rate (CER): 0.11


## Text Summarization

In [30]:
with open("TED_Talk_Transcript.txt", "r") as f:
    actual_transcript = f.read().strip()

### 1. TextRank Algorithm

In [31]:
from gensim.summarization.summarizer import summarize

In [32]:
summary = summarize(actual_transcript, ratio=0.03)
print(summary)

And the way we're going to do that is by giving every student on the planet an artificially intelligent but amazing personal tutor.
And he had good data that showed that look, a normal distribution, that's the one that you see in the traditional bell curve right in the middle, that's how the world kind of sorts itself out, that if you were to give personal 1-to-1 to tutoring for students, then you could actually get a distribution that looks like that right.
It knows all the context of what the student is doing, and it understands that those ellipses are there to draw clouds, which I think is kind of mind-blowing.
It can answer the age-old question, “Why do I need to learn this?” And it asks Socratically, "Well, what do you care about?" And let's say the student says, "I want to be a professional athlete." And it says, "Well, learning about the size of cells, which is what this video is, that could be really useful for understanding nutrition and how your body works, etc." It can answe

### 2. Luhn's Algorithm

In [33]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import defaultdict

In [34]:
# tokenize the text into sentences and words
sentences = sent_tokenize(actual_transcript)
words = word_tokenize(actual_transcript)

# calculate the frequency of each word
freq = defaultdict(int)
for word in words:
    freq[word.lower()] += 1

# rank the sentences based on the frequency of important words
scores = defaultdict(int)
for i, sentence in enumerate(sentences):
    for word in word_tokenize(sentence.lower()):
        if word in freq:
            scores[i] += freq[word]

# select the top 3 sentences as the summary
summary = ' '.join([sentences[idx] for idx in sorted(scores, key=scores.get, reverse=True)[:4]])
print(summary)

I'm pretty convinced that the first line of reasoning is actually almost a self-fulfilling prophecy, that if we act with fear and if we say, "Hey, we've just got to stop doing this stuff," what's really going to happen is the rule followers might pause, might slow down, but the rule breakers, as Alexandr [Wang] mentioned, the totalitarian governments, the criminal organizations, they're only going to accelerate. Now, if the student makes a mistake, and this will surprise people who think large language models are not good at mathematics, notice, not only does it notice the mistake, it asks the student to explain their reasoning, but it's actually doing what I would say, not just even an average tutor would do, but an excellent tutor would do. So this is a little thing, and my eight year old is addicted to this, and he's not a kid that really liked writing before, but you can say, “I want to write a horror story,” and it says, "Ooh, a horror story, how spine-tingling and thrilling. And 

### 3. Latent Semantic Analysis (LSA)

In [35]:
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.utils import get_stop_words
from sumy.nlp.stemmers import Stemmer

In [37]:
# Create a parser object to parse the text
parser = PlaintextParser.from_file('TED_Talk_Transcript.txt', Tokenizer("english"))
stemmer = Stemmer("english")

# Create a LSA summarizer object
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words("english")

# Print the summary
for sentence in summarizer(parser.document, 4):
        print(sentence)

And it says, "To make the right cloud move as well, try adding a line of code inside the draw function that increments the right X variable by one pixel in each frame."
They can start to do stuff that once again, we never had the capability to give everyone a tutor, everyone a writing coach to actually dig in to reading at this level.
But we think this could be equally as powerful for the teacher to drive more personalized education and frankly save time and energy for themselves and for their students.
These large language models are so powerful, there's a temptation to say like, well, all these people are just going to slap them onto their websites, and it kind of turns the applications themselves into commodities.


### 4. LexRank

In [38]:
from sumy.summarizers.lex_rank import LexRankSummarizer

In [40]:
# Create a parser object to parse the text
parser = PlaintextParser.from_file('TED_Talk_Transcript.txt', Tokenizer("english"))
stemmer = Stemmer("english")

# Create a LSA summarizer object
summarizer = LexRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words("english")

# Print the summary
for sentence in summarizer(parser.document, 4):
        print(sentence)

Students can get into debates with the AI.
But we are showing that there's ways that the AI doesn't write for you, it writes with you.
And the reason is, there was a lot of work behind the scenes to make that happen.
Now, just to take a step back at a meta level, obviously we heard a lot today, the debates on either side.


In [None]:
# Load Packages
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

# For Strings
parser = PlaintextParser.from_file("TED_Talk_Transcript.txt",Tokenizer("english"))
from sumy.summarizers.text_rank import TextRankSummarizer

# Summarize using sumy TextRank
summarizer = TextRankSummarizer()
summary = summarizer(parser.document, 15)
text_summary=""

for sentence in summary:
    text_summary+=str(sentence)

print(text_summary)

Now, what I'm going to argue today is not only are there ways to mitigate all of that, if we put the right guardrails, we do the right things, we can mitigate it.And just to appreciate how big of a deal it would be to give everyone a personal tutor, I show you this clip from Benjamin Bloom’s 1984 2 sigma study, or he called it the “2 sigma problem.” The 2 sigma comes from two standard deviation, sigma, the symbol for standard deviation.And he had good data that showed that look, a normal distribution, that's the one that you see in the traditional bell curve right in the middle, that's how the world kind of sorts itself out, that if you were to give personal 1-to-1 to tutoring for students, then you could actually get a distribution that looks like that right.I'm going to show you the early stages of what our AI, which we call Khanmigo, what it can now do and maybe a little bit of where it is actually going.Now, if the student makes a mistake, and this will surprise people who think la

## Keyword & Keyphrases Extraction

### Libraries

In [None]:
from keybert import KeyBERT
import app.text_samples as ts

### Keyword Extraction

#### 1. Base BERT Model

In [None]:
doc = actual_transcript

In [None]:
model = KeyBERT('distilbert-base-nli-mean-tokens')
keywords = model.extract_keywords(doc, stop_words='english', top_n=5)

print("Top 5 Keywords:", keywords)

Top 5 Keywords: [('dystopian', 0.1881), ('accident', 0.1175), ('cheating', 0.1103), ('horror', 0.0984), ('fear', 0.0974)]


#### 2. Max Sum Similarity

In [None]:
# 2.3. Max Sum Similarity
# To diversity the results, we take the 2 x top_n most similar words/phrases to the document. Then, we take all top_n combinations from the 2 x top_n
# words and extract the combination that are the least similar to each other by cosine similarity.
model = KeyBERT('distilbert-base-nli-mean-tokens')
keywords1 = model.extract_keywords(doc, stop_words='english', use_maxsum=True, nr_candidates=20, top_n=5)
print("Top 5 Keyphrases using Max Sum Similarity:", keywords1)

Top 5 Keyphrases using Max Sum Similarity: [('pessimistic', 0.0602), ('war', 0.0652), ('tutorial', 0.0797), ('millionaire', 0.0874), ('teachers', 0.0887)]


#### 3. Maximal Marginal Relevance

In [None]:
# 2.4. Maximal Marginal Relevance
# To diversify the results, we can use Maximal Margin Relevance (MMR) to create keywords / keyphrases which is also based on cosine similarity.
# The results with high diversity:
model = KeyBERT('distilbert-base-nli-mean-tokens')
keywords2 = model.extract_keywords(doc, stop_words='english', use_mmr=True, diversity=0.7, top_n=5)
print("Top 5 Keywords using Maximal Marginal Relevance:", keywords2)

Top 5 Keywords using Maximal Marginal Relevance: [('dystopian', 0.1881), ('teachers', 0.0887), ('millionaire', 0.0874), ('months', -0.0229), ('arizona', -0.0651)]


#### 4. Sentence Transformer

In [None]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens", device="cpu")
model = KeyBERT(model=sentence_model)
keywords3 =  model.extract_keywords(doc, stop_words='english', use_maxsum=True, nr_candidates=30, top_n=5)
print("Top 5 Keywords using Sentence Transformer:", keywords3)

Top 5 Keywords using Sentence Transformer: [('mathematics', 0.0202), ('university', 0.0234), ('pessimistic', 0.0602), ('tutoring', 0.0645), ('millionaire', 0.0874)]


#### 5. Spacy Transformer

In [None]:
# Using spacy-transformer models:
import spacy

spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
model = KeyBERT(model=nlp)
keywords5 = model.extract_keywords(doc, stop_words='english', use_maxsum=True, nr_candidates=20, top_n=5)
print("Top 5 Keywords using Spacy Transformer:", keywords5)

Top 5 Keywords using Spacy Transformer: [('choice', 0.3606), ('just', 0.3679), ('opportunity', 0.3694), ('variable', 0.3702), ('fast', 0.3909)]


### Keyphrases Extraction

#### 1. Base BERT Model

In [None]:
model = KeyBERT('distilbert-base-nli-mean-tokens')
keyphrases = model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', top_n=5)

print("Top 5 Keyphrases:", keyphrases)

Top 5 Keyphrases: [('student debating cancel', 0.4364), ('scaring accident ai', 0.4301), ('student canceling student', 0.4219), ('industrial revolution scary', 0.4011), ('cheating tool student', 0.3742)]


#### 2. Max Sum Similarity

In [None]:
# 2.3. Max Sum Similarity
# To diversity the results, we take the 2 x top_n most similar words/phrases to the document. Then, we take all top_n combinations from the 2 x top_n
# words and extract the combination that are the least similar to each other by cosine similarity.
model = KeyBERT('distilbert-base-nli-mean-tokens')
keyphrases1 = model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', use_maxsum=True, nr_candidates=20, top_n=5)
print("Top 5 Keyphrases using Max Sum Similarity:", keyphrases1)

Top 5 Keyphrases using Max Sum Similarity: [('world class tutor', 0.3111), ('months seeing headlines', 0.3114), ('canceling student debt', 0.325), ('totalitarian governments criminal', 0.3513), ('better math people', 0.365)]


#### 3. Maximal Marginal Relevance

In [None]:
# 2.4. Maximal Marginal Relevance
# To diversify the results, we can use Maximal Margin Relevance (MMR) to create keywords / keyphrases which is also based on cosine similarity.
# The results with high diversity:
model = KeyBERT('distilbert-base-nli-mean-tokens')
keyphrases2 = model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', use_mmr=True, diversity=0.7, top_n=5)
print("Top 5 Keyphrases using Maximal Marginal Relevance:", keyphrases2)

Top 5 Keyphrases using Maximal Marginal Relevance: [('student debating cancel', 0.4364), ('totalitarian governments criminal', 0.3513), ('high school arizona', 0.1407), ('mississippi river brings', 0.107), ('intelligent amazing personal', 0.0978)]


#### 4. Sentence Transformer

In [None]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens", device="cpu")
model = KeyBERT(model=sentence_model)
keyphrases3 =  model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', use_maxsum=True, nr_candidates=30, top_n=5)
print("Top 5 Keyphrases using Sentence Transformer:", keyphrases3)

Top 5 Keyphrases using Sentence Transformer: [('famous speech stanford', 0.299), ('teachers just saw', 0.3004), ('months seeing headlines', 0.3114), ('totalitarian governments criminal', 0.3513), ('better math people', 0.365)]


#### 5. Spacy Transformer

In [None]:
# Using spacy-transformer models:
import spacy

spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
model = KeyBERT(model=nlp)
keyphrases4 = model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', use_maxsum=True, nr_candidates=20, top_n=5)
print("Top 5 Keyphrases using Spacy Transformer:", keyphrases4)

Top 5 Keyphrases using Spacy Transformer: [('tell mistake instead', 0.4454), ('life coach exactly', 0.448), ('socratically like oral', 0.4577), ('scaring accident ai', 0.4582), ('pessimistic view ai', 0.4835)]
