In [2]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.17.0-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
Collecting tensorflow-intel==2.17.0 (from tensorflow)
  Downloading tensorflow_intel-2.17.0-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow-

In [4]:
pip install tf-keras


Collecting tf-keras
  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.17.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 1.7/1.7 MB 11.7 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.17.0
Note: you may need to restart the kernel to use updated packages.


In [30]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import json

In [31]:
# Download the NLTK resources used by preprocess(): tokenizer models and the stop-word list.
# 'punkt_tab' is the resource word_tokenize loads on NLTK >= 3.9; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\apasi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\apasi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
# Load the pre-trained BERT tokenizer and a sequence-classification model from the same checkpoint.
# from_pretrained reports the classifier weights as newly initialised, so the model
# has to be fine-tuned before its predictions carry any meaning.
PRETRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
# Two output classes, matching the labels used by the training and inference code: 0 = non-key, 1 = key.
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Load transcript.json file
def load_transcript_json(transcript_path):
    """Read a JSON transcript file and return the parsed content.

    Args:
        transcript_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (a list or dict).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # 'utf-8-sig' decodes plain UTF-8 and also strips a leading BOM, which
    # json.load would otherwise reject. Without an explicit encoding the
    # platform default would be used (not UTF-8 on every system).
    with open(transcript_path, 'r', encoding='utf-8-sig') as f:
        return json.load(f)

# Load transcript.txt file and remove newline characters
def load_transcript_txt(transcript_path):
    """Return the content of a latin-1 text file with newlines turned into spaces.

    Args:
        transcript_path: Path of the text file to read.

    Returns:
        The file content as one string in which every newline character has
        been replaced by a single space.
    """
    with open(transcript_path, 'r', encoding='latin-1') as handle:
        raw = handle.read()
    # Splitting on '\n' and re-joining with ' ' swaps each newline for a space.
    return ' '.join(raw.split('\n'))

In [35]:
# Preprocess transcript text
def preprocess(text, max_length=512):
    """Normalise ``text`` and encode it as BERT input ids.

    Steps: lowercase, remove every character that is neither a word character
    nor whitespace, tokenize with NLTK, drop English stop words, Porter-stem
    the remaining tokens, re-join them with spaces, and encode the result with
    the module-level ``tokenizer``.

    Args:
        text: Raw transcript text.
        max_length: Upper bound on the encoded length, special tokens
            included; longer inputs are truncated. The default of 512 is the
            position limit of ``bert-base-uncased``, beyond which the model
            cannot run.

    Returns:
        The tensor returned by ``tokenizer.encode(..., return_tensors='tf')``.
    """
    # NOTE(review): bert-base-uncased was pretrained on unstemmed text with stop
    # words; confirm that stop-word removal and stemming actually help here.

    # Lowercase, then strip punctuation (anything that is not \w or \s).
    text = re.sub(r'[^\w\s]', '', text.lower())

    # Word-level tokenization
    tokens = nltk.word_tokenize(text)

    # Drop stop words and stem what is left
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]

    # Join tokens back into a string for BERT input
    cleaned_text = ' '.join(stemmed_tokens)

    # Truncate so a long segment never exceeds what the model can accept.
    return tokenizer.encode(
        cleaned_text,
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )

In [17]:
# Training loop (optional, based on the dataset)
def train_model(training_data, model, tokenizer, num_epochs=3, learning_rate=2e-5):
    """Fine-tune ``model`` on labelled text examples, one example per step.

    NOTE(review): train_model is defined again in a later cell of this
    notebook; on Run All the later definition replaces this one.

    Args:
        training_data: Iterable of dicts with a ``'text'`` string and an
            integer ``'label'`` (0 for non-key, 1 for key). It is iterated
            once per epoch, so pass a list rather than a one-shot generator.
        model: Classifier called as ``model(input_ids, training=True)`` whose
            result exposes ``logits``.
        tokenizer: Not used here; ``preprocess`` encodes text with the
            module-level ``tokenizer``. Kept so existing calls keep working.
        num_epochs: Number of passes over ``training_data``.
        learning_rate: Adam step size. BERT fine-tuning is normally run with
            small rates (around 2e-5 to 5e-5); Adam's 1e-3 default tends to
            destroy the pretrained weights.
    """
    # NOTE(review): with TF 2.17, tf.keras may resolve to Keras 3 while the
    # model is built on tf-keras; confirm this optimizer accepts its variables.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_steps = 0
        for batch in training_data:
            inputs = preprocess(batch['text'])
            # One label per encoded text, as a tensor so it lines up with the
            # logits of the single example (0 for non-key, 1 for key).
            labels = tf.constant([batch['label']], dtype=tf.int32)
            with tf.GradientTape() as tape:
                # training=True enables dropout; the loss is computed once,
                # below, instead of also asking the model for it via labels=.
                outputs = model(inputs, training=True)
                loss = loss_fn(labels, outputs.logits)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            total_loss += float(loss.numpy())
            num_steps += 1

        # Report the epoch mean rather than the last example's loss, and do
        # not fail when there was nothing to train on.
        if num_steps == 0:
            print(f"Epoch {epoch + 1}: no training examples")
        else:
            print(f"Epoch {epoch + 1}: Loss = {total_loss / num_steps}")


In [36]:
# Training loop (optional, based on the dataset)
def train_model(training_data, model, tokenizer, num_epochs=3, learning_rate=2e-5):
    """Fine-tune ``model`` on labelled text examples, one example per step.

    NOTE(review): an earlier cell of this notebook also defines train_model;
    this later definition is the one in effect after Run All.

    Args:
        training_data: Iterable of dicts with a ``'text'`` string and an
            integer ``'label'`` (0 for non-key, 1 for key). It is iterated
            once per epoch, so pass a list rather than a one-shot generator.
        model: Classifier called as ``model(input_ids, training=True)`` whose
            result exposes ``logits``.
        tokenizer: Not used here; ``preprocess`` encodes text with the
            module-level ``tokenizer``. Kept so existing calls keep working.
        num_epochs: Number of passes over ``training_data``.
        learning_rate: Adam step size. BERT fine-tuning is normally run with
            small rates (around 2e-5 to 5e-5); Adam's 1e-3 default tends to
            destroy the pretrained weights.
    """
    # NOTE(review): with TF 2.17, tf.keras may resolve to Keras 3 while the
    # model is built on tf-keras; confirm this optimizer accepts its variables.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_steps = 0
        for batch in training_data:
            inputs = preprocess(batch['text'])
            # One label per encoded text, as a tensor so it lines up with the
            # logits of the single example (0 for non-key, 1 for key).
            labels = tf.constant([batch['label']], dtype=tf.int32)
            with tf.GradientTape() as tape:
                # training=True enables dropout; the loss is computed once,
                # below, instead of also asking the model for it via labels=.
                outputs = model(inputs, training=True)
                loss = loss_fn(labels, outputs.logits)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            total_loss += float(loss.numpy())
            num_steps += 1

        # Report the epoch mean rather than the last example's loss, and do
        # not fail when there was nothing to train on.
        if num_steps == 0:
            print(f"Epoch {epoch + 1}: no training examples")
        else:
            print(f"Epoch {epoch + 1}: Loss = {total_loss / num_steps}")


In [37]:
# Evaluation loop (optional, for testing accuracy)
def evaluate_model(testing_data, model, tokenizer):
    """Print support-weighted precision, recall and F1 of ``model`` on labelled data.

    Args:
        testing_data: Iterable of dicts with a ``'text'`` string and a
            ``'label'`` (0 for non-key, 1 for key).
        model: Classifier called as ``model(input_ids)`` whose result exposes
            ``logits``.
        tokenizer: Not used here; ``preprocess`` encodes text with the
            module-level ``tokenizer``. Kept so existing calls keep working.
    """
    y_true = []
    y_pred = []
    for batch in testing_data:
        inputs = preprocess(batch['text'])
        outputs = model(inputs)
        # preprocess encodes a single text, so argmax yields one prediction;
        # store it as a plain int so it compares cleanly with the true label.
        predicted_label = int(tf.argmax(outputs.logits, axis=-1).numpy()[0])
        y_true.append(batch['label'])
        y_pred.append(predicted_label)

    if not y_true:
        print("No test examples to evaluate.")
        return

    # The metric helpers the original code called were never imported, so the
    # weighted scores are computed directly here.
    precision, recall, f1 = _weighted_prf(y_true, y_pred)

    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)


def _weighted_prf(y_true, y_pred):
    """Return (precision, recall, f1), each averaged over classes weighted by true support."""
    total = len(y_true)
    precision = recall = f1 = 0.0
    # Classes that never occur in y_true have zero support and so zero weight.
    for cls in set(y_true):
        support = sum(1 for t in y_true if t == cls)
        predicted = sum(1 for p in y_pred if p == cls)
        hits = sum(1 for t, p in zip(y_true, y_pred) if t == cls and p == cls)
        # A class that was never predicted scores a precision of 0.
        cls_precision = hits / predicted if predicted else 0.0
        cls_recall = hits / support
        denom = cls_precision + cls_recall
        cls_f1 = 2 * cls_precision * cls_recall / denom if denom else 0.0
        weight = support / total
        precision += weight * cls_precision
        recall += weight * cls_recall
        f1 += weight * cls_f1
    return precision, recall, f1

# Identify key sections in the transcript data
def identify_key_sections(transcript_data, model, tokenizer):
    """Collect the transcript segments that ``model`` labels as key content.

    Args:
        transcript_data: Iterable of segment dicts providing ``'text'``,
            ``'offset'`` and ``'duration'``.
        model: Classifier called as ``model(input_ids)`` whose result exposes
            ``logits``.
        tokenizer: Not used here; ``preprocess`` encodes text with the
            module-level ``tokenizer``. Kept so existing calls keep working.

    Returns:
        A list with one dict per segment predicted as label 1, holding
        ``'start_time'`` (the segment offset), ``'end_time'`` (offset plus
        duration) and the original ``'text'``.
    """
    key_sections = []
    for segment in transcript_data:
        text = segment['text']
        outputs = model(preprocess(text))
        # preprocess encodes a single text, so argmax yields one prediction;
        # take it as a plain int instead of comparing an array against 1.
        predicted_label = int(tf.argmax(outputs.logits, axis=-1).numpy()[0])
        if predicted_label == 1:  # 1 represents key content
            start_time = segment['offset']
            key_sections.append({
                "start_time": start_time,
                "end_time": start_time + segment['duration'],
                "text": text
            })
    return key_sections


In [None]:
# Entry point: load both transcript files, then print every segment of
# transcript.json that the model labels as key content.
if __name__ == '__main__':
    # File paths (relative to the notebook's working directory)
    transcript_json_path = 'transcript.json'
    transcript_txt_path = 'transcript.txt'

    # Load data from files
    transcript_data_json = load_transcript_json(transcript_json_path)
    # The plain-text transcript is loaded but not used again in this cell.
    transcript_data_txt = load_transcript_txt(transcript_txt_path)

    # Optional: Prepare labeled training data (you'll need to set this up)
    # Example: train_data = [{'text': 'sample text', 'label': 1}, ...]

    # Optional: Train the model on labeled data (if available)
    # NOTE: the classifier weights are newly initialised when the model is
    # loaded, so with this step left commented out the labels produced below
    # come from an untrained classification head.
    # train_model(train_data, model, tokenizer)

    # Optional: Prepare test data (you'll need to set this up)
    # Example: test_data = [{'text': 'sample test text', 'label': 0}, ...]

    # Optional: Evaluate the model on test data (if available)
    # evaluate_model(test_data, model, tokenizer)

    # Identify key sections in the transcript.json data
    key_sections = identify_key_sections(transcript_data_json, model, tokenizer)

    # Print the identified key sections, one dict per line
    for section in key_sections:
        print(section)