In [None]:
###1

import os
from moviepy.video.io.VideoFileClip import VideoFileClip
from pydub import AudioSegment
import boto3

s3 = boto3.client('s3')
bucket_name = 'my-s3-doc'

def upload_to_s3(local_file_path, s3_key):
    """Upload one local file to the notebook's S3 bucket.

    Args:
        local_file_path: Path of the file on disk to send.
        s3_key: Object key the file is stored under in ``bucket_name``.
    """
    s3.upload_file(Filename=local_file_path, Bucket=bucket_name, Key=s3_key)

def reduce_wav_size(input_file, output_file, sample_rate=22050, bit_depth=16, channels=1):
    """Re-encode a WAV file with a different sample rate, bit depth and channel count.

    Args:
        input_file: Path of the WAV file to read.
        output_file: Path the converted WAV file is written to.
        sample_rate: Target frame rate passed to ``set_frame_rate``.
        bit_depth: Target bits per sample; converted to bytes for ``set_sample_width``.
        channels: Target number of channels passed to ``set_channels``.
    """
    bytes_per_sample = bit_depth // 8
    converted = (
        AudioSegment.from_wav(input_file)
        .set_frame_rate(sample_rate)
        .set_sample_width(bytes_per_sample)
        .set_channels(channels)
    )
    converted.export(output_file, format="wav")

# Source video and the WAV file its audio track is extracted to.
video_path = r'C:\Users\Lenovo\Documents\Project-vs code\Amazon Transcribe\vtt_to_srt\videoplayback (2).mp4'
audio_path = r'4hrsdemo_1.wav'

# Encoding settings handed to write_audiofile.
desired_codec = "pcm_s16le"
desired_bitrate = "32k"

video_clip = VideoFileClip(video_path)
try:
    audio_clip = video_clip.audio
    if audio_clip is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"No audio track found in video: {video_path}")
    audio_clip.write_audiofile(audio_path, codec=desired_codec, bitrate=desired_bitrate)
finally:
    # Always release the clip's reader so the video file is not left open.
    video_clip.close()

audio = AudioSegment.from_wav(audio_path)
total_length = len(audio)  # length in milliseconds, same unit as split_duration

split_duration = 60 * 60 * 1000  # 60 minutes in milliseconds


def _reduce_and_upload(segment, wav_filename):
    """Export `segment` to `wav_filename`, shrink it, and upload the reduced copy.

    The reduced file is named ``reduced_<wav_filename>`` and uploaded under the
    ``splits/`` prefix.
    """
    segment.export(wav_filename, format="wav")
    reduced_filename = f"reduced_{wav_filename}"
    reduce_wav_size(wav_filename, reduced_filename, sample_rate=15000, bit_depth=16, channels=1)
    upload_to_s3(reduced_filename, f"splits/{reduced_filename}")


# Integer millisecond arithmetic is used throughout, and no variable is named
# `min`, so the builtin stays usable in later cells.
if total_length > split_duration:
    # Recordings longer than one hour: one file per full hour, plus a final
    # shorter file for whatever is left over.
    for split_number, start_time in enumerate(range(0, total_length, split_duration), start=1):
        end_time = min(start_time + split_duration, total_length)
        _reduce_and_upload(audio[start_time:end_time], f"split_{split_number}.wav")
else:
    # One hour or less: process the recording as a single file.
    _reduce_and_upload(audio, "full_audio.wav")

In [None]:
###2

import boto3
import time
import urllib
import json
import concurrent.futures
import uuid  # Importing the UUID module

bucket_name = 'my-s3-doc'
transcribe_client = boto3.client('transcribe')
s3_client = boto3.client('s3')

def extract_file_number(file_key, default=1):
    """Return the numeric suffix of an S3 key such as ``splits/reduced_split_3.wav``.

    The number is whatever sits between the last underscore and the first dot
    that follows it.

    Args:
        file_key: Object key (or file name) to inspect.
        default: Value returned when that part of the key is not an integer,
            e.g. for ``splits/reduced_full_audio.wav``, the single file that is
            uploaded when the recording is not split.

    Returns:
        The parsed integer, or ``default`` when the key carries no number.
    """
    suffix = file_key.split('_')[-1].split('.')[0]
    try:
        return int(suffix)
    except ValueError:
        return default

def transcribe_file(file_uri, file_number, transcribe_client, s3_client, output_bucket_name):
    """Transcribe one WAV file and download the resulting SRT subtitles.

    Starts a transcription job with a unique name, polls its status, and on
    completion downloads ``<job_name>.srt`` from the output bucket to a local
    file named ``file<file_number>.srt``.

    Args:
        file_uri: ``s3://`` URI of the WAV file to transcribe.
        file_number: Number used to name the downloaded SRT file.
        transcribe_client: Client used to start and poll the job.
        s3_client: Client used to download the SRT file.
        output_bucket_name: Bucket the job writes its output to.

    Returns:
        The local SRT file name on success; ``None`` when the job failed or did
        not finish within the polling budget.
    """
    job_name = f"TranscriptionJob_{uuid.uuid4()}"  # UUID keeps job names unique
    transcribe_client.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': file_uri},
        MediaFormat='wav',
        LanguageCode='en-US',
        OutputBucketName=output_bucket_name,
        Subtitles={'Formats': ['srt']}
    )

    max_tries = 60
    poll_interval_seconds = 10
    srt_key = f"file{file_number}.srt"  # Naming the SRT file based on the file number

    for attempt in range(1, max_tries + 1):
        job = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
        transcription_job = job['TranscriptionJob']
        job_status = transcription_job['TranscriptionJobStatus']

        if job_status == 'COMPLETED':
            print(f"Job {job_name} is {job_status}. Downloading SRT file...")
            s3_client.download_file(output_bucket_name, f"{job_name}.srt", srt_key)
            print(f"SRT file downloaded: {srt_key}")
            return srt_key

        if job_status == 'FAILED':
            print(f"Job {job_name} failed.")
            failure_reason = transcription_job.get('FailureReason')
            if failure_reason:
                print(f"Failure reason: {failure_reason}")
            return None

        print(f"Waiting for {job_name}. Current status is {job_status}.")
        if attempt < max_tries:
            # No point sleeping after the final status check.
            time.sleep(poll_interval_seconds)

    # Polling budget exhausted: say so instead of returning silently.
    print(f"Timed out waiting for {job_name} after {max_tries} status checks; no SRT file was downloaded.")
    return None

def transcribe_all_files_in_folder(folder_uri, transcribe_client, s3_client, output_bucket_name):
    """Run ``transcribe_file`` concurrently for every object under an S3 folder.

    Args:
        folder_uri: Folder location in the form ``s3://<bucket>/<prefix>``.
        transcribe_client: Client forwarded to ``transcribe_file``.
        s3_client: Client used to list the folder; also forwarded to ``transcribe_file``.
        output_bucket_name: Bucket forwarded to ``transcribe_file`` for job output.
    """
    # "s3://bucket/a/b/" splits into ['s3:', '', 'bucket', 'a', 'b', ''].
    uri_segments = folder_uri.split('/')
    source_bucket = uri_segments[2]
    prefix = '/'.join(uri_segments[3:])

    listing = s3_client.list_objects_v2(Bucket=source_bucket, Prefix=prefix)
    if 'Contents' not in listing:
        print(f"No files found in the folder: {folder_uri}")
        return

    # Drop keys ending in "/" (folder placeholders) and order by file number.
    audio_objects = sorted(
        (entry for entry in listing['Contents'] if not entry['Key'].endswith('/')),
        key=lambda entry: extract_file_number(entry['Key']),
    )

    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(
                transcribe_file,
                f"s3://{source_bucket}/{entry['Key']}",
                extract_file_number(entry['Key']),
                transcribe_client,
                s3_client,
                output_bucket_name,
            )
            for entry in audio_objects
        ]

        # Block until each job's worker has finished, re-raising any error.
        for job in pending:
            job.result()

# Transcribe every object under the `splits/` prefix; the transcription jobs
# write their output to the same bucket the audio was read from.
folder_uri = 's3://my-s3-doc/splits/'
output_bucket_name = bucket_name
transcribe_all_files_in_folder(folder_uri, transcribe_client, s3_client, output_bucket_name)

In [None]:
###3

import os
import re

def add_time_offset(subtitle_content, hours):
    """Shift every ``hh:mm:ss,mmm`` timestamp in the text forward by whole hours.

    Args:
        subtitle_content: Subtitle text containing SRT-style timestamps.
        hours: Number of hours added to the hour field of each timestamp.

    Returns:
        The text with every matched timestamp rewritten; all other characters
        are left as they were.
    """
    # Timestamp format --> hh:mm:ss,mmm
    timestamp_pattern = r"(\d{2}:\d{2}:\d{2},\d{3})"

    def increment_timestamp(match_obj):
        # Only the hour field changes; minutes and "ss,mmm" pass through untouched.
        hour_field, minute_field, seconds_field = match_obj.group(1).split(":")
        hours_offset = int(hour_field) + hours
        return f"{hours_offset:02d}:{minute_field}:{seconds_field}"

    matcher = re.compile(timestamp_pattern)
    return matcher.sub(increment_timestamp, subtitle_content)

def combine_srt_files(directory):
    """Merge the .srt files in `directory` into ``combined_subtitles2.srt``.

    Files are processed in natural (numeric-aware) order, so ``file2.srt``
    comes before ``file10.srt``. The first file keeps its timestamps, the
    second is shifted by one hour, the third by two, and so on. The offset is
    positional, so it assumes every file covers one hour and none is missing.

    A ``combined_subtitles2.srt`` left over from an earlier run is not treated
    as an input, so re-running the function does not fold the old result back in.

    Args:
        directory: Directory that holds the .srt files and receives the output.
    """
    combined_filename = "combined_subtitles2.srt"

    def natural_key(filename):
        # Compare embedded numbers as integers; fall back to the name on ties.
        return [int(chunk) for chunk in re.findall(r"\d+", filename)], filename

    srt_files = sorted(
        (
            filename
            for filename in os.listdir(directory)
            if filename.endswith(".srt") and filename != combined_filename
        ),
        key=natural_key,
    )

    combined_parts = []
    for hour_offset, filename in enumerate(srt_files):
        filepath = os.path.join(directory, filename)
        with open(filepath, "r", encoding="utf-8") as file:
            file_content = file.read()
        combined_parts.append(add_time_offset(file_content, hour_offset) + "\n\n")

    combined_filepath = os.path.join(directory, combined_filename)
    with open(combined_filepath, "w", encoding="utf-8") as combined_file:
        combined_file.write("".join(combined_parts))

    print("Combined subtitles saved to:", combined_filepath)

# Example usage: merge the .srt files found in the local "man" directory.
directory_path = "man"
combine_srt_files(directory_path)

In [1]:
###4

def parse_srt(filename):
    """Parse an SRT subtitle file into a list of cue dictionaries.

    Each cue is a dict with ``index`` (int), ``start`` and ``end`` (timestamp
    strings) and, when the cue has text, ``text`` (list of stripped lines).

    A digits-only line starts a new cue only when the line right after it is a
    timing line (contains ``-->``). Digits-only subtitle text such as ``2024``
    therefore stays in the text of the current cue. Non-blank lines that appear
    before the first cue are ignored.

    Args:
        filename: Path of the SRT file; read as UTF-8, with or without a BOM.

    Returns:
        The cues in file order.
    """
    # utf-8-sig reads plain UTF-8 and also strips a leading byte-order mark.
    with open(filename, 'r', encoding='utf-8-sig') as file:
        lines = [raw_line.strip() for raw_line in file]

    subs = []
    sub = None
    for position, line in enumerate(lines):
        if not line:
            continue

        following = lines[position + 1] if position + 1 < len(lines) else ''
        if line.isdigit() and '-->' in following:
            # Cue number: close the previous cue and open a new one.
            if sub:
                subs.append(sub)
            sub = {'index': int(line)}
        elif sub is None:
            # Stray content before the first cue has nowhere to go.
            continue
        elif '-->' in line:
            timestamps = line.split('-->')
            sub['start'] = timestamps[0].strip()
            sub['end'] = timestamps[1].strip()
        else:
            sub.setdefault('text', []).append(line)

    if sub:
        subs.append(sub)

    return subs

def print_srt_with_metadata(subs):
    """Print each cue's index, start/end timestamps and text lines.

    Args:
        subs: Cue dictionaries with ``index``, ``start`` and ``end`` keys and an
            optional ``text`` list. A cue without ``text`` prints an empty
            text section instead of raising ``KeyError``.
    """
    for sub in subs:
        print(f"Index: {sub['index']}")
        print(f"Start: {sub['start']}, End: {sub['end']}")
        print("Text:")
        # 'text' is absent for cues that had no text lines.
        for line in sub.get('text', []):
            print(line)
        print()

# NOTE(review): the path is empty in this copy of the notebook, and parse_srt
# opens it directly; set it to the SRT file to query before running this cell.
filename = r''
subs = parse_srt(filename)
# print_srt_with_metadata(subs)

In [2]:
###5

from langchain.prompts import PromptTemplate
from langchain.llms.bedrock import Bedrock
from langchain.chains import LLMChain
import os 
import boto3

# Bedrock runtime client used by the LangChain LLM wrapper below.
bedrock_client = boto3.client(service_name="bedrock-runtime", region_name="us-east-1")

# Prompt: a fixed instruction list, then the parsed subtitles ({datas}) and
# the user's question ({question}).
template = """
You have provided with the datas, you need to print the datas along with the metadata, like in which time the text is available
Instruction:
1.The text contains of the metadata
2.You have to answer to the question asked with in which time frame the answer was found in the given text
3.You have to print the timeframe along with the text datas in which the text are taken from
4.Generate answers only from the provided text
5.Must mention timestamp for all the typea of answers
6.I don't want only index ,i need timestamp from where the answer was generated from
7.Must print only the answer shortly, don't mention unwanted texts.
8.Generate answers only for asked question and time frame ,do not generate extra answers
{datas}
{question}
"""
qa_prompt = PromptTemplate(
    input_variables=["datas", "question"],
    template=template,
)

# Near-zero temperature and the token limit are passed through as model kwargs.
llm = Bedrock(
    client=bedrock_client,
    model_id="anthropic.claude-v2:1",
    model_kwargs={"temperature": 1e-10, "max_tokens_to_sample": 8191},
)

llm_chain = LLMChain(llm=llm, prompt=qa_prompt, verbose=False)

# `subs` is the cue list produced by parse_srt in the previous cell.
result = llm_chain.run(question="what is the name of the academy", datas=subs)
print(result)

 The text mentions "Draft Academy" multiple times. Specifically:

00:27:29,989 - 00:27:54,689: 'Now, inside of the quotation mark, I can put whatever text I want the string to have. So we could say like Draft Academy.'

00:29:39,959 - 00:29:54,689: 'and I can just name this draft academy'

So the name of the academy mentioned in the text is "Draft Academy".


In [3]:
# Ask a second question through the same chain and subtitle data.
result = llm_chain.run(datas= subs,question = "what are they talking in first 10 minutes")
print(result)

 Here are the main topics covered in the first 10 minutes of the transcript:

00:00:00 to 00:03:53 - The instructor introduces the Python programming course, discusses installing Python and an IDE, and creates a first simple Python program that prints "Hello World".

00:03:53 to 00:06:43 - The instructor shows how to install Python and the PyCharm IDE on your computer.

00:06:43 to 00:10:30 - The instructor explains how a basic Python program runs by printing out a triangle shape using print statements. Topics covered include print statements, executing code line-by-line, and order of operations.

00:10:30 to 00:15:06 - The instructor introduces variables in Python for storing data, including strings, numbers, and booleans. He explains declaring variables, assigning values, modifying variables, and using variables in print statements.
