# EXTRACTIVE SUMMARIZATION

#### Dependencies

* To avoid dependency issues, install the following versions:

Python = 3.6.9 <br>
torch==1.7.0 <br> 
spacy==2.3.1 <br>
bert-extractive-summarizer <br>

In [54]:
from summarizer import Summarizer
import traceback 
import re

### Parsing SRT files

In [7]:
def subtitle_to_textblob(subtitle_file, time_delimiter='-->'):
    """Parse an SRT-style subtitle file into parallel text and timing lists.

    Timing lines (those containing ``time_delimiter``) are used as anchors:
    the non-blank line directly before a timing line is taken to be the cue's
    sequence number and is discarded, and every non-blank line between a
    timing line and the next cue's sequence number is that cue's text.
    A cue whose text spans several lines is joined with single spaces, so a
    multi-line cue no longer shifts all the cues that follow it.

    Cues that have a timing line but no text are skipped entirely, which
    keeps both returned lists the same length.

    Args:
        subtitle_file: Path of the subtitle file to read.
        time_delimiter: Marker that identifies a timing line.

    Returns:
        A tuple ``(input_text_list, input_times_list)`` where entry ``i`` of
        the first list is the text of a cue and entry ``i`` of the second
        list is the (stripped) timing line of the same cue.
    """
    with open(subtitle_file, 'r') as fp:
        stripped_lines = [line.strip() for line in fp]
    # Blank lines carry no information once timing lines act as anchors.
    lines = [line for line in stripped_lines if line]

    timing_positions = [pos for pos, line in enumerate(lines)
                        if time_delimiter in line]

    input_text_list = list()
    input_times_list = list()

    for cue_number, pos in enumerate(timing_positions):
        if cue_number + 1 < len(timing_positions):
            # The line right before the next timing line is the next cue's
            # sequence number, not text of the current cue.
            text_end = timing_positions[cue_number + 1] - 1
        else:
            text_end = len(lines)

        cue_text = ' '.join(lines[pos + 1:text_end])
        if not cue_text:
            continue

        input_times_list.append(lines[pos])
        input_text_list.append(cue_text)

    return input_text_list, input_times_list

In [8]:
def extractive_summarization(input_text, num_sentences, debug=False):
    """Run the ``Summarizer`` model over ``input_text`` and return its output.

    Args:
        input_text: Text to summarize.
        num_sentences: Summary size; the model is called with
            ``num_sentences - 1``.
            NOTE(review): presumably this compensates for the summarizer
            returning one extra sentence -- confirm against the library.
        debug: When true, print the summary between two banner lines.

    Returns:
        Whatever the ``Summarizer`` call returns for the input text.
    """
    # A fresh model is built on every call.
    summarizer = Summarizer()
    requested = num_sentences - 1
    summary = summarizer(input_text, num_sentences=requested)

    if not debug:
        return summary

    print('----------------- TOP', str(num_sentences), 'SENTENCES -----------------')
    print(summary)
    print('----------------------------------------------------')
    return summary

In [74]:
def extracted_text_to_output(input_text, output_text, output_file, input_times_list, time_delimiter):
    """Write each summary sentence to ``output_file`` with its time range.

    ``output_text`` is split on ``.``, ``!``, ``?`` and newlines. Each
    non-empty piece is located in ``input_text``; the word position of the
    match is used as an index into ``input_times_list``, so the function
    relies on entry ``i`` of ``input_times_list`` being the timing of word
    ``i`` of ``input_text``. One line is written per located piece::

        <start><time_delimiter><end><time_delimiter><sentence>

    Pieces are searched for after the end of the previous match first, so a
    phrase that occurs more than once is not always attributed to its first
    occurrence; if nothing is found ahead, the whole text is searched.
    Pieces that cannot be found, or whose word range falls outside
    ``input_times_list``, are reported and skipped instead of producing
    wrong timestamps or aborting the remaining pieces.

    Args:
        input_text: Full transcript text.
        output_text: Summary text whose sentences should be timestamped.
        output_file: Path of the file to (over)write.
        input_times_list: Timing strings of the form
            ``start<time_delimiter>end``.
        time_delimiter: Separator used inside the timing strings and in the
            written lines.

    Any other exception is printed with its traceback and swallowed.
    """
    try:
        extracted_sentences = re.split(r'[.!?\n]\s*', output_text.strip())
        print('Size of times list: ', len(input_times_list), ':Num Sentences = ', len(extracted_sentences))

        count = 0
        search_from = 0
        with open(output_file, 'w+') as fp:

            for sentence in extracted_sentences:

                sentence = sentence.strip()
                if not sentence:
                    continue

                # Prefer a match after the previous one; fall back to the
                # whole text in case the pieces are not in document order.
                start_char_index = input_text.find(sentence, search_from)
                if start_char_index == -1:
                    start_char_index = input_text.find(sentence)
                if start_char_index == -1:
                    # A -1 used as a slice bound would silently map the
                    # sentence to the end of the transcript.
                    print('Skipping sentence not found in input text:', sentence)
                    continue
                search_from = max(search_from, start_char_index + len(sentence))

                start_word_index = len(input_text[:start_char_index].split())
                if start_char_index > 0 and not input_text[start_char_index - 1].isspace():
                    # The match starts inside a token, which the prefix
                    # split already counted as a complete word.
                    start_word_index -= 1
                end_word_index = start_word_index + len(sentence.split()) - 1

                if end_word_index >= len(input_times_list):
                    print('Skipping sentence without matching timestamps:', sentence)
                    continue

                start_ip_time = input_times_list[start_word_index].split(time_delimiter)[0].strip()
                end_ip_time = input_times_list[end_word_index].split(time_delimiter)[1].strip()

                fp.write(start_ip_time + time_delimiter + end_ip_time + time_delimiter + sentence + '\n')
                count += 1
                print(count, end=' ')
    except Exception:
        print('Exception in extracted_text_to_output()')
        traceback.print_exc()

## Run this if the output text from extractive summarization has already been saved to a file

In [39]:
def extract_from_output_text(input_file, output_file, output_text_file, num_sentences):
    """Timestamp an already generated summary read from ``output_text_file``.

    The subtitle file is parsed with ``subtitle_to_textblob``, its cue texts
    are joined with single spaces, and the summary text read from
    ``output_text_file`` is passed to ``extracted_text_to_output`` together
    with the cue timings.

    Args:
        input_file: Path of the subtitle file.
        output_file: Path of the timestamped output file to write.
        output_text_file: Path of the file holding the summary text.
        num_sentences: Not used by this function; kept so existing callers
            keep working.

    Exceptions are printed with their traceback and swallowed.
    """
    try:
        time_delimiter = '-->'

        input_text_list, input_times_list = subtitle_to_textblob(input_file)
        input_text = ' '.join(input_text_list)

        with open(output_text_file, 'r') as fp:
            output_text = fp.read()

        extracted_text_to_output(input_text, output_text, output_file, input_times_list, time_delimiter)
        print('Output with timestamps written to ', output_file)

    except Exception:
        # Name the real function so the log points at the right place.
        print('Exception in extract_from_output_text()')
        traceback.print_exc()

### Example to run 'extract_from_output_text()'

In [75]:
# Timestamp the previously saved summary of every size in the list.
num_sentences_list = [15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]
input_file = 'data/podcast__transcription_test.srt'

for num_sentences in num_sentences_list:
    # Everything before the first '.' of the input path is the common prefix.
    output_text_file = ''.join([input_file.split('.')[0], '_optext_', str(num_sentences), '.txt'])
    output_file = ''.join([input_file.split('.')[0], '_op_', str(num_sentences), '.txt'])

    extract_from_output_text(input_file, output_file, output_text_file, num_sentences)

Size of times list:  17425 :Num Sentences =  16
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 Output with timestamps written to  data/podcast__transcription_test_op_15.txt
Size of times list:  17425 :Num Sentences =  24
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 Output with timestamps written to  data/podcast__transcription_test_op_20.txt
Size of times list:  17425 :Num Sentences =  27
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 Output with timestamps written to  data/podcast__transcription_test_op_25.txt
Size of times list:  17425 :Num Sentences =  33
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 Output with timestamps written to  data/podcast__transcription_test_op_30.txt
Size of times list:  17425 :Num Sentences =  37
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 Output with timestamps written to  data/podcast__transcription_test_op_35.txt
Size of times list:

## Run Extractive summarization

In [18]:
def extract_from_srt(input_file, output_file, num_sentences, output_text_file=None):
    """Summarize a subtitle file and write the summary with timestamps.

    The subtitle file is parsed with ``subtitle_to_textblob``, the joined cue
    text is summarized with ``extractive_summarization`` (debug output on),
    the raw summary is saved to ``output_text_file``, and the timestamped
    sentences are written to ``output_file`` by ``extracted_text_to_output``.

    The raw summary used to be written to ``output_file`` itself, where it
    was overwritten straight away by the timestamped output. It now goes to
    its own file so it can be reused without running the model again.

    Args:
        input_file: Path of the subtitle file.
        output_file: Path of the timestamped output file to write.
        num_sentences: Summary size passed to ``extractive_summarization``.
        output_text_file: Path for the raw summary text. Defaults to
            ``<input path without extension>_optext_<num_sentences>.txt``.

    Exceptions are printed with their traceback and swallowed.
    """
    import os

    try:
        time_delimiter = '-->'

        input_text_list, input_times_list = subtitle_to_textblob(input_file)
        input_text = ' '.join(input_text_list)

        output_text = extractive_summarization(input_text, num_sentences, True)

        if output_text_file is None:
            output_text_file = (os.path.splitext(input_file)[0]
                                + '_optext_' + str(num_sentences) + '.txt')

        with open(output_text_file, 'w') as fp:
            fp.write(output_text)

        extracted_text_to_output(input_text, output_text, output_file, input_times_list, time_delimiter)
    except Exception:
        # Name the real function so the log points at the right place.
        print('Exception in extract_from_srt()')
        traceback.print_exc()

### Example to run 'extract_from_srt()'

In [None]:
# Summarize and timestamp the transcript once for every summary size.
num_sentences_list = [10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]

ip_file = 'data/podcast__transcription_test.srt'

for num_sentences in num_sentences_list:
    # Output name: input path up to its first '.', plus the summary size.
    op_file = ''.join([ip_file.split('.')[0], '_op_', str(num_sentences), '.txt'])
    extract_from_srt(ip_file, op_file, num_sentences)

### Sample to run single iteration over num_sentences

In [19]:
# Disabled example: the statements below sit inside a string literal, so
# nothing here is executed. Remove the surrounding triple quotes to run
# extract_from_srt() once for a single num_sentences value.
'''
ip_file = 'data/AE_Shopify Walkthrough 1.srt'
op_file = 'data/AE_Shopify Walkthrough 1_op.txt'
num_sentences = 10

extract_from_srt(ip_file, op_file, num_sentences)
'''

"\nip_file = 'data/AE_Shopify Walkthrough 1.srt'\nop_file = 'data/AE_Shopify Walkthrough 1_op.txt'\nnum_sentences = 10\n\nextract(ip_file, op_file, num_sentences)\n"

In [67]:
# Compare the time ranges selected for two different summary sizes.
ip_file = 'data/podcast__transcription_test.srt'

op_1 = 15
op_2 = 10

delimiter = '-->'


def _read_time_ranges(path, delimiter):
    """Return the 'start end' string of every line in a timestamped output file.

    Lines that do not contain at least two ``delimiter``-separated fields
    (for example blank lines) are skipped.
    """
    ranges = list()
    with open(path, 'r') as fp:
        for line in fp:
            times = line.split(delimiter)
            if len(times) < 2:
                continue
            ranges.append(times[0] + ' ' + times[1])
    return ranges


# Build both paths from ip_file defined in this cell rather than from a
# variable left behind by an earlier cell.
output_file1 = ip_file.split('.')[0] + '_op_' + str(op_1) + '.txt'
output_file2 = ip_file.split('.')[0] + '_op_' + str(op_2) + '.txt'

list1 = _read_time_ranges(output_file1, delimiter)
list2 = _read_time_ranges(output_file2, delimiter)

print('list1 length: ', len(list1))
print('list2 length: ', len(list2))
print('Common elements', len(set(list1).intersection(list2)))

list1 length:  8
list2 length:  10
Common elements 4


In [68]:
# Parse the transcript again and print its cue texts as one space-joined string.
parsed_subtitles = subtitle_to_textblob('data/podcast__transcription_test.srt')
input_text_list, input_times_list = parsed_subtitles
input_text = ' '.join(input_text_list)
print(input_text)

At some point, we tried to raise more money and you know, the investor was like the entire natural deodorant industry is something like $30 million a year. So why would anyone be interested in investing in a category that only has $30 million a year run rates? And I was like, if the natural deodorant industry is $30 million a year where the entire natural deodorant, we're doing $30 million a year at this point, Okay. I got to say it. This was my favorite episode of the podcast. What you're about to hear is a episode I just recorded with Moise Ali. He is the founder of native deodorant, which is a natural deodorant brand. It's basically a $12 stick of deodorant. My wife just bought it. She doesn't know that I know Moise. She bought it because she didn't want to use deodorant that had aluminum, it parabens all kinds of funky chemicals that you see on the back of a deodorant stick. So this is the story of how he started it from his brother's dining room table, essentially, where he was do