# EXTRACTIVE SUMMARIZATION

#### Dependencies

* To avoid dependency issues, install the following versions:

Python = 3.6.9 <br>
torch==1.7.0 <br> 
spacy==2.3.1 <br>
bert-extractive-summarizer <br>

In [1]:
import os
import re
import traceback

from summarizer import Summarizer

### Parsing SRT files

In [2]:
def subtitle_to_textblob(subtitle_file):
    """Parse an SRT subtitle file into parallel lists of caption text and timing lines.

    Subtitle blocks are taken to be separated by blank lines. Within a block
    the first non-empty line (the sequence number) is discarded, the second is
    stored verbatim as the timing line, and all remaining lines are joined with
    a single space to form the caption text. A caption that spans several
    lines therefore stays attached to its own timing line instead of shifting
    every following block out of step.

    Blocks with fewer than three non-empty lines (i.e. without caption text)
    are skipped entirely, so both returned lists always have the same length.

    Args:
        subtitle_file: Path of the SRT file to read.

    Returns:
        A tuple ``(input_text_list, input_times_list)``; element ``i`` of each
        list comes from the same subtitle block.
    """
    input_text_list = []
    input_times_list = []
    block = []

    def flush():
        # A usable block needs a sequence number, a timing line and some text.
        if len(block) >= 3:
            input_times_list.append(block[1])
            input_text_list.append(' '.join(block[2:]))
        del block[:]

    with open(subtitle_file, 'r') as fp:
        for raw_line in fp:
            line = raw_line.strip()
            if line:
                block.append(line)
            else:
                # A blank line terminates the current block.
                flush()

    # The final block is usually not followed by a blank line.
    flush()

    return input_text_list, input_times_list

In [3]:
def extractive_summarization(input_text, num_sentences, debug=False):
    """Summarize ``input_text`` with a freshly created ``Summarizer`` model.

    Args:
        input_text: The full text to summarize.
        num_sentences: Nominal summary size; the model is asked for
            ``num_sentences - 1`` sentences.
        debug: When true, print the summary between banner lines.

    Returns:
        Whatever the ``Summarizer`` call returns for the given text.
    """
    summarizer = Summarizer()

    # NOTE(review): one fewer sentence is requested than the caller asked for,
    # presumably because the summarizer adds the first sentence on its own
    # -- confirm against the library before changing.
    requested = num_sentences - 1
    summary = summarizer(input_text, num_sentences=requested)

    if not debug:
        return summary

    print('----------------- TOP',str(num_sentences),'SENTENCES -----------------')
    print(summary)
    print('----------------------------------------------------')
    return summary

In [4]:
def extracted_text_to_output(input_text, output_text, output_file, input_times_list, time_delimiter):
    """Write every summary sentence to ``output_file`` with its start and end time.

    ``output_text`` is split into sentences on ``.``, ``!``, ``?`` and
    newlines. Each non-empty sentence is located in ``input_text`` (first
    occurrence), the number of whitespace-separated words preceding it is used
    as an index into ``input_times_list``, and one line of the form
    ``<start><delimiter><end><delimiter><sentence>`` is written.

    Because a word index is used to index ``input_times_list``, the timestamps
    are only meaningful when that list holds one timing entry per word of
    ``input_text``.

    Sentences that cannot be found in ``input_text``, or whose word range
    falls outside ``input_times_list``, are reported and skipped instead of
    aborting the remaining sentences.

    Args:
        input_text: The full transcript as a single string.
        output_text: The summary text whose sentences should be time-stamped.
        output_file: Path of the file to (over)write.
        input_times_list: Timing lines, each containing ``time_delimiter``
            between its start and end time.
        time_delimiter: Separator between start and end time, also used as
            the field separator in the output file.

    Returns:
        None. Any other exception is printed with its traceback, not raised.
    """
    try:
        extracted_sentences = re.split(r'[.!?\n]\s*', output_text.strip())
        print('Size of times list: ', len(input_times_list), ':Num Sentences = ', len(extracted_sentences))

        num_times = len(input_times_list)
        count = 0
        with open(output_file, 'w+') as fp:
            for sentence in extracted_sentences:
                sentence = sentence.strip()
                if not sentence:
                    continue

                # str.find returns -1 when there is no match; slicing with -1
                # would silently produce a wrong word index, so skip instead.
                start_char_index = input_text.find(sentence)
                if start_char_index < 0:
                    print('Skipping sentence not found in input text: ', sentence)
                    continue

                # Words before the match give the index of the first word.
                start_word_index = len(input_text[:start_char_index].split())
                end_word_index = start_word_index + len(sentence.split()) - 1
                if end_word_index >= num_times:
                    print('Skipping sentence without matching timestamps: ', sentence)
                    continue

                start_ip_time = input_times_list[start_word_index].split(time_delimiter)[0].strip()
                end_ip_time = input_times_list[end_word_index].split(time_delimiter)[1].strip()

                fp.write(start_ip_time + time_delimiter + end_ip_time + time_delimiter + sentence + '\n')
                count += 1
                print(count, end=' ')
    except Exception:
        print('Exception in extracted_text_to_output()')
        traceback.print_exc()

## Run this if the output text from extractive summarization is already available on disk

In [5]:
def extract_from_output_text(input_file, output_file, output_text_file, num_sentences):
    """Time-stamp an already generated summary against its source SRT file.

    Reads the summary text from ``output_text_file``, parses ``input_file``
    with ``subtitle_to_textblob`` and hands both to
    ``extracted_text_to_output``, which writes ``output_file``.

    Args:
        input_file: Path of the source SRT file.
        output_file: Path of the time-stamped summary to write.
        output_text_file: Path of the file holding the summary text.
        num_sentences: Not used by this function; kept so existing callers
            continue to work.

    Returns:
        None. Exceptions are printed with their traceback, not raised.
    """
    try:
        time_delimiter = '-->'

        # Read the summary first so a missing file fails before the
        # subtitle file is parsed.
        with open(output_text_file, 'r') as fp:
            output_text = fp.read()

        input_text_list, input_times_list = subtitle_to_textblob(input_file)
        input_text = ' '.join(input_text_list)

        extracted_text_to_output(input_text, output_text, output_file, input_times_list, time_delimiter)
        print('Output with timestamps written to ', output_file)

    except Exception:
        print('Exception in extract_from_output_text()')
        traceback.print_exc()

### Example to run 'extract_from_output_text()'

In [8]:
# Summary sizes to post-process; each one needs a matching
# '<base>_optext_<n>.txt' file on disk.
num_sentences_list = [5,10,15,20,25,30,35,40,45,50,60,70,80,90,100]
input_file = 'data/podcast__transcription_test.srt'

# Strip only the extension: splitting on the first '.' would truncate paths
# that contain a dot earlier on (e.g. './data/x.srt').
input_base = os.path.splitext(input_file)[0]

for num_sentences in num_sentences_list:
    output_text_file = input_base + '_optext_' + str(num_sentences) + '.txt'
    output_file = input_base + '_op_' + str(num_sentences) + '.txt'

    extract_from_output_text(input_file, output_file, output_text_file, num_sentences)

Exception in extract()
Size of times list:  17425 :Num Sentences =  11
1 2 3 4 5 6 7 8 9 10 Output with timestamps written to  data/podcast__transcription_test_op_10.txt
Size of times list:  17425 :Num Sentences =  16
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 Output with timestamps written to  data/podcast__transcription_test_op_15.txt
Size of times list:  17425 :Num Sentences =  24
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 Output with timestamps written to  data/podcast__transcription_test_op_20.txt
Size of times list:  17425 :Num Sentences =  27
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 Output with timestamps written to  data/podcast__transcription_test_op_25.txt
Size of times list:  17425 :Num Sentences =  33
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 Output with timestamps written to  data/podcast__transcription_test_op_30.txt
Size of times list:  17425 :Num Sentences =  37
1 2 3 4 5 6 7 8 9 10 11 12

Traceback (most recent call last):
  File "<ipython-input-5-407fc54bafa3>", line 12, in extract_from_output_text
    with open(output_text_file, 'r') as fp:
FileNotFoundError: [Errno 2] No such file or directory: 'data/podcast__transcription_test_optext_5.txt'


 45 46 47 Output with timestamps written to  data/podcast__transcription_test_op_45.txt
Size of times list:  17425 :Num Sentences =  51
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 Output with timestamps written to  data/podcast__transcription_test_op_50.txt
Size of times list:  17425 :Num Sentences =  61
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 Output with timestamps written to  data/podcast__transcription_test_op_60.txt
Size of times list:  17425 :Num Sentences =  71
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 Output with timestamps written to  data/podcast__transcription_test_op_70.txt
Size of times list:  

## Run Extractive summarization

In [None]:
def extract_from_srt(input_file, output_file, num_sentences, output_text_file=None):
    """Summarize an SRT file and write the summary with timestamps.

    The subtitle file is parsed with ``subtitle_to_textblob``, summarized with
    ``extractive_summarization`` (debug output enabled) and the time-stamped
    sentences are written to ``output_file`` by ``extracted_text_to_output``.

    The raw summary text is saved to a separate file. It used to be written to
    ``output_file``, which ``extracted_text_to_output`` truncates right
    afterwards, so the summary was always lost.

    Args:
        input_file: Path of the source SRT file.
        output_file: Path of the time-stamped summary to write.
        num_sentences: Summary size passed on to ``extractive_summarization``.
        output_text_file: Optional path for the raw summary text. Defaults to
            ``<input base>_optext_<num_sentences>.txt``.

    Returns:
        None. Exceptions are printed with their traceback, not raised.
    """
    try:
        time_delimiter = '-->'

        if output_text_file is None:
            output_text_file = (
                os.path.splitext(input_file)[0] + '_optext_' + str(num_sentences) + '.txt'
            )

        input_text_list, input_times_list = subtitle_to_textblob(input_file)
        input_text = ' '.join(input_text_list)

        output_text = extractive_summarization(input_text, num_sentences, True)

        # Keep the raw summary so it can be re-processed without re-running
        # the model.
        with open(output_text_file, 'w') as fp:
            fp.write(output_text)

        extracted_text_to_output(input_text, output_text, output_file, input_times_list, time_delimiter)
    except Exception:
        print('Exception in extract_from_srt()')
        traceback.print_exc()

### Example to run 'extract_from_srt()'

In [None]:
# Summary sizes to generate; one time-stamped output file is written per size.
num_sentences_list = [10,15,20,25,30,35,40,45,50,60,70,80,90,100]

ip_file = 'data/podcast__transcription_test.srt'

# Strip only the extension: splitting on the first '.' would truncate paths
# that contain a dot earlier on (e.g. './data/x.srt').
ip_base = os.path.splitext(ip_file)[0]

for num_sentences in num_sentences_list:
    op_file = ip_base + '_op_' + str(num_sentences) + '.txt'
    extract_from_srt(ip_file, op_file, num_sentences)

### Sample: run a single iteration for one value of num_sentences

In [None]:
# Disabled example: the code below is wrapped in a string literal, so running
# this cell does not call extract_from_srt(). Remove the triple quotes to
# summarize a single file with one value of num_sentences.
'''
ip_file = 'data/AE_Shopify Walkthrough 1.srt'
op_file = 'data/AE_Shopify Walkthrough 1_op.txt'
num_sentences = 10

extract_from_srt(ip_file, op_file, num_sentences)
'''

In [10]:
# Compare two time-stamped summaries of the same transcript and report how
# many (start, end) time ranges they have in common.
ip_file = 'data/podcast__transcription_test.srt'

op_1 = 25
op_2 = 20

delimiter = '-->'

# Build both paths from ip_file (defined in this cell) rather than from a
# variable left over from another cell, and strip only the file extension.
ip_base = os.path.splitext(ip_file)[0]
output_file1 = ip_base + '_op_' + str(op_1) + '.txt'
output_file2 = ip_base + '_op_' + str(op_2) + '.txt'


def _read_time_ranges(path, delimiter):
    """Return one 'start end' string per well-formed line of ``path``."""
    ranges = []
    with open(path, 'r') as fp:
        for line in fp:
            times = line.split(delimiter)
            # Skip blank or malformed lines that lack both timestamps.
            if len(times) < 2:
                continue
            ranges.append(times[0] + ' ' + times[1])
    return ranges


list1 = _read_time_ranges(output_file1, delimiter)
list2 = _read_time_ranges(output_file2, delimiter)

print('list1 length: ', len(list1))
print('list2 length: ', len(list2))
print('Common elements', len(set(list1).intersection(list2)))

list1 length:  26
list2 length:  23
Common elements 9


In [None]:
# Parse the test transcript and print it as one space-joined string for
# manual inspection.
input_text_list, input_times_list = subtitle_to_textblob(
    'data/podcast__transcription_test.srt'
)
input_text = str.join(' ', input_text_list)
print(input_text)