# EXTRACTIVE SUMMARIZATION

#### Dependencies

* To avoid dependency issues, install the following versions

Python = 3.6.9 <br>
torch==1.7.0 <br> 
spacy==2.3.1 <br>
bert-extractive-summarizer <br>

In [54]:
from summarizer import Summarizer
import traceback 
import re

### Parsing SRT files

In [7]:
def subtitle_to_textblob(subtitle_file):
    """Split a subtitle file into parallel lists of text and timing lines.

    Blank lines are discarded. The remaining lines are consumed in repeating
    groups of three: the first line of each group is skipped (the entry
    number in a standard SRT file), the second is stored as the timing line
    and the third as the text line. Each entry is therefore expected to carry
    exactly one line of text.

    Args:
        subtitle_file: Path of the subtitle file to read.

    Returns:
        A tuple ``(input_text_list, input_times_list)`` holding the stripped
        text lines and the stripped timing lines, in file order.
    """
    with open(subtitle_file, 'r') as fp:
        stripped_lines = [raw_line.strip() for raw_line in fp]

    # Only non-blank lines take part in the three-line grouping.
    content_lines = [entry for entry in stripped_lines if entry]

    # Position 1 of every group is the timing line, position 2 the text line;
    # position 0 is ignored.
    input_times_list = content_lines[1::3]
    input_text_list = content_lines[2::3]
    return input_text_list, input_times_list

In [8]:
def extractive_summarization(input_text, num_sentences, debug=False):
    """Summarize ``input_text`` with a freshly constructed ``Summarizer``.

    Args:
        input_text: Text to summarize.
        num_sentences: Requested summary size. The model is called with
            ``num_sentences - 1``.
            NOTE(review): the reason for the ``- 1`` offset is not visible
            here; confirm against the summarizer's behaviour.
        debug: When truthy, print the summary between banner lines.

    Returns:
        Whatever the summarizer model returns for ``input_text``.
    """
    summarizer = Summarizer()
    summary = summarizer(input_text, num_sentences=num_sentences - 1)

    if not debug:
        return summary

    print('----------------- TOP', str(num_sentences), 'SENTENCES -----------------')
    print(summary)
    print('----------------------------------------------------')
    return summary

In [61]:
def extracted_text_to_output(input_text, output_text, output_file, input_times_list, time_delimiter):
    """Write each extracted sentence with its start and end timestamps.

    ``output_text`` is split into sentences on ``.``, ``!``, ``?`` and
    newlines. Each non-empty sentence is located in ``input_text``; the number
    of words preceding it gives the index of its first timing line and the
    sentence's own word count gives the index of its last one. One line per
    sentence is written to ``output_file`` in the form
    ``<start><time_delimiter><end><time_delimiter><sentence>``.

    NOTE(review): ``input_times_list`` is indexed by word position, so this
    presumably expects one timing entry per word of ``input_text`` - confirm
    against the subtitle files in use.

    Args:
        input_text: Full text the sentences were extracted from.
        output_text: Extracted (summary) text.
        output_file: Path of the file to write; it is overwritten.
        input_times_list: Timing lines, each holding a start and an end time
            separated by ``time_delimiter``.
        time_delimiter: Separator between start and end time, also used as
            the field separator in the output file.

    Returns:
        None. Errors are reported on stdout with a traceback instead of being
        raised.
    """
    try:
        extracted_sentences = re.split(r'[.!?\n]\s*', output_text.strip())

        # Write to the path passed in by the caller (not a notebook global).
        with open(output_file, 'w') as fp:

            for sentence in extracted_sentences:

                sentence = sentence.strip()
                if not sentence:
                    continue

                start_char_index = input_text.find(sentence)
                if start_char_index == -1:
                    # find() signals "not found" with -1; slicing with it
                    # would silently yield an unrelated word index.
                    print('Sentence not found in input text, skipping: ', sentence)
                    continue

                # Words before the sentence == index of its first word.
                start_word_index = len(input_text[:start_char_index].split())
                end_word_index = start_word_index + len(sentence.split()) - 1

                start_ip_time = input_times_list[start_word_index].split(time_delimiter)[0].strip()
                end_ip_time = input_times_list[end_word_index].split(time_delimiter)[1].strip()

                fp.write(start_ip_time + time_delimiter + end_ip_time + time_delimiter + sentence + '\n')
    except Exception:
        # Best-effort reporting; KeyboardInterrupt/SystemExit still propagate.
        print('Exception in extracted_text_to_output()')
        traceback.print_exc()

In [39]:
def extract_from_output_text(input_file, output_file, output_text_file, num_sentences):
    """Attach timestamps to a previously saved summary text file.

    Parses ``input_file`` with ``subtitle_to_textblob``, joins its text lines
    with single spaces, reads the summary from ``output_text_file`` and passes
    both to ``extracted_text_to_output`` using ``'-->'`` as the time
    delimiter.

    Args:
        input_file: Path of the subtitle file.
        output_file: Path handed to ``extracted_text_to_output`` as the
            destination for the timestamped sentences.
        output_text_file: Path of the file containing the summary text.
        num_sentences: Not used by this function; kept so existing callers
            keep working.

    Returns:
        None. Errors are reported on stdout with a traceback instead of being
        raised.
    """
    try:
        time_delimiter = '-->'

        input_text_list, input_times_list = subtitle_to_textblob(input_file)
        input_text = ' '.join(input_text_list)

        with open(output_text_file, 'r') as fp:
            output_text = fp.read()

        extracted_text_to_output(input_text, output_text, output_file, input_times_list, time_delimiter)
        print('Output with timestamps written to ', output_file)

    except Exception:
        # Best-effort reporting; KeyboardInterrupt/SystemExit still propagate.
        print('Exception in extract_from_output_text()')
        traceback.print_exc()

### Example to run 'extract_from_output_text()'

In [62]:
# Summary sizes for which a saved summary text file is turned into a
# timestamped output file.
num_sentences_list = [
    15, 20, 25, 30, 35, 40, 45, 50,
    60, 70, 80, 90, 100,
]
input_file = 'data/podcast__transcription_test.srt'

for num_sentences in num_sentences_list:
    # Both paths are derived from the input path cut at its first '.'.
    output_text_file = f"{input_file.split('.')[0]}_optext_{num_sentences}.txt"
    output_file = f"{input_file.split('.')[0]}_op_{num_sentences}.txt"

    extract_from_output_text(input_file, output_file, output_text_file, num_sentences)

Output with timestamps written to  data/podcast__transcription_test_op_15.txt
Output with timestamps written to  data/podcast__transcription_test_op_20.txt
Output with timestamps written to  data/podcast__transcription_test_op_25.txt
Output with timestamps written to  data/podcast__transcription_test_op_30.txt
Output with timestamps written to  data/podcast__transcription_test_op_35.txt
Output with timestamps written to  data/podcast__transcription_test_op_40.txt
Output with timestamps written to  data/podcast__transcription_test_op_45.txt
Output with timestamps written to  data/podcast__transcription_test_op_50.txt
Output with timestamps written to  data/podcast__transcription_test_op_60.txt
Output with timestamps written to  data/podcast__transcription_test_op_70.txt
Output with timestamps written to  data/podcast__transcription_test_op_80.txt
Output with timestamps written to  data/podcast__transcription_test_op_90.txt
Output with timestamps written to  data/podcast__transcription_t

## Extract a summary with timestamps directly from an SRT file

In [18]:
def extract_from_srt(input_file, output_file, num_sentences):
    """Summarize a subtitle file and write the result to ``output_file``.

    Parses ``input_file`` with ``subtitle_to_textblob``, joins its text lines
    with single spaces and summarizes them with ``extractive_summarization``
    (debug printing enabled). The summary text is written to ``output_file``
    and then, together with the timing lines, passed to
    ``extracted_text_to_output`` with the same ``output_file`` and ``'-->'``
    as the time delimiter.

    Args:
        input_file: Path of the subtitle file.
        output_file: Path of the file to write.
        num_sentences: Summary size passed to ``extractive_summarization``.

    Returns:
        None. Errors are reported on stdout with a traceback instead of being
        raised.
    """
    try:
        time_delimiter = '-->'

        input_text_list, input_times_list = subtitle_to_textblob(input_file)
        input_text = ' '.join(input_text_list)

        output_text = extractive_summarization(input_text, num_sentences, True)

        with open(output_file, 'w') as fp:
            fp.write(output_text)

        extracted_text_to_output(input_text, output_text, output_file, input_times_list, time_delimiter)
    except Exception:
        # Best-effort reporting; KeyboardInterrupt/SystemExit still propagate.
        print('Exception in extract_from_srt()')
        traceback.print_exc()

In [None]:
# Summary sizes to generate directly from the subtitle file.
num_sentences_list = [
    10, 15, 20, 25, 30, 35, 40, 45, 50,
    60, 70, 80, 90, 100,
]

ip_file = 'data/podcast__transcription_test.srt'

for num_sentences in num_sentences_list:
    # Output path is the input path cut at its first '.', plus the size.
    op_file = f"{ip_file.split('.')[0]}_op_{num_sentences}.txt"

    extract_from_srt(ip_file, op_file, num_sentences)

### Sample to run single iteration over num_sentences

In [19]:
# Disabled example: the triple-quoted string below is a bare expression, so
# none of the statements inside it are executed.
'''
ip_file = 'data/AE_Shopify Walkthrough 1.srt'
op_file = 'data/AE_Shopify Walkthrough 1_op.txt'
num_sentences = 10

extract_from_srt(ip_file, op_file, num_sentences)
'''

"\nip_file = 'data/AE_Shopify Walkthrough 1.srt'\nop_file = 'data/AE_Shopify Walkthrough 1_op.txt'\nnum_sentences = 10\n\nextract(ip_file, op_file, num_sentences)\n"