In [3]:
# NLTK supplies the English stop-word list and the sentence / word tokenizers.
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
# NOTE(review): the next import duplicates the first one; it is harmless and can be dropped.
from nltk.corpus import stopwords
# English stop words; used further down to skip common function words.
stop_words = stopwords.words('english')
from string import punctuation
# Rebinds the imported name: string.punctuation plus the typographic
# apostrophe (U+2019), which occurs in the sample texts.
punctuation = punctuation + '’'
# Project-local lookup table, indexed by word in string_synonym_swap.
# Presumably maps word -> synonym; confirm against words_synonyms.py.
from words_synonyms import words_synonyms

In [4]:
# Sample passage 1: a single-paragraph text on the history of leadership.
sample_text = "The concept of leadership can be traced back to the very beginning of time in the garden of Eden where the first man, Adam had dominion over every living thing in the garden. Since then, kings have begun to appear. In every gathering, a king was appointed at every rally. Leaders were usually appointed from birth or from God’s instruction. In the Middle Ages, leaders were authoritarian, intimidating, fearful, and rare. As a result of the Renaissance, leadership has moved from the kings alone to parliament which consisted of simple men. This left a question, where did the authority come from? Isn't it from God? Jean-Jacques in 1762 made us know Authority comes from people who give up their personal freedom for their safety and security. Anxiety, robbery, murder was the order of the day for leadership in the Middle Ages. Fear and obedience were planted in the hearts of people between 500 AD and 1300 AD.19th and 20th Century Leadership: The Beginning of the Testimony of the Great Leaders That Emerged Leaders such as Abraham Lincoln of the United States, Giuseppe Mazzini of Italy, Kaiser Wilhelm II of Germany. Scholars have begun to pay more attention to leadership as theories were forming. However, in the 21st century, the emergence of information and communication Technology further evolved the concept of leadership as it provided a platform for greater inclusion in leadership, decision making and planning through globalization."
# Sample passage 2: a multi-line (triple-quoted) text about Apple's health report;
# it contains embedded newlines and typographic quotes/apostrophes.
sample_text_2 = '''Today Apple shared a new report that offers a snapshot of the ways Apple products are empowering people to be at the centre of their health, and acting as an intelligent guardian for their health and safety. Users, developers, medical institutions, and health organisations around the world are using Apple devices, features, and APIs to break down barriers between people and their health information, all while keeping privacy in mind.
Apple’s efforts to 
advance health primarily fall into two categories, which are detailed in two corresponding sections of the report. The first section describes Apple’s focus on personal health and fitness features on Apple Watch and iPhone that offer actionable, science-based insights and help protect users’ health and safety. The second section shares Apple’s work with the medical community to support research and care. Both sections — along with an Extensions and Spotlights section at the report’s end — include a variety of examples of third-party developers, health institutions, and organisations innovating with Apple technology.
“We believe passionately that technology can play a role in improving health outcomes and encouraging people to live a healthier day, and we are excited about the many ways users are benefiting from our health and fitness features, and by the ways third-party developers, institutions, and organisations are using Apple technology to advance health and science,” said Jeff Williams, Apple’s chief operating officer. “Our vision for the future is to continue to create science-based technology that equips people with even more information and acts as an intelligent guardian for their health, so they’re no longer passengers on their own health journey. Instead, we want people to be firmly in the driver’s seat with meaningful, actionable insights.”
'''

In [5]:
class Summarizer():
    '''
    Frequency-based extractive summarizer.

    Intended call order:
    word_sentence_tokenizer -> word_count_vec -> sentence_scoring -> summary_sorting

    Parameters:
    > text : type - string, the text to summarize
    > churn_level : fraction of the scored sentences to keep in the summary
      (converted with float()), e.g. 0.6 keeps 60% of them

    Attributes:
    > text : the text passed in
    > churn_level : churn_level as a float
    > text_topic : type - string, the most frequent kept word found by the
      latest word_count_vec call ('' until that method has run)
    '''

    def __init__(self, text, churn_level):
        self.text = text
        self.churn_level = float(churn_level)
        self.text_topic = ''  # String; overwritten by every word_count_vec call.

    def word_sentence_tokenizer(self):
        """
        This function breaks self.text into sentence tokens and word tokens
        using the NLTK English tokenizers.

        Parameters:

        > none (reads self.text)

        return:

        (sent_tokens, word_tokens) : Tuple containing the sentence tokens and the
        word tokens of self.text, in that order
        """
        sent_tokens = sent_tokenize(self.text, 'english')
        word_tokens = word_tokenize(self.text, 'english')
        return (sent_tokens, word_tokens)

    def word_count_vec(self, word_tokens):
        '''
        This function produces a dictionary containing the min-max normalized
        frequency of every kept word token, and records the most frequent one
        in self.text_topic.

        A token is kept when, after strip() and lower(), it is not a stop word
        and does not consist solely of punctuation characters. Keys are stored
        lower-cased so that they match the lower-cased lookups performed by
        sentence_scoring.

        Parameters:

        > word_tokens : List of word tokens

        return:

        word_frequency_scores : Dictionary of lower-cased word tokens and their
        normalized scores in [0, 1]. Empty when no token is kept. When every kept
        word has the same frequency, each word scores 1.0.
        '''
        stop_word_set = set(stop_words)  # set membership instead of a list scan per token
        counts = {}

        for raw_token in word_tokens:
            token = raw_token.strip().lower()
            if not token or token in stop_word_set:
                continue
            # Character-wise test: a plain `token in punctuation` is a substring
            # check and lets multi-character tokens such as "``" through.
            if all(char in punctuation for char in token):
                continue
            counts[token] = counts.get(token, 0) + 1

        if not counts:
            # Nothing to score (empty input, or only stop words / punctuation).
            self.text_topic = ''
            return counts

        max_frequency = max(counts.values())
        min_frequency = min(counts.values())
        spread = max_frequency - min_frequency

        if spread:
            word_frequency_scores = {
                word: (count - min_frequency) / spread
                for word, count in counts.items()
            }
        else:
            # All kept words are equally frequent; min-max scaling would divide by zero.
            word_frequency_scores = {word: 1.0 for word in counts}

        # Assign rather than append, so repeated calls do not concatenate topics.
        self.text_topic = max(counts, key=counts.get)
        return word_frequency_scores

    def sentence_scoring(self, sentence_tokens, word_frequency_scores):
        '''
        This function calculates a score for each sentence by summing the scores
        of the words it contains.

        Parameters:

        > sentence_tokens: List containing sentence tokens
        > word_frequency_scores: Dictionary containing lower-cased word tokens and
          their (normalized) scores

        return:

        sentence_scores : Dictionary of sentences and their scores. A sentence
        appears only if at least one of its words is a key of
        word_frequency_scores; identical sentences share one entry.
        '''
        sentence_scores = {}
        for sentence in sentence_tokens:
            for word in word_tokenize(sentence, 'english'):
                key = word.lower()
                if key in word_frequency_scores:
                    sentence_scores[sentence] = (
                        sentence_scores.get(sentence, 0) + word_frequency_scores[key]
                    )
        return sentence_scores

    def summary_sorting(self, sentence_scores):
        '''
        This function selects the top n sentences based on the sentence scores
        then organizes them in ascending order of their position in
        sentence_scores (its insertion order).

        n is int(churn_level * number of scored sentences), never below zero.

        Parameters:

        > sentence_scores : Dictionary containing sentences and their scores

        return:

        final_summary : String of the selected sentences joined by single spaces
        ('' when n is 0).
        '''
        # (sentence, score, position) triples; position is the 1-based dict order.
        ranked = [
            (sentence, score, position)
            for position, (sentence, score) in enumerate(sentence_scores.items(), start=1)
        ]

        # A negative churn level would turn into a negative slice bound and keep
        # "all but the last k" sentences, so clamp at zero.
        top_n_sentences = max(0, int(self.churn_level * len(ranked)))

        # The sort is stable, so equally scored sentences keep their original order.
        best = sorted(ranked, key=lambda item: item[1], reverse=True)[:top_n_sentences]
        best.sort(key=lambda item: item[2])
        return ' '.join(item[0] for item in best)


def extract_txt(document, encoding='utf-8'):
    """
    Function to extract text from .txt file extension document

    Lines are stripped of surrounding whitespace, blank lines are dropped and
    the remaining lines are joined with a single space, so the last word of one
    line is not fused with the first word of the next.

    Parameters:

    > document : Path of the .txt document to read
    > encoding : Text encoding used to decode the file (default 'utf-8')

    return:

    full_text_string : String of text contained in the .txt document provided,
    on a single line
    """
    with open(document, encoding=encoding) as text:
        stripped_lines = (line.strip() for line in text)
        # The join consumes the generator while the file is still open.
        full_text_string = ' '.join(line for line in stripped_lines if line)
    return full_text_string
    

def string_synonym_swap(text):
    """
    This function replaces whitespace-separated tokens of text with their entry
    in words_synonyms, where one exists.

    A token is looked up when it is all lower-case and at least 3 characters
    long, or when it is a stop word, is found in the punctuation string, or is
    shorter than 3 characters. Every other token (e.g. one containing CAPITAL
    letters) is kept as it is, and so is any looked-up token that has no entry.
    Tokens are looked up verbatim, so a word with punctuation attached
    (e.g. 'health,') only changes if that exact string is in words_synonyms.

    Parameters:

    > text : String whose tokens are to be converted to synonyms

    return:

    > String of the converted tokens joined by single spaces; tokens found in
      the punctuation string are attached without a preceding space. Returns ''
      for empty or whitespace-only text.
    """
    synonyms = []  # converted tokens, in their original order

    for token in text.split():
        wants_lookup = (
            (token.islower() and len(token) >= 3)
            or token in stop_words
            or token in punctuation
            or len(token) < 3
        )
        if not wants_lookup:
            synonyms.append(token)
            continue
        try:
            synonyms.append(words_synonyms[token])
        except (LookupError, TypeError):
            # No usable entry for this token: keep it unchanged.
            synonyms.append(token)

    # Build the output once, after all tokens are converted. A punctuation token
    # is glued to the preceding text; without this, 'what is that ?' would keep
    # the space before '?'.
    pieces = [
        token if token in punctuation else ' ' + token
        for token in synonyms
    ]
    return ''.join(pieces).strip()

In [6]:
# Passage to summarise.
sum_text = sample_text_2

# Summarizer that keeps 60% of the scored sentences.
a = Summarizer(sum_text, 0.6)

# Tokenise once; the result is a (sentence tokens, word tokens) pair.
words = a.word_sentence_tokenizer()
sentence, words_ = words

# Word scores -> sentence scores -> ordered extractive summary.
token_scores = a.word_count_vec(words_)
test = a.sentence_scoring(sentence, token_scores)
fine = a.summary_sorting(test)

# Pass the extracted summary through the synonym table.
final = string_synonym_swap(fine)

# Report: detected topic, the original text and its length, then the summary and its length.
print(f'The topic of this body of text is "{a.text_topic}".', '\n')
print('Original text')
print(sum_text)
print('\n Length of original text: ', len(sum_text))
print('\n Proper summay \n')
print(final)
print('length of summary: ', len(final))

The topic of this body of text is "health". 

Original text
Today Apple shared a new report that offers a snapshot of the ways Apple products are empowering people to be at the centre of their health, and acting as an intelligent guardian for their health and safety. Users, developers, medical institutions, and health organisations around the world are using Apple devices, features, and APIs to break down barriers between people and their health information, all while keeping privacy in mind.
Apple’s efforts to 
advance health primarily fall into two categories, which are detailed in two corresponding sections of the report. The first section describes Apple’s focus on personal health and fitness features on Apple Watch and iPhone that offer actionable, science-based insights and help protect users’ health and safety. The second section shares Apple’s work with the medical community to support research and care. Both sections — along with an Extensions and Spotlights section at the rep