In [95]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# English stop-word list from the NLTK corpus; tokens whose lower-cased form
# appears here are excluded when word frequencies are counted.
stop_words = stopwords.words('english')
from string import punctuation
# Extend the ASCII punctuation set with the typographic apostrophe (U+2019),
# which string.punctuation does not contain.
punctuation = punctuation + '’'

In [96]:
# Sample passage (a short history of leadership) used as demo input for the
# Summarizer class.
sample_text = "The concept of leadership can be traced back to the very beginning of time in the garden of Eden where the first man, Adam had dominion over every living thing in the garden. Since then, kings have begun to appear. In every gathering, a king was appointed at every rally. Leaders were usually appointed from birth or from God’s instruction. In the Middle Ages, leaders were authoritarian, intimidating, fearful, and rare. As a result of the Renaissance, leadership has moved from the kings alone to parliament which consisted of simple men. This left a question, where did the authority come from? Isn't it from God? Jean-Jacques in 1762 made us know Authority comes from people who give up their personal freedom for their safety and security. Anxiety, robbery, murder was the order of the day for leadership in the Middle Ages. Fear and obedience were planted in the hearts of people between 500 AD and 1300 AD.19th and 20th Century Leadership: The Beginning of the Testimony of the Great Leaders That Emerged Leaders such as Abraham Lincoln of the United States, Giuseppe Mazzini of Italy, Kaiser Wilhelm II of Germany. Scholars have begun to pay more attention to leadership as theories were forming. However, in the 21st century, the emergence of information and communication Technology further evolved the concept of leadership as it provided a platform for greater inclusion in leadership, decision making and planning through globalization."

In [97]:
class Summarizer():
    '''
    Frequency-based extractive summarizer: cleans, scores and selects sentences.

    Intended pipeline (each step feeds the next):
    word_sentence_tokenizer -> word_count_vec -> sentence_scoring -> summary_sorting

    > self:text, type - string to summarize
    > self:churn_level, type - float; fraction of the scored sentences to keep
      (e.g. 0.4 keeps 40%). It is multiplied by the sentence count and truncated
      to an integer, so small values can select zero sentences.
    '''

    def __init__ (self, text, churn_level):
        self.text = text
        # Accept ints / numeric strings too; raises ValueError if not numeric.
        self.churn_level = float(churn_level)

    def word_sentence_tokenizer(self):
        """
        Break self.text into sentence tokens and word tokens.

        return:
        (sent_tokens, word_tokens) : Tuple of two lists - the sentences and the
        word tokens found in self.text (both produced by the NLTK tokenizers).
        """
        sent_tokens = sent_tokenize(self.text, 'english')
        word_tokens = word_tokenize(self.text, 'english')
        return (sent_tokens, word_tokens)

    def word_count_vec(self, word_tokens):
        '''
        Count the content words in word_tokens and min-max normalize the counts.

        Stop words and tokens made up only of punctuation characters are skipped.
        Keys are lower-cased so that they match the lower-cased lookups performed
        by sentence_scoring and so that "Leaders" / "leaders" share one count.

        Parameters:
        > word_tokens = [] # List of words

        return:
        word_frequency_scores : Dictionary mapping each lower-cased word to a
        score in [0, 1]. Empty input gives {}; if every word has the same
        frequency (min == max) each word scores 1.0 instead of dividing by zero.
        '''
        # Build the set once: membership tests on the stop-word list are O(n).
        stop_set = set(stop_words)
        counts = {}

        for word in word_tokens:
            token = word.lower()
            if token in stop_set:
                continue
            # Per-character test: a plain `word in punctuation` is a substring
            # check and lets multi-character tokens such as "''" through.
            if all(char in punctuation for char in word):
                continue
            counts[token] = counts.get(token, 0) + 1

        if not counts:
            return counts

        # Linear / min-max scaling of the raw counts.
        max_frequency = max(counts.values())
        min_frequency = min(counts.values())
        spread = max_frequency - min_frequency
        if spread == 0:
            # All words equally frequent: treat them as equally important.
            return {token: 1.0 for token in counts}
        return {
            token: (count - min_frequency) / spread
            for token, count in counts.items()
        }

    def sentence_scoring(self, sentence_tokens, word_frequency_scores):
        '''
        Score each sentence as the sum of the scores of the words it contains.

        Parameters:
        > sentence_tokens: List containing sentence tokens
        > word_frequency_scores: Dictionary containing word tokens and their (normalized) scores

        return:
        sentence_scores : Dictionary of sentences and their scores, in the order
        the sentences were first seen. Sentences containing no scored word are
        omitted; a sentence that occurs more than once accumulates into one entry.
        '''
        sentence_scores = {}
        for sentence in sentence_tokens:
            for word in word_tokenize(sentence, 'english'):
                token = word.lower()
                if token in word_frequency_scores:
                    sentence_scores[sentence] = (
                        sentence_scores.get(sentence, 0) + word_frequency_scores[token]
                    )
        return sentence_scores

    def summary_sorting(self, sentence_scores):
        '''
        Select the top-scoring sentences and join them in their original order.

        Parameters:
        self:churn_level - fraction of the scored sentences to keep
        sentence_scores - Dictionary containing sentences and their scores; its
        insertion order is taken as the order of appearance in the text.

        return:
        final_summary : String of the selected sentences separated by single
        spaces ('' when nothing is selected).
        '''
        sentence_count = len(sentence_scores)
        top_n_sentences = int(self.churn_level * sentence_count)
        # Clamp: a negative count would otherwise turn the slice below into
        # "everything except the last n" and return an unintended summary.
        top_n_sentences = max(0, min(top_n_sentences, sentence_count))

        # Pair every (sentence, score) with its position, rank by score
        # (stable sort, so ties keep text order), then restore text order.
        ranked = sorted(
            enumerate(sentence_scores.items()),
            key=lambda entry: entry[1][1],
            reverse=True,
        )
        selected = sorted(ranked[:top_n_sentences], key=lambda entry: entry[0])
        return ' '.join(sentence for _, (sentence, _score) in selected)

In [98]:
# a = Summarizer(sample_text, 0.4)
# words = a.word_sentence_tokenizer()
# words_ = words[1]
# sentence = words[0]
# token_scores = a.word_count_vec(words_)
# test = a.sentence_scoring(sentence, token_scores)

# fine = a.summary_sorting(test)
# print(fine)