In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
# Sample input for the walkthrough.
text='Now, assume we have 10 million documents and the word apple appears in one thousand of these.'
# Split the text into sentences; the cells below build one table per sentence.
sentences = sent_tokenize(text) # NLTK function
# Number of sentences; passed later as the document count to the IDF step.
total_documents = len(sentences)
total_documents

In [None]:
def _create_frequency_matrix(sentences):
    """Count stemmed, non-stopword tokens for every sentence.

    :param sentences: iterable of sentence strings.
    :return: dict mapping the first 15 characters of each sentence to a
        ``{stem: count}`` table. Sentences that share their first 15
        characters map to the same key, so the later one overwrites the
        earlier one.
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    frequency_matrix = {}
    for sent in sentences:
        freq_table = {}
        for token in nltk.word_tokenize(sent):
            token = token.lower()
            # Test the unstemmed token: the stopword list holds surface
            # forms, and stemming first lets altered stopwords
            # (e.g. "was" -> "wa", "this" -> "thi") slip past the filter.
            if token in stop_words:
                continue

            stem = stemmer.stem(token)
            freq_table[stem] = freq_table.get(stem, 0) + 1

        # Key is the 15-character sentence prefix used throughout the notebook.
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix
 
# Build the per-sentence frequency tables and display them.
fm=_create_frequency_matrix(sentences)
fm

{'Now, assume we ': {',': 1,
  '.': 1,
  '10': 1,
  'appear': 1,
  'appl': 1,
  'assum': 1,
  'document': 1,
  'million': 1,
  'one': 1,
  'thousand': 1,
  'word': 1}}

In [None]:
def _create_tf_matrix(fm):
    tf_matrix = {}
    
    for sent, f_table in fm.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix
# Compute the term-frequency tables from the frequency matrix and display them.
t=_create_tf_matrix(fm)
t

{'Now, assume we ': {',': 0.09090909090909091,
  '.': 0.09090909090909091,
  '10': 0.09090909090909091,
  'appear': 0.09090909090909091,
  'appl': 0.09090909090909091,
  'assum': 0.09090909090909091,
  'document': 0.09090909090909091,
  'million': 0.09090909090909091,
  'one': 0.09090909090909091,
  'thousand': 0.09090909090909091,
  'word': 0.09090909090909091}}

In [None]:
def _create_documents_per_words(fm):
    word_per_doc_table = {}

    for sent, f_table in fm.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table
# Per-term count of sentences containing the term; used by the IDF step.
cd=_create_documents_per_words(fm)

In [None]:
import math
def _create_idf_matrix(fm,cd, total_documents):
    idf_matrix = {}

    for sent, f_table in fm.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(cd[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix
# Keep the result: the TF-IDF cell below reads `idf_matrix` by name.
idf_matrix = _create_idf_matrix(fm, cd, total_documents)
idf_matrix

{'Now, assume we ': {',': 0.0,
  '.': 0.0,
  '10': 0.0,
  'appear': 0.0,
  'appl': 0.0,
  'assum': 0.0,
  'document': 0.0,
  'million': 0.0,
  'one': 0.0,
  'thousand': 0.0,
  'word': 0.0}}

In [None]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix
# Compute the IDF tables here so this cell only relies on `fm`, `cd`,
# `total_documents` and `t`, then keep the TF-IDF result: the scoring cell
# below reads `tf_idf_matrix` by name.
idf_matrix = _create_idf_matrix(fm, cd, total_documents)
tf_idf_matrix = _create_tf_idf_matrix(t, idf_matrix)
tf_idf_matrix

{'Now, assume we ': {',': 0.0,
  '.': 0.0,
  '10': 0.0,
  'appear': 0.0,
  'appl': 0.0,
  'assum': 0.0,
  'document': 0.0,
  'million': 0.0,
  'one': 0.0,
  'thousand': 0.0,
  'word': 0.0}}

In [None]:
def _score_sentences(tf_idf_matrix) -> dict:
    
 
    sentenceValue = {}
 
    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0
 
        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score
 
        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence
 
    return sentenceValue
# Score every sentence from its TF-IDF table and display the scores.
# Requires `tf_idf_matrix` to be bound at module level before this cell runs.
s=_score_sentences(tf_idf_matrix)
s

{'Now, assume we ': 0.0}

In [None]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]
 
    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))
 
    return average
# Average sentence score; passed as the threshold to the summary step.
g=_find_average_score(s)

In [None]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''
 
    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1
 
    return summary
# Build the summary, keeping sentences whose score is at least the average score.
_generate_summary(sentences, s, g)

' Now, assume we have 10 million documents and the word apple appears in one thousand of these.'