In [68]:
# followed this tutorial:
# https://blog.floydhub.com/gentle-introduction-to-text-summarization-in-machine-learning/  

import nltk

In [97]:
# Step 1: Preparing the data

import bs4 as BeautifulSoup
import urllib.request  

# Fetching the content from the URL
fetched_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/12_Angry_Men_(1957_film)')


article_read = fetched_data.read()

# Parsing the URL content and storing in a variable
article_parsed = BeautifulSoup.BeautifulSoup(article_read,'html.parser')

# Returning <p> tags
paragraphs = article_parsed.find_all('p')

article_content = ''

# Looping through the paragraphs and adding them to the variable
for p in paragraphs:  
    article_content += p.text


In [87]:

test_html_split = "https://en.wikipedia.org/wiki/12_Angry_Men_(1957_film)".rsplit('/', 1)[-1]  



In [70]:
# we've scraped article content as seen below:
print(type(article_content))
print(article_content)

<class 'str'>

12 Angry Men is a 1957 American courtroom drama film directed by Sidney Lumet, adapted from a teleplay of the same name by Reginald Rose.[6][7] This courtroom drama tells the story of a jury of 12 men as they deliberate the conviction or acquittal of a defendant on the basis of reasonable doubt, forcing the jurors to question their morals and values. In the United States, a verdict in most criminal trials by jury must be unanimous. The defendant is a 19-year-old male and the witnesses are the lady across the street and the old man.
12 Angry Men explores many techniques of consensus-building and the difficulties encountered in the process among this group of men whose range of personalities adds to the intensity and conflict. It also explores the power one person has to elicit change. No names are used in the film; the jury members are identified by number until two members exchange names at the end. The film forces the characters and audience to evaluate their own self-i

In [71]:
# Step 2: Processing the data
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
def _create_dictionary_table(text_string) -> dict:
   
    # Removing stop words
    stop_words = set(stopwords.words("english"))
    
    words = word_tokenize(text_string)
    
    # Reducing words to their root form
    stem = PorterStemmer()
    
    # Creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1

    return frequency_table

In [72]:
# the output here will be a dictionary of words with their corresponding frequencies
_create_dictionary_table(article_content)


{'12': 17,
 'angri': 12,
 'men': 14,
 '1957': 2,
 'american': 7,
 'courtroom': 4,
 'drama': 10,
 'film': 45,
 'direct': 4,
 'sidney': 2,
 'lumet': 4,
 ',': 142,
 'adapt': 6,
 'teleplay': 1,
 'name': 6,
 'reginald': 2,
 'rose': 4,
 '.': 94,
 '[': 30,
 '6': 3,
 ']': 30,
 '7': 2,
 'thi': 2,
 'tell': 3,
 'stori': 5,
 'juri': 20,
 'deliber': 3,
 'convict': 2,
 'acquitt': 1,
 'defend': 9,
 'basi': 1,
 'reason': 5,
 'doubt': 5,
 'forc': 3,
 'juror': 55,
 'question': 4,
 'moral': 1,
 'valu': 1,
 'In': 8,
 'unit': 3,
 'state': 5,
 'verdict': 4,
 'crimin': 1,
 'trial': 2,
 'must': 1,
 'unanim': 2,
 '19-year-old': 1,
 'male': 1,
 'wit': 5,
 'ladi': 1,
 'across': 2,
 'street': 2,
 'old': 1,
 'man': 1,
 'explor': 2,
 'mani': 1,
 'techniqu': 2,
 'consensus-build': 1,
 'difficulti': 1,
 'encount': 1,
 'process': 1,
 'among': 1,
 'group': 2,
 'whose': 2,
 'rang': 1,
 'person': 4,
 'add': 2,
 'intens': 1,
 'conflict': 1,
 'It': 5,
 'also': 9,
 'power': 2,
 'one': 9,
 'ha': 3,
 'elicit': 1,
 'chang': 9,

In [73]:
# Step 3: Tokenizing the article into sentences

# To split the article_content into a set of sentences, we’ll use the built-in method from the nltk library.

from nltk.tokenize import word_tokenize, sent_tokenize


sentences = sent_tokenize(article_content)

In [74]:
print(type(sentences))
print(sentences)

<class 'list'>
['\n12 Angry Men is a 1957 American courtroom drama film directed by Sidney Lumet, adapted from a teleplay of the same name by Reginald Rose.', '[6][7] This courtroom drama tells the story of a jury of 12 men as they deliberate the conviction or acquittal of a defendant on the basis of reasonable doubt, forcing the jurors to question their morals and values.', 'In the United States, a verdict in most criminal trials by jury must be unanimous.', 'The defendant is a 19-year-old male and the witnesses are the lady across the street and the old man.', '12 Angry Men explores many techniques of consensus-building and the difficulties encountered in the process among this group of men whose range of personalities adds to the intensity and conflict.', 'It also explores the power one person has to elicit change.', 'No names are used in the film; the jury members are identified by number until two members exchange names at the end.', 'The film forces the characters and audience to

In [75]:
# Step 4: Finding the weighted frequencies of the sentences

def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    # Algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] /        sentence_wordcount_without_stop_words
      
    return sentence_weight

In [76]:
# because _calculate_sentence_scores takes a frequency table as an input
# we need to define a frequency table here:
frequency_table_of_article_content = _create_dictionary_table(article_content)

# now let's input our frequency table along with our sentences
_calculate_sentence_scores(sentences, frequency_table_of_article_content)

# this is how we're evaluating the score for every sentence in the text.
# we've analyzed the frequency of occurence of each term. 
# In this case, we’ll be scoring each sentence by its words; that is, adding the frequency of each important word found in the sentence.



{'\n12 Ang': 13.517241379310345,
 '[6][7] ': 14.393939393939394,
 'In the ': 19.850267379679142,
 'The def': 8.533333333333333,
 '12 Angr': 21.521825396825395,
 'It also': 12.363636363636363,
 'No name': 12.0,
 'The fil': 34.00017981150794,
 'Only th': 13.75,
 'In 2007': 17.095238095238095,
 '[8] The': 10.878787878787879,
 '[10]\nIn': 13.448275862068966,
 'If ther': 31.875,
 'If foun': 30.625,
 '[11]\nIn': 39.2,
 'He ques': 14.80952380952381,
 'Juror 8': 15.195203801685283,
 'Having ': 19.764705882352942,
 'The bal': 20.5,
 'Juror 3': 14.972937091503269,
 'Juror 5': 19.142857142857146,
 'Jurors ': 30.21701887855734,
 '", and ': 23.2,
 'Juror 4': 13.16842105263158,
 'Juror 2': 16.12,
 'Impatie': 14.576923076923077,
 'Juror 1': 24.65277777777778,
 'The oth': 25.384615384615383,
 'Juror 9': 18.31578947368421,
 'Other j': 24.0,
 'He tear': 21.615384615384617,
 'As the ': 31.818181818181817,
 'Outside': 15.791666666666666,
 'Reginal': 12.783783783783784,
 'A compl': 15.55,
 'It was ': 18.57

In [77]:
# Step 5: Calculating the threshold of the sentences

def _calculate_average_score(sentence_weight) -> int:
   
    # Calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    # Getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score



In [78]:
# create a variable that is the calculated sentence weight
sentence_weight_calculated = _calculate_sentence_scores(sentences, frequency_table_of_article_content)  


# now perform this function on that in order to calculate the average score

threshold_of_sentences = _calculate_average_score(sentence_weight_calculated)



In [79]:
# Step 6: Getting the Summary

def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary


In [80]:
# This output is now a summary of a wikipedia page!

article_summary = _get_article_summary(sentences, sentence_weight_calculated, threshold_of_sentences)


In [81]:
print(type(article_summary))
print(article_summary)

<class 'str'>
 In the United States, a verdict in most criminal trials by jury must be unanimous. 12 Angry Men explores many techniques of consensus-building and the difficulties encountered in the process among this group of men whose range of personalities adds to the intensity and conflict. The film forces the characters and audience to evaluate their own self-image through observing the personality, experiences, and actions of the jurors. The film is notable for its almost exclusive use of one set. In 2007, the film was selected for preservation in the United States National Film Registry by the Library of Congress as being "culturally, historically, or aesthetically significant". If there is any reasonable doubt, they are to return a verdict of not guilty. If found guilty, he will receive a death sentence. [11]
In a preliminary vote, all jurors vote "guilty" except Juror 8. Having hung the jury, Juror 8 suggests a secret ballot – if everyone is still agreed, he will acquiesce. The

In [98]:
with open("_article.txt", "w") as text_file:
    text_file.write(article_summary)

NameError: name 'sys' is not defined