In [1]:
import fasttext
from bs4 import BeautifulSoup
import re
import nltk
import urllib.request as urllib2
import numpy as np
from googlesearch import search
import pandas as pd
import math

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os

In [None]:
def get_urls(topic, section, max_urls = 5):
    """Search the web for "<topic> <section>" and return reachable result URLs.

    Results whose URL contains ".org", ".edu" or "wikipedia" are discarded.
    Each remaining candidate is downloaded and parsed once; a candidate is
    kept only if that succeeds.

    Parameters
    ----------
    topic, section : str
        Joined with a space to form the search query.
    max_urls : int, optional
        Maximum number of URLs to return (default 5).

    Returns
    -------
    list of str
        Up to `max_urls` URLs, in search-result order. Fewer are returned
        when the search does not yield enough reachable pages, and an empty
        list is returned when the search itself fails with an HTTP error.
    """
    # Imported locally so the handler below can resolve the name; the
    # notebook's import cell only brings in `urllib.request`.
    from urllib.error import HTTPError

    query = f"{topic} {section}"

    try:
        candidates = [url for url in search(query, num_results = 15, lang = "en")]
    except HTTPError as my_exception:
        print(my_exception.headers)
        return []

    candidates = [url for url in candidates if ".org" not in url and ".edu" not in url and "wikipedia" not in url]

    hdr = {'User-Agent': 'Mozilla/5.0'}
    ret = []

    # Walk the candidates exactly once, so the loop always terminates even
    # when fewer than `max_urls` pages can be fetched.
    for candidate in candidates:
        if len(ret) >= max_urls:
            break

        try:
            req = urllib2.Request(candidate, headers = hdr)
            # The context manager closes the HTTP response after parsing.
            with urllib2.urlopen(req, timeout = 10) as page:
                BeautifulSoup(page, "html.parser")
        except Exception as fetch_error:
            # Best effort: report the unusable page and move on.
            print(f"Skipping {candidate}: {fetch_error}")
            continue

        ret.append(candidate)

    return ret
    

In [3]:
# Query that drives the rest of the notebook.
topic = "machine learning"
section = "history"

# `urls` is reused later for the multi-document selection; `url` (the first
# hit) feeds the single-page fetch in the next cell.
urls = get_urls(topic, section)
url = urls[0]

In [4]:
# Download the first result page and keep the text of every <p> element.
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(url, headers = hdr)

# The context manager closes the HTTP response as soon as it has been parsed.
# The 10 s timeout matches the other page fetches in this notebook.
with urllib2.urlopen(req, timeout = 10) as page:
    soup = BeautifulSoup(page, "html.parser")

paragraphs_web = [p.text for p in soup.findAll("p")]

## Our Method

In [17]:
def clean_text(text):
    """Return `text` lower-cased with every character other than ASCII
    letters, digits and spaces removed."""
    return re.sub(r'[^A-Za-z0-9 ]+', '', text).lower()

def get_results_inst(paragraphs, M1, score):
    """Extract runs of consecutive "instance" sentences from each paragraph.

    Every sentence is cleaned with `clean_text` and classified by `M1`. A
    sentence counts as an instance when the first character of its top label
    (after the "__label__" prefix) is "I" and the label's probability is
    strictly greater than `score`. Adjacent instance sentences are merged
    into one chunk.

    Parameters
    ----------
    paragraphs : iterable of str
    M1 : object
        Must expose `predict(text)` returning `(labels, probabilities)`.
    score : float
        Probability threshold.

    Returns
    -------
    list of (str, list of str, list of bool)
        One tuple per paragraph: the paragraph itself, the text of each
        chunk (sentences joined by a space), and a per-sentence flag telling
        whether that sentence belongs to a chunk.
    """
    prefix_len = len("__label__")
    results = []

    for p in paragraphs:
        sentences = list(nltk.sent_tokenize(p))
        selected = [False] * len(sentences)

        flags = []
        for sentence in sentences:
            # A single predict call yields both the label and its probability.
            prediction = M1.predict(clean_text(sentence))
            is_instance = prediction[0][0][prefix_len:][0] == "I" and prediction[1][0] > score
            flags.append("I" if is_instance else "N")

        chunks = []
        # "I+" only matches non-empty runs, so no empty spans need filtering.
        for match in re.finditer("I+", "".join(flags)):
            start, end = match.span()
            selected[start:end] = [True] * (end - start)
            chunks.append(sentences[start:end])

        chunk_text = [" ".join(c) for c in chunks]
        results.append((p, chunk_text, selected))

    return results

def get_summary_inst(paragraphs_web, model, score):
    """Return one string per paragraph that produced at least one chunk.

    Runs `get_results_inst` and joins each paragraph's chunks with a single
    space; paragraphs whose joined text is empty are left out.
    """
    summary = []
    for result in get_results_inst(paragraphs_web, model, score):
        joined = " ".join(result[1])
        if joined:
            summary.append(joined)
    return summary

def get_results_begend(paragraphs, M2_beg, M2_end, score):
    """Extract chunks using a "beginning" and an "ending" sentence classifier.

    Each cleaned sentence is labelled:

    * "N" when neither model's top probability reaches `score`, or when
      neither model predicts its positive label;
    * "B" when only `M2_beg` predicts "Beginning_Instance";
    * "E" when only `M2_end` predicts "Ending_Instance";
    * when both fire, "B" if the beginning probability is at least the
      ending probability, otherwise "E".

    Every "B" sentence starts a chunk, which also takes the next sentence
    when that one is labelled "E". An "E" that does not directly follow a
    "B" is ignored.

    Parameters
    ----------
    paragraphs : iterable of str
    M2_beg, M2_end : object
        Must expose `predict(text)` returning `(labels, probabilities)`.
    score : float
        Minimum probability required from at least one of the two models.

    Returns
    -------
    list of (str, list of str, list of bool)
        One tuple per paragraph: the paragraph, the chunk texts, and a
        per-sentence flag marking the sentences that ended up in a chunk.
    """
    prefix_len = len("__label__")
    results = []

    for p in paragraphs:
        sentences = list(nltk.sent_tokenize(p))
        selected = [False] * len(sentences)

        labels = []
        for sentence in sentences:
            cleaned = clean_text(sentence)
            # One predict call per model gives both label and probability.
            beg_out = M2_beg.predict(cleaned)
            end_out = M2_end.predict(cleaned)

            beg_lab = beg_out[0][0][prefix_len:] == 'Beginning_Instance'
            end_lab = end_out[0][0][prefix_len:] == 'Ending_Instance'
            beg_score = beg_out[1][0]
            end_score = end_out[1][0]

            if max(beg_score, end_score) < score:
                labels.append("N")
            elif beg_lab and end_lab:
                labels.append("B" if beg_score >= end_score else "E")
            elif beg_lab:
                labels.append("B")
            elif end_lab:
                labels.append("E")
            else:
                labels.append("N")

        chunks = []
        for i, label in enumerate(labels):
            if label != "B":
                continue
            # The slice comparison is safe on the last sentence too.
            stop = i + 2 if labels[i + 1:i + 2] == ["E"] else i + 1
            # Record the chosen sentences, as the other get_results_* do.
            selected[i:stop] = [True] * (stop - i)
            chunks.append(sentences[i:stop])

        chunk_text = [" ".join(c) for c in chunks]
        results.append((p, chunk_text, selected))

    return results

def get_summary_begend(paragraphs_web, model1, model2, score):
    """Return one string per paragraph that produced at least one chunk.

    Runs `get_results_begend` with `model1` as the beginning model and
    `model2` as the ending model, joins each paragraph's chunks with a
    single space, and leaves out paragraphs whose joined text is empty.
    """
    summary = []
    for result in get_results_begend(paragraphs_web, model1, model2, score):
        joined = " ".join(result[1])
        if joined:
            summary.append(joined)
    return summary

def get_results_multi(paragraphs, M3):
    """Extract chunks with a single multi-class sentence classifier.

    Each cleaned sentence is reduced to the first character of its top label
    (after the "__label__" prefix). The characters of one paragraph are
    concatenated and every non-empty match of the pattern "B*M*E*" becomes a
    chunk.

    Returns a list with one `(paragraph, chunk_texts, selected)` tuple per
    paragraph, where `selected` flags the sentences that belong to a chunk.
    """
    prefix_len = len("__label__")
    results = []

    for p in paragraphs:
        sentences = list(nltk.sent_tokenize(p))
        selected = [False] * len(sentences)

        tags = "".join(
            M3.predict(cleaned)[0][0][prefix_len:][0]
            for cleaned in [clean_text(s) for s in sentences]
        )

        chunk_text = []
        for match in re.finditer("B*M*E*", tags):
            start, end = match.span()
            # The pattern also matches the empty string; only real runs count.
            if end > start:
                selected[start:end] = [True] * (end - start)
                chunk_text.append(" ".join(sentences[start:end]))

        results.append((p, chunk_text, selected))

    return results

def get_summary_multi(paragraphs_web, model):
    """Return one string per paragraph that produced at least one chunk.

    Runs `get_results_multi`, joins each paragraph's chunks with a single
    space, and leaves out paragraphs whose joined text is empty.
    """
    summary = []
    for result in get_results_multi(paragraphs_web, model):
        joined = " ".join(result[1])
        if joined:
            summary.append(joined)
    return summary

def get_urls(topic, section):
    """Return at most five search hits for "<topic> <section>".

    Hits whose URL contains ".org", ".edu" or "wikipedia" are dropped. No
    page is downloaded here.

    NOTE: running this cell rebinds `get_urls`, replacing the definition of
    the same name given earlier in the notebook.
    """
    hits = list(search(f"{topic} {section}", num_results = 15, lang = "en"))
    excluded = (".org", ".edu", "wikipedia")
    kept = [hit for hit in hits if not any(marker in hit for marker in excluded)]
    return kept[:5]

def create_summary_data(topic, section, paragraphs_web, score):
    """Load the four fastText models for `topic`/`section` and summarise.

    Models are read from
    "../Instance Classification/models/<topic>/<section>/".

    Returns
    -------
    tuple
        `(s1, s2, s3)`: the begin/end summary, the instance summary and the
        multi-class summary of `paragraphs_web`. `score` is forwarded to the
        first two; the multi-class summary takes no threshold.
    """
    path = f"../Instance Classification/models/{topic}/{section}"

    beg_model = fasttext.load_model(f"{path}/beg_model.bin")
    end_model = fasttext.load_model(f"{path}/end_model.bin")
    inst_model = fasttext.load_model(f"{path}/inst_model.bin")
    multi_model = fasttext.load_model(f"{path}/multi_model.bin")

    return (
        get_summary_begend(paragraphs_web, beg_model, end_model, score),
        get_summary_inst(paragraphs_web, inst_model, score),
        get_summary_multi(paragraphs_web, multi_model),
    )

In [35]:
# Build all three fastText summaries with a 0.97 probability threshold.
# s1 = begin/end models, s2 = instance model, s3 = multi-class model.
s1, s2, s3 = create_summary_data(topic, section, paragraphs_web, 0.97)
# Only the instance-model summary (s2) is printed here, one entry per
# paragraph separated by blank lines.
fasttext_summary = "\n\n".join(s2)
print(fasttext_summary)

Until the late 1970s, it was a part of AI’s evolution.

Machine learning is a necessary aspect of modern business and research for many organizations today.

The model was created in 1949 by Donald Hebb in a book titled The Organization of Behavior (PDF).

Arthur Samuel of IBM developed a computer program for playing checkers in the 1950s. His design included a scoring function using the positions of the pieces on the board. The program chooses its next move using a minimax strategy, which eventually evolved into the minimax algorithm.

In what Samuel called rote learning, his program recorded/remembered all positions it had already seen and combined this with the values of the reward function. Arthur Samuel first came up with the phrase “machine learning” in 1952.

In 1957, Frank Rosenblatt – at the Cornell Aeronautical Laboratory – combined Donald Hebb’s model of brain cell interaction with Arthur Samuel’s machine learning efforts and created the perceptron. The perceptron was initia







In [7]:
# Size of the begin/end summary (s1): total number of sentences and total
# number of characters over all of its entries.
num_sentences = sum(len(nltk.sent_tokenize(inst)) for inst in s1)
length = sum(len(inst) for inst in s1)

print(num_sentences, length)

16 1408


### BERT Extractive Summarizer

In [8]:
from summarizer import Summarizer
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Instantiate the extractive summarizer with its default settings.
model_bert = Summarizer()

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Summarise the page paragraphs (joined by newlines) down to 14 sentences.
# NOTE(review): 14 is hard-coded although `num_sentences` is computed from s1
# earlier in the notebook; confirm whether the two are meant to be linked.
summary = model_bert("\n".join(paragraphs_web), num_sentences = 14)

In [11]:
# Re-split the BERT summary into sentences and print one per block,
# separated by blank lines.
sent = nltk.sent_tokenize(summary)
bert_summary = "\n\n".join(sent)
print(bert_summary)

We suggest you try the following to help find what you're looking for:
Build, test, and deploy applications by applying natural language processing—for free.

These data sets are so voluminous that traditional data processing software just can’t manage them.

A large part of the value they offer comes from their data, which they’re constantly analyzing to produce more efficiency and develop new products.

The development of open-source frameworks, such as Hadoop (and more recently, Spark) was essential for the growth of big data because they make big data easier to work with and cheaper to store.

The emergence of machine learning has produced still more data.

Finally, big data technology is changing at a rapid pace.

Getting started involves three key actions:
1.

Many people choose their storage solution according to where their data is currently residing.

Analyze
                        Your investment in big data pays off when you analyze and act on your data.

Build data models

## Multi-Document Paragraph Selection

In [12]:
# English stop words, held in a set because `clean_lemmatize` tests
# membership once per token; set lookup avoids scanning a list every time.
stopwords = set(nltk.corpus.stopwords.words('english'))
# WordNet-based lemmatizer used by `clean_lemmatize`.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [13]:
def clean_lemmatize(text_):
    """Clean `text_`, drop stop words and lemmatize the remaining tokens.

    The text is first passed through `clean_text`, then tokenised with
    `nltk.word_tokenize`. Tokens found in `stopwords` are skipped; the rest
    are lemmatized and joined with single spaces. Returns "" when no token
    survives.
    """
    tokens = nltk.word_tokenize(clean_text(text_))
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords]
    return " ".join(lemmas)

# Gather the <p> text of every result page into one flat list of paragraphs.
all_text = []

for u in urls:
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(u, headers = hdr)

    try:
        # The context manager closes the HTTP response after parsing.
        with urllib2.urlopen(req, timeout = 10) as page:
            soup = BeautifulSoup(page, "html.parser")
    except OSError as fetch_error:
        # urllib's URLError/HTTPError and socket timeouts are OSError
        # subclasses: skip the unreachable page instead of aborting the cell.
        print(f"Skipping {u}: {fetch_error}")
        continue

    # A loop-local name keeps the notebook-level `paragraphs_web` (the
    # single page summarised in the cells above) intact when cells are re-run.
    page_paragraphs = [p.text for p in soup.findAll("p")]
    all_text.extend(page_paragraphs)

# Normalised version of each paragraph, used as TF-IDF input.
all_text_clean = [clean_lemmatize(t) for t in all_text]

In [14]:
# TF-IDF vectorizer with default settings; fitted on the cleaned paragraphs
# in the next cell.
tf_idf = TfidfVectorizer()

In [15]:
# fit_transform learns the vocabulary and IDF weights and returns the
# document-term matrix in one pass, so no separate transform call is needed.
all_text_tfidf = tf_idf.fit_transform(all_text_clean)

In [16]:
# Project the cleaned "<topic> <section>" query into the same TF-IDF space
# as the paragraphs (a one-row matrix).
topic_tfidf = tf_idf.transform([clean_lemmatize(f"{topic} {section}")])

In [17]:
# One record per paragraph: (raw text, its TF-IDF row, its sentence count),
# ordered from most to least similar to the topic query.
data = sorted(
    (
        (text, all_text_tfidf[idx], len(nltk.sent_tokenize(text)))
        for idx, text in enumerate(all_text)
    ),
    key = lambda item: cosine_similarity(item[1], topic_tfidf)[0][0],
    reverse = True,
)

In [18]:
# Greedy selection: walk the paragraphs in relevance order and keep those
# that are not near-duplicates of an already kept paragraph, until the
# sentence budget (`num_sentences - 6`) is reached.
total_len = 0
added = []
itr = 0

# The second condition stops the loop cleanly when the candidates run out
# before the budget is met (indexing past the end would raise IndexError).
while total_len < num_sentences - 6 and itr < len(data):
    curr_p, curr_vec, curr_num_sent = data[itr]

    # Reject a paragraph whose cosine similarity to any kept one exceeds 0.5.
    can_be_added = not any(
        cosine_similarity(vec, curr_vec)[0][0] > 0.5 for _, vec in added
    )

    if can_be_added:
        added.append((curr_p, curr_vec))
        # The sentence count was already computed when `data` was built.
        total_len += curr_num_sent

    itr += 1

p_out = [a[0] for a in added]

In [19]:
# Join the selected paragraphs with blank lines and show the result.
paragraph_summary = "\n\n".join(p_out)
print(paragraph_summary)

Now, let’s learn Big Data definition

What is Big Data?

The definition of big data is data that contains greater variety, arriving in increasing volumes and with more velocity. This is also known as the three Vs.

Big data refers to data that is so large, fast or complex that it’s difficult or impossible to process using traditional methods. The act of accessing and storing large amounts of information for analytics has been around for a long time. But the concept of big data gained momentum in the early 2000s when industry analyst Doug Laney articulated the now-mainstream definition of big data as the three V’s:

Semi-structured data can contain both the forms of data. We can see semi-structured data as a structured in form but it is actually not defined with e.g. a table definition in relational DBMS. Example of semi-structured data is a data represented in an XML file.


In [20]:
# Persist the three summaries under summaries/<topic>/<section>/.
path1 = f"summaries/{topic}"
path2 = f"summaries/{topic}/{section}"

# makedirs creates the intermediate `path1` directory as well; exist_ok makes
# the cell safe to re-run.
os.makedirs(path2, exist_ok = True)

summaries_by_file = {
    "fasttext.txt": fasttext_summary,
    "bert.txt": bert_summary,
    "paragraph.txt": paragraph_summary,
}

for file_name, summary_text in summaries_by_file.items():
    # `with` guarantees the file is closed even if the write fails.
    with open(f"{path2}/{file_name}", "w", encoding = "utf-8") as out_file:
        out_file.write(summary_text)