In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
#Because the CSV file is not rectangular, Pandas won't load the file without specifiying column names.
#I will name the columns 'c0','c1','c2'...,'c3391' (the longest row in the CSV file has 3392 columns).
df = pd.read_csv('wikipedia_machine_learning.csv', names=['c' + str(i) for i in range(3392)], dtype='str', encoding='utf-8')\
       .fillna('')
    
#All of the columns for a given row will be concatenated into a single string, 'columns_combined'.
df['columns_combined'] = ''
for j in range(df.shape[1]-1):
    df['columns_combined'] += df.loc[:, 'c'+str(j)]
    
#Save all of the Wikipedia data as a string list of length 7318 for later user.
wiki = df['columns_combined'].tolist()

In [3]:
def article_title(string):
    """
    Returns the title of the Wikipedia article in df, usually embedded in the first column.
    """
    #The article titles are at the beginning of the string, before the first '\t'. Also, title is saved as a list, hence the [0].
    title = re.findall(r'.+?\t', string)[0]
    #We don't want to return the \t, so we eliminate the last character.
    return title[:len(title)-1]

#Add the new 'article_title' variable to df.
#Usually the article title is in c0. However, it sometimes spills over to c1 and c2 as well.
df['article_title'] = (df['c0'] + df['c1'] + df['c2']).apply(article_title)

**LexRank Summarizer**

In [4]:
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def lex_rank_summarizer(string, n=1):
    """
    For text summarization; this returns the top n most relevant sentences from a string of sentences. It uses LexRank, which is
    an algorithm that uses word2vec to compute the pairwise similarities between each of the sentences and then ranks them.
    """
    parser = PlaintextParser.from_string(string,Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, n)   
    sentence_list = [str(summary[i]) for i in range(n)]
    
    return sentence_list


#Example of lex_rank_summarizer() usage: the 3 most relevant sentences.
article_title = df.loc[7317, 'article_title']
print('Wikipedia article title: ' + article_title)

doc = wiki[7317] #The Wikipedia article with row index of 7317.
n = 3
top_n_sentences = lex_rank_summarizer(doc, n)
print('Top ' + str(n) + ' sentences: ' + '\n')
for i in range(n):
    print(str(i+1) + ') ' + top_n_sentences[i] + '\n')

Wikipedia article title: Random close pack
Top 3 sentences: 

1) Random close pack	https://en.wikipedia.org/wiki/Random_close_pack	"Random close packing(RCP) is an empirical parameter used to characterize the maximum volume fraction of solid objects obtained when they are packed randomly.

2) But shaking cannot increase the density indefinitely a limit is reached and if this is reached without obvious packing into a regular crystal lattice this is the empirical random close-packed density.

3) The definition of packing fraction can be given as: ""the volume taken by number of particles in a given space of volume"".



**Luhn Summarizer**

In [5]:
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer

def luhn_summarizer(string, n=1):
    """
    For text summarization; this returns the top n most relevant sentences from a string of sentences. 
    It uses Luhn, a heuristic method.
    """    
    parser = PlaintextParser.from_string(string,Tokenizer("english"))
    summarizer_luhn = LuhnSummarizer()
    summary = summarizer_luhn(parser.document,n)
    sentence_list = [str(summary[i]) for i in range(n)]
    
    return sentence_list


#Example of luhn_summarizer() usage: the 3 most relevant sentences.
article_title = df.loc[7317, 'article_title']
print('Wikipedia article title: ' + article_title)

doc = wiki[7317] #The Wikipedia article with row index of 7317.
n = 3
top_n_sentences = luhn_summarizer(doc, n)
print('Top ' + str(n) + ' sentences: ' + '\n')
for i in range(n):
    print(str(i+1) + ') ' + top_n_sentences[i] + '\n')

Wikipedia article title: Random close pack
Top 3 sentences: 

1) For example when a solid container is filled with grain shaking the container will reduce the volume taken up by the objects thus allowing more grain to be added to the container.

2) But shaking cannot increase the density indefinitely a limit is reached and if this is reached without obvious packing into a regular crystal lattice this is the empirical random close-packed density.

3) Thus RCP is the packing fraction given by the limit as the tapping amplitude goes to zero and the limit as the number of taps goes to infinity.



**Latent Semantic Analysis Summarizer**

In [6]:
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

def lsa_summarizer(string, n=1):
    """
    For text summarization; this returns the top n most relevant sentences from a string of sentences. 
    It uses Latent Semantic Analysis.
    """    
    parser = PlaintextParser.from_string(string,Tokenizer("english"))
    summarizer_lsa = LsaSummarizer()
    summary = summarizer_lsa(parser.document,n)
    sentence_list = [str(summary[i]) for i in range(n)]
    
    return sentence_list

#Example of lsa_summarizer() usage: the 3 most relevant sentences.
article_title = df.loc[7317, 'article_title']
print('Wikipedia article title: ' + article_title)

doc = wiki[2019] #The Wikipedia article with row index of 7317.
n = 3
top_n_sentences = lsa_summarizer(doc, n)
print('Top ' + str(n) + ' sentences: ' + '\n')
for i in range(n):
    print(str(i+1) + ') ' + top_n_sentences[i] + '\n')

Wikipedia article title: Random close pack
Top 3 sentences: 

1) While a conventional CCA generalizes principal component analysis(PCA) to two sets of random variables a gCCA generalizes PCA to more than two sets of random variables.

2) They can always be made to vanish by introducing a new regression parameter for each common factor.

3) Strother S.C.; Soltanian-Zadeh H.(2012) ""Enhancing reproducibility of fMRI statistical maps using generalized canonical correlation analysis in NPAIRS framework"" NeuroImage 60(4): 1970–1981.



**TextRank**

In [7]:
#gensim is for topic modeling and similarity retrieval.
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords 

#Used for retrieving text from a webpage.
from bs4 import BeautifulSoup
from urllib.request import urlopen
 
def text_rank_summarizer(url, n=10):
    """ 
    This returns the n most relevant terms from a webpage, unless there are less than n, in which case all are returned.
    The terms are in descending order of relevance. Note that a group of words with a common root can be returned but only 
    count as 1, so the returned list can actually be longer than n.
    """
    page = urlopen(url)
    soup = BeautifulSoup(page, "lxml")
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    
    #We need to count the number of terms so that we can use it in the min() function. Otherwise, if we input a value of
    #n which is too large for the given URL then an error will be returned.
    number_terms = len(keywords(str(text)).split('\n'))
    
    #Retrieving the relevant terms, saved as a list.
    relevant_terms = keywords(str(text), words=min(n, number_terms)).split('\n')
    
    return relevant_terms


#Example of relevant_terms() usage. Note that the terms are lists in descending order of relevance.
text_rank_summarizer("https://en.wikipedia.org/wiki/Calculus", n=10)

['calculus',
 'functions',
 'function',
 'infinitesimal',
 'infinitesimals',
 'infinitesimally',
 'mathematical',
 'mathematics',
 'mathematically',
 'integral',
 'integration',
 'integrals',
 'integrated',
 'leibniz',
 'differential',
 'differentiation',
 'time',
 'times',
 'newton',
 'limit',
 'limits',
 'limiting']