In [1]:
import pandas as pd
import numpy as np
import re
import nltk

In [2]:
#The CSV file is not rectangular, so pandas won't load it without explicitly specified column names.
#The longest row in the file has 3392 fields, hence the names 'c0', 'c1', ..., 'c3391'.
#Every field is read as a string and missing values are replaced by empty strings.
df = (
    pd.read_csv(
        'wikipedia_machine_learning.csv',
        names=['c' + str(i) for i in range(3392)],
        dtype='str',
        encoding='utf-8',
    )
    .fillna('')
)

In [3]:
#For ease of text analysis, all of the raw columns of a given row are concatenated (in column order, without a
#separator) into a single variable, 'columns_combined'.
#The raw columns are selected by name ('c' followed by digits) instead of by counting the columns of df, so this
#cell can be re-run safely after further derived columns have been added to df.
raw_columns = [name for name in df.columns if name[:1] == 'c' and name[1:].isdigit()]
#Each row is joined once, rather than accumulating the result through one whole-column addition per raw column.
#name=None yields plain tuples, which avoids building a namedtuple with thousands of fields.
df['columns_combined'] = [''.join(row) for row in df[raw_columns].itertuples(index=False, name=None)]

I will proceed by building various functions that will allow me to create the following new columns from the existing columns of df:

• *url* (Wikipedia URL of the article) <br />
• *article_title* <br />
• *section_titles* (in the article) <br />
• *subsection_titles* (in the article) <br />
• *first_sentence* (of the article) <br />
• *article_length* (number of characters) <br />

In [4]:
#I ended up not using this function.
def wikipedia_url(string):
    """
    Returns the URL of the Wikipedia article, usually embedded in the first column.

    Input: the full article text, found in df['columns_combined'].
    Output: the content of the first tab-delimited field of the input, without the surrounding tab characters and
    without a leading 'http://' or 'https://'. An empty string is returned when the input has no such field.
    """
    #The URL is the first piece of text enclosed between two tab characters.
    match = re.search(r'\t(.+?)\t', string)
    if match is None:
        #No tab-delimited field at all: report "no URL" instead of raising an IndexError.
        return ''
    #The 'https://' prefix makes the strings be saved as URLs, truncating some URL ends and leading to erroneous
    #pages, so the scheme is removed. It is matched explicitly rather than sliced off by position, so a field that
    #starts with 'http://' (or with no scheme) is not mangled.
    return re.sub(r'^https?://', '', match.group(1))

#Store the URL extracted from every article in a new 'url' column of df.
df['url'] = df['columns_combined'].map(wikipedia_url)

In [5]:
def article_title(string):
    """
    Returns the title of the Wikipedia article, usually embedded in the first column.

    Input: the beginning of an article row (the notebook passes the concatenation of the first three columns).
    Output: the text that precedes the first tab character of the match, without the tab itself. An empty string is
    returned when the input contains no text followed by a tab.
    """
    #The article title is at the beginning of the string, before the first tab character.
    match = re.search(r'.+?\t', string)
    if match is None:
        #No tab found: there is no delimited title, so report none instead of raising an IndexError.
        return ''
    #We don't want to return the tab, so the last character of the match is dropped.
    return match.group(0)[:-1]

#Store the title of every article in a new 'article_title' column of df.
#The title usually sits in c0 but sometimes spills over into c1 and c2, so the three columns are joined first.
df['article_title'] = (df['c0'] + df['c1'] + df['c2']).map(article_title)

In [6]:
def article_section_titles(string):
    """
    Input: the full article text, found in df['columns_combined'].
    Output: a list of section titles for the input Wikipedia page (an empty list if the page has none).
    Typically, the section titles of an article appear within 4 equal signs, with a space before the first, a space after
    the second and no space before the third.
    e.g. ' == Types of artificial intelligence=='
    """
    #Every substring of the form ' == ...==' is a section heading.
    headings = re.findall(r' == .+?==', string)
    #The = signs are removed, as well as the spaces at the beginning of each section title.
    #Plain string methods are used, so a page without section titles simply yields an empty list and no special
    #case (or intermediate pandas Series) is needed.
    return [heading.replace('=', '').lstrip(' ') for heading in headings]

#Store the list of section titles of every article in a new 'section_titles' column of df.
df['section_titles'] = df['columns_combined'].map(article_section_titles)

In [7]:
def article_subsection_titles(string):
    """
    Input: the full article text, found in df['columns_combined'].
    Output: a list of subsection titles for the input Wikipedia page (an empty list if the page has none).
    Typically, the subsection titles of an article appear within 6 equal signs, with a space before the first, a space
    after the third and no space before the fourth.
    e.g. ' === Metric==='
    """
    #Every substring of the form ' === ...===' is a subsection heading.
    headings = re.findall(r' === .+?===', string)
    #The = signs are removed, as well as the spaces at the beginning of each subsection title.
    #Plain string methods are used, so a page without subsection titles simply yields an empty list and no special
    #case (or intermediate pandas Series) is needed.
    return [heading.replace('=', '').lstrip(' ') for heading in headings]
    
#Store the list of subsection titles of every article in a new 'subsection_titles' column of df.
df['subsection_titles'] = df['columns_combined'].map(article_subsection_titles)

In [8]:
def article_first_sentence(string):
    """
    Returns the first sentence of the Wikipedia article, identified by the first period after the URL.
    Note that the commas from the Wikipedia article will not appear.

    Input: the full article text, found in df['columns_combined'].
    Output: the text from the second tab-to-period match up to and including that period, without the leading tab
    and without a double quote directly after the tab. An empty string is returned when there is no such match.
    """
    #A "sentence candidate" starts at a tab character and runs up to the next period.
    tab_to_period = re.compile(r'\t.+?\.')

    #The first candidate is the start of the URL, '<tab>https://en.'; it is only needed to know where to resume.
    url_match = tab_to_period.search(string)
    if url_match is None:
        return ''

    #The second candidate is the first sentence. Searching onward from the end of the first match finds the same
    #substring as collecting all matches would, without scanning the rest of the (possibly very long) article.
    sentence_match = tab_to_period.search(string, url_match.end())
    if sentence_match is None:
        #For some articles there is no sentence in the dataset.
        return ''

    sentence = sentence_match.group(0)
    #Sometimes the second tab is followed by a double quote, and then the first sentence begins.
    if sentence[1] == '"':
        #Return the string without the tab and the quote.
        return sentence[2:]
    #Other times the sentence begins immediately after the second tab.
    return sentence[1:]

#Store the first sentence of every article in a new 'first_sentence' column of df.
df['first_sentence'] = df['columns_combined'].map(article_first_sentence)

In [9]:
#This function ended up not being used in the analysis.
def article_length(string):
    """
    Input: the full article text, found in df['columns_combined'].
    Output: the number of characters that the dataset holds for this article.
    """
    character_count = len(string)
    return character_count

#Store the character count of every article in a new 'article_length' column of df.
df['article_length'] = df['columns_combined'].map(article_length)

In [10]:
#Preview the first three rows, showing only the columns located after the 3392 raw 'c...' columns.
df.head(3).iloc[:, 3392:]

Unnamed: 0,columns_combined,url,article_title,section_titles,subsection_titles,first_sentence,article_length
0,Outline of artificial intelligence\thttps://en...,en.wikipedia.org/wiki/Outline_of_artificial_in...,Outline of artificial intelligence,[What type of thing is artificial intelligence...,"[By approach, By application, Integrated AI sy...",The following outline is provided as an overvi...,24206
1,Outline of computer vision\thttps://en.wikiped...,en.wikipedia.org/wiki/Outline_of_computer_vision,Outline of computer vision,"[Branches of computer vision, History of compu...","[Image enhancement, Transformations, Filtering...",The following outline is provided as an overvi...,4431
2,Outline of natural language processing\thttps:...,en.wikipedia.org/wiki/Outline_of_natural_langu...,Outline of natural language processing,"[Natural language processing, Prerequisite tec...","[Applications, Component processes, Timeline o...",The following outline is provided as an overvi...,49401


In [11]:
def preprocess(column):
    """
    This function preprocesses a column of strings (e.g. 'article_title') or a column of lists of strings (e.g.
    'section_titles'). The input is the name of a column of df, written as a string (e.g. 'article_title' or
    'section_titles'). Preprocessing consists of removing punctuation, converting to lower-case and removing stop
    words. The output is a series of all of the words that occur among all the rows of the input column, in order of
    appearance, where each entry is a single word.
    """
    #The column is iterated directly (in row order) rather than looked up label by label, so the function does not
    #depend on df having the default 0..n-1 index.
    values = df[column]

    #The entries of 'section_titles' and 'subsection_titles' are lists of strings. These are flattened into a single
    #list of titles for the following preprocessing steps to work.
    if column in ['section_titles', 'subsection_titles']:
        pieces = [title for titles in values for title in titles]
    #The entries of 'article_title', 'first_sentence' and 'columns_combined' are strings.
    else:
        pieces = list(values)

    #Combine everything into a single space-separated string. A single join is used because repeatedly appending
    #to a growing string copies it again and again.
    string = ' '.join(pieces)

    #Tokenize string into words and remove punctuation.
    tokens = nltk.RegexpTokenizer(r'\w+').tokenize(string)

    #These are default stop words.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    #These are additional stop words I have chosen by looking through the most common words in 'section_titles'.
    extra_stop_words = ['see', 'references', 'also', 'links', 'external', 'history', 'reading', 'notes', 'examples',
                        'definition', 'overview', 'example', 'related', 'bibliography', 'use', 'users', 'legal', 'two']
    stopwords.update(extra_stop_words)

    #Convert words to lower-case, then remove the stop words.
    lowered = (token.lower() for token in tokens)
    word_list = [word for word in lowered if word not in stopwords]

    #Convert to a series so that we can apply Pandas methods to the output.
    return pd.Series(word_list)

In [12]:
def concatenated_ngrams(preprocessed_column, n):
    """
    This function takes a series of words as an input, and is intended specifically to take an output from
    preprocess() as its input. It returns the ngrams of the input: a series of strings where each string consists of
    n consecutive words separated by single spaces.

    Input: preprocessed_column, a series (or any iterable) of strings; n, the number of words per ngram (n >= 1).
    Output: a series with one entry per ngram, in order of appearance. It is empty when the input has fewer than n
    words.
    Raises: ValueError if n is smaller than 1.
    """
    if n < 1:
        #Fail loudly rather than silently returning None for an unsupported n.
        raise ValueError('n must be a positive integer, got ' + repr(n))

    words = list(preprocessed_column)
    #Slide a window of n words over the list; each window becomes one space-separated string.
    #For n = 2 and n = 3 this produces 'w1 w2' and 'w1 w2 w3' respectively.
    L = [' '.join(words[start:start + n]) for start in range(len(words) - n + 1)]
    #Convert to a series.
    return pd.Series(L)

**ARTICLE TITLES**

In [13]:
#Turn the article_title column into one flat series of cleaned words.
article_title_preprocessed = preprocess('article_title')

#Word frequencies over all the rows in article_title, most common first.
titles_words_tallied = (
    article_title_preprocessed
    .value_counts()
    .sort_values(ascending=False)
)

#Bigram frequencies over all the rows in article_title, most common first.
titles_2grams_tallied = (
    concatenated_ngrams(article_title_preprocessed, 2)
    .value_counts()
    .sort_values(ascending=False)
)

#Trigram frequencies over all the rows in article_title, most common first.
titles_3grams_tallied = (
    concatenated_ngrams(article_title_preprocessed, 3)
    .value_counts()
    .sort_values(ascending=False)
)


In [14]:
#The first three entries of an output of preprocess(), as an example of how it looks.
article_title_preprocessed.head(3)

0         outline
1      artificial
2    intelligence
dtype: object

In [15]:
#The three most common words, as an example of how a ..._words_tallied object looks.
titles_words_tallied.head(3)

list        225
analysis    146
theory      133
dtype: int64

In [16]:
#The three most common bigrams, as an example of how a ..._2grams_tallied object looks.
titles_2grams_tallied.head(3)

artificial intelligence    33
machine learning           26
neural network             25
dtype: int64

In [17]:
#The three most common trigrams, as an example of how a ..._3grams_tallied object looks.
titles_3grams_tallied.head(3)

natural language processing     7
buffalo buffalo buffalo         6
principal component analysis    5
dtype: int64

**SECTION TITLES**

In [18]:
#Turn the section_titles column into one flat series of cleaned words.
section_titles_preprocessed = preprocess('section_titles')

#Word frequencies over all the rows in section_titles, most common first.
section_titles_words_tallied = (
    section_titles_preprocessed
    .value_counts()
    .sort_values(ascending=False)
)

#Bigram frequencies over all the rows in section_titles, most common first.
section_titles_2grams_tallied = (
    concatenated_ngrams(section_titles_preprocessed, 2)
    .value_counts()
    .sort_values(ascending=False)
)

#Trigram frequencies over all the rows in section_titles, most common first.
section_titles_3grams_tallied = (
    concatenated_ngrams(section_titles_preprocessed, 3)
    .value_counts()
    .sort_values(ascending=False)
)

**SUBSECTION TITLES**

In [19]:
#Turn the subsection_titles column into one flat series of cleaned words.
subsection_titles_preprocessed = preprocess('subsection_titles')

#Word frequencies over all the rows in subsection_titles, most common first.
subsection_titles_words_tallied = (
    subsection_titles_preprocessed
    .value_counts()
    .sort_values(ascending=False)
)

#Bigram frequencies over all the rows in subsection_titles, most common first.
subsection_titles_2grams_tallied = (
    concatenated_ngrams(subsection_titles_preprocessed, 2)
    .value_counts()
    .sort_values(ascending=False)
)

#Trigram frequencies over all the rows in subsection_titles, most common first.
subsection_titles_3grams_tallied = (
    concatenated_ngrams(subsection_titles_preprocessed, 3)
    .value_counts()
    .sort_values(ascending=False)
)

**FIRST SENTENCES**

In [20]:
#Preprocessing the first_sentence column.
first_sentences_preprocessed = preprocess('first_sentence')

#Counting and sorting the most common words among all the rows in first_sentence.
#The series preprocessed above is reused here, so the column is only preprocessed once.
first_sentences_words_tallied = first_sentences_preprocessed.value_counts()\
                                                            .sort_values(ascending=False)

#Counting and sorting the most common bigrams among all the rows in first_sentence.
first_sentence_2grams_tallied = concatenated_ngrams(first_sentences_preprocessed, 2).value_counts()\
                                                                                    .sort_values(ascending=False)

#Counting and sorting the most common trigrams among all the rows in first_sentence.
first_sentence_3grams_tallied = concatenated_ngrams(first_sentences_preprocessed, 3).value_counts()\
                                                                                    .sort_values(ascending=False)

**COLUMNS COMBINED**

In [21]:
#Turn the columns_combined column into one flat series of cleaned words.
columns_combined_preprocessed = preprocess('columns_combined')

#Word frequencies over all the rows in columns_combined, most common first.
columns_combined_words_tallied = (
    columns_combined_preprocessed
    .value_counts()
    .sort_values(ascending=False)
)

#Bigram frequencies over all the rows in columns_combined, most common first.
columns_combined_2grams_tallied = (
    concatenated_ngrams(columns_combined_preprocessed, 2)
    .value_counts()
    .sort_values(ascending=False)
)

#Trigram frequencies over all the rows in columns_combined, most common first.
columns_combined_3grams_tallied = (
    concatenated_ngrams(columns_combined_preprocessed, 3)
    .value_counts()
    .sort_values(ascending=False)
)