In [2]:
import wikipedia as wiki
import pandas as pd
import numpy as np
import hashlib 
import networkx as nx

# Data Collection

## Get the article titles

In [10]:
def get_hash_id(k): 
    """Returns a 10-byte hash ID for a given string."""

    bytes_ = k.encode("UTF-8")                  
    hash_id = hashlib.sha1(bytes_).hexdigest() 
    hash_id = hash_id[:10]  
    return hash_id

In [11]:
def get_articles(query):
    """Returns a DataFrame with articles on the specified query, q."""

    # track titles that failed to load content
    count = 0

    # create DataFrame to store title, and content
    df = pd.DataFrame(columns=["title", "content"])

    # get the titles from Wikipedia
    titles = wiki.search(query, suggestion=True, results=100)[0]    

    # iterate through titles, store the content for each one in the DataFrame 
    for title in titles: 
        hash_id = get_hash_id(title)

        try: 
            content = wiki.page(title, auto_suggest=False).content
        except Exception: 
            content = ''
        df.at[hash_id, "title"] = title
        df.at[hash_id, "content"] = content

    return df 

In [12]:
# Get articles for each of the following topics: 
# * Computer Science, 
# * Neuroscience, and 
# * Mathematics.

comp_sci = get_articles("Computer Science")
neuro_sci = get_articles("Neuroscience")
maths = get_articles("Mathematics")



  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


### save/read data

In [3]:
comp_sci.to_csv("wiki_comp_sci.csv")
neuro_sci.to_csv("wiki_neurosci.csv")
maths.to_csv("wiki_maths.csv")

# comp_sci = pd.read_csv("wiki_comp_sci.csv")
# neuro_sci = pd.read_csv("wiki_neurosci.csv")
# maths = pd.read_csv("wiki_maths.csv")

# Data Analysis

## Semantic Analysis

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
def similarity_matrix(corpus, column=None): 
    """
    corpus: a pandas DataFrame that contains the documents.
    column: the column to be used to performed pairwise comparison. 
    
    Returns, a DataFrame with pairwise comparisons for each document. 
    """

    tfidf = TfidfVectorizer().fit_transform(corpus[column])     # vectorize the documents
    pairwise_similarity = tfidf * tfidf.T                       # compute the pairwise cosine similarity 
    
    df = pd.DataFrame(
        pairwise_similarity,
        index=corpus.index, 
        columns=corpus.index,
    )

    return pairwise_similarity.toarray()

In [9]:
comp_sci_sim_matrix = similarity_matrix(comp_sci, "content")

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
cosine_similarity = pd.DataFrame(                   # create DataFrame to store cosine similarity values
    pairwise_similarity.toarray(), 
    index=pages_df["hash_id"], 
    columns=pages_df["hash_id"])
cosine_similarity

In [None]:
cosine_similarity['average_similarity'] = cosine_similarity.apply(lambda r: sum(r)/len(r), axis=1)

## Graph construction

In [None]:
pages_df.index = pages_df.hash_id
pages_df = pages_df.drop(labels=['hash_id'], axis=1)

In [None]:
# create a nx.Graph object 
pages_graph = nx.Graph()
page_ids = cosine_similarity.index.to_list() # get list of article IDs

# iterate over the page IDs
for left_node in page_ids: 
    # add the article and its title to the graph
    pages_graph.add_node(
        left_node, 
        title=pages_df.loc[left_node]["title"], 
        average_similarity=cosine_similarity.loc[left_node]["average_similarity"]
    )

    # iterate over the other page IDs
    for right_node in page_ids:
        # avoid self-loops (when a node has an edge to itself)
        if left_node != right_node: 
            # add the node and its edges to the graph (cosine similarity score)
            pages_graph.add_edge(left_node, right_node, cosine_similarity=cosine_similarity.loc[left_node, right_node])

# Sample node:edge 
print(f"f511669021 -> 972d8cef69 - Cosine Similarity: {pages_graph.get_edge_data('f511669021', '972d8cef69')}")
for n in pages_graph.nodes.data(): print(n)


In [None]:
nx.write_graphml(pages_graph, "wikipedia-graph.graphml")

In [None]:
cosine_similarity.average_similarity.sort_values()

In [None]:
titles