In [110]:
import wikipedia as wiki
import pandas as pd
import numpy as np
import hashlib 
import networkx as nx

# Data Collection

## Get the article titles

In [111]:
def get_hash_id(k):
    """Return a short, deterministic ID for the string ``k``.

    The ID is the first 10 hexadecimal characters of the SHA-1 digest of
    ``k`` encoded as UTF-8 (i.e. 40 bits of the hash, not 10 bytes).
    """
    digest = hashlib.sha1(k.encode("UTF-8")).hexdigest()
    return digest[:10]

In [112]:
def get_articles(query):
    """Return a DataFrame of Wikipedia articles found for ``query``.

    Parameters
    ----------
    query : str
        Search string passed to ``wiki.search`` (up to 100 results requested).

    Returns
    -------
    pandas.DataFrame
        One row per article, indexed by ``get_hash_id(title)``, with the
        columns ``"title"`` and ``"content"``. Articles whose page could not
        be loaded are kept with an empty string as content.

    Warns
    -----
    UserWarning
        Emitted once, listing how many (and which) titles failed to load.
    """
    import warnings  # local import so this cell does not depend on another edit

    # With suggestion=True the search call returns a pair; element 0 is
    # the list of titles (presumably element 1 is the suggestion — see the
    # wikipedia package docs).
    titles = wiki.search(query, suggestion=True, results=100)[0]

    # Collect rows keyed by hash ID; a repeated title simply overwrites its
    # own row, exactly as assigning into the same DataFrame row would.
    records = {}
    failed_titles = []
    for title in titles:
        try:
            content = wiki.page(title, auto_suggest=False).content
        except Exception:
            # Best effort: keep the article with empty content, but remember
            # the failure so it can be reported instead of silently dropped.
            content = ''
            failed_titles.append(title)
        records[get_hash_id(title)] = (title, content)

    if failed_titles:
        warnings.warn(
            f"{len(failed_titles)} article(s) for query {query!r} failed to "
            f"load and were stored with empty content: {failed_titles}"
        )

    # Build the frame in one go rather than growing it cell by cell.
    return pd.DataFrame.from_dict(
        records, orient="index", columns=["title", "content"]
    )

In [113]:
# Download the search results for each topic of interest:
# * Computer Science,
# * Neuroscience, and
# * Mathematics.
# Each call returns a DataFrame indexed by hash ID with "title" and "content" columns.

comp_sci = get_articles("Computer Science")
neuro_sci = get_articles("Neuroscience")
maths = get_articles("Mathematics")



  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


### save/read data

In [121]:
def _load_or_create_cache(articles, path):
    """Read the cached article CSV at ``path``, creating it first if missing.

    articles: the freshly downloaded DataFrame, written to ``path`` only when
              no cache file exists yet.
    path:     location of the CSV cache (index stored as ``hash_id``).

    Returns the DataFrame as read back from disk, indexed by ``hash_id``, with
    missing values replaced by empty strings so that every frame used
    downstream has gone through the same CSV round-trip.
    """
    try:
        return pd.read_csv(path, index_col="hash_id").fillna('')
    except FileNotFoundError:
        # First run on this machine: persist the download, then load it the
        # same way as on later runs.
        articles.to_csv(path, index_label="hash_id")
        return pd.read_csv(path, index_col="hash_id").fillna('')


comp_sci = _load_or_create_cache(comp_sci, "wiki_comp_sci.csv")
neuro_sci = _load_or_create_cache(neuro_sci, "wiki_neurosci.csv")
maths = _load_or_create_cache(maths, "wiki_maths.csv")

# Data Analysis

## Semantic Analysis

In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [123]:
def similarity_matrix(corpus, column=None): 
    """
    Compute pairwise TF-IDF similarity between the documents of a corpus.

    corpus: a pandas DataFrame that contains the documents.
    column: the column to be used for pairwise comparison. Although it
            defaults to None, a real column name must be supplied; the
            lookup ``corpus[column]`` fails otherwise.

    Returns a square DataFrame, indexed and labelled by ``corpus.index``,
    whose entry (i, j) is the dot product of the TF-IDF vectors of documents
    i and j. With TfidfVectorizer's default L2 row normalisation this dot
    product is the cosine similarity.
    """

    docs = corpus[column].to_numpy()                  # documents to compare
    tfidf = TfidfVectorizer().fit_transform(docs)     # sparse (n_docs x n_terms) TF-IDF matrix

    # Use the explicit matrix-product operator: `*` only means matrix
    # multiplication for legacy scipy.sparse *matrix* objects and is
    # element-wise for sparse *arrays*, whereas `@` is correct for both.
    pairwise_similarity = (tfidf @ tfidf.T).toarray() # dense (n_docs x n_docs) array

    return pd.DataFrame(
        pairwise_similarity,
        index=corpus.index,
        columns=corpus.index,
    )

In [124]:
# Pairwise similarity of the article texts ("content" column), one matrix per topic.
comp_sci_sim_matrix = similarity_matrix(comp_sci, "content")
neuro_sci_sim_matrix = similarity_matrix(neuro_sci, "content")
maths_sim_matrix = similarity_matrix(maths, "content")

## Graph construction

In [146]:
def create_graph(df, sim_matrix): 
    """
    Build a complete, undirected similarity graph over the documents in 'df'.

    df: a pandas DataFrame containing the metadata and data of the documents.
        Its index provides the node IDs and its "title" column the node titles.
    sim_matrix: a pandas DataFrame containing pairwise comparisons for each
        document in 'df', with rows and columns labelled by the same IDs.

    Returns a networkx.Graph with
      * one node per document, carrying the attributes 'title' and 'avg_sim'
        (the mean of the document's full row in 'sim_matrix', which includes
        its similarity to itself), and
      * one edge per pair of distinct documents, carrying the attribute
        'similarity'.
    """
    ids = list(df.index)
    g = nx.Graph()

    # Nodes first, so the node order follows the order of df.index.
    for node in ids:
        row = sim_matrix.loc[node]          # look the row up once, not twice
        avg_sim = sum(row) / len(row)       # row mean, self-similarity included
        g.add_node(node, title=df.loc[node, "title"], avg_sim=avg_sim)

    # The graph is undirected, so each unordered pair needs a single
    # add_edge call. Visiting both (a, b) and (b, a) would only overwrite the
    # same edge; the value that would survive that overwrite is the one read
    # from the row of the later document, so that is the one used here.
    for position, later in enumerate(ids):
        for earlier in ids[:position]:
            if earlier != later:
                sim = sim_matrix.loc[later, earlier]
                g.add_edge(earlier, later, similarity=sim)

    return g


In [147]:
# Build the similarity graph of every topic and export it as GraphML.
for articles, similarities, graphml_path in (
    (comp_sci, comp_sci_sim_matrix, "comp_sci.graphml"),
    (neuro_sci, neuro_sci_sim_matrix, "neuro_sci.graphml"),
    (maths, maths_sim_matrix, "maths.graphml"),
):
    topic_graph = create_graph(articles, similarities)
    nx.write_graphml(topic_graph, graphml_path)

In [153]:
# Spot-check a single article by its hash ID; the output below shows that its content is empty.
comp_sci.loc['7d5536610a']

title      Dynamics
content            
Name: 7d5536610a, dtype: object

In [163]:
# Total similarity of one article to every article in the matrix (its own diagonal entry included).
sum(comp_sci_sim_matrix.loc['0957e6b9c8'])

16.1104910179196

In [168]:
# Same row-sum check for a second article, for comparison with the value above.
sum(comp_sci_sim_matrix.loc['d16723e94c'])

19.436731484059344