In [110]:
import wikipedia as wiki
import pandas as pd
import numpy as np
import hashlib 
import networkx as nx

# Data Collection

## Get the article titles

In [111]:
def get_hash_id(k):
    """Derive a short, deterministic identifier from a string.

    The string is UTF-8 encoded and hashed with SHA-1; the first 10
    hexadecimal characters of the digest are returned.
    """
    digest = hashlib.sha1(k.encode("UTF-8")).hexdigest()
    return digest[:10]

In [112]:
def get_articles(query, results=100):
    """Fetch the Wikipedia articles returned by a search query.

    query: the search string sent to Wikipedia.
    results: maximum number of search hits to request (defaults to 100).

    Returns a DataFrame indexed by the hash ID of each title (see
    get_hash_id) with the columns "title" and "content". A title whose
    page could not be loaded is kept, with an empty string as content.
    """
    # with suggestion=True the search returns a (titles, suggestion) pair;
    # only the list of titles is used
    titles = wiki.search(query, suggestion=True, results=results)[0]

    # collect the rows keyed by hash ID; a repeated title simply overwrites
    # its earlier row, so the index stays unique
    rows = {}
    for title in titles:
        try:
            content = wiki.page(title, auto_suggest=False).content
        except Exception:
            # best effort: any failure while loading the page leaves the
            # article in the result with empty content instead of aborting
            content = ''
        rows[get_hash_id(title)] = (title, content)

    # build the frame once instead of enlarging it cell by cell
    return pd.DataFrame(
        list(rows.values()),
        index=list(rows.keys()),
        columns=["title", "content"],
    )

In [113]:
# Collect one article DataFrame per topic, in this order:
#   Computer Science, Neuroscience, Mathematics.
comp_sci, neuro_sci, maths = map(
    get_articles,
    ("Computer Science", "Neuroscience", "Mathematics"),
)



  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


### Save/read data

In [121]:
# To persist a fresh scrape, uncomment the three lines below before loading:
# comp_sci.to_csv("wiki_comp_sci.csv", index_label="hash_id")
# neuro_sci.to_csv("wiki_neurosci.csv", index_label="hash_id")
# maths.to_csv("wiki_maths.csv", index_label="hash_id")

# Load the cached articles, indexed by hash ID; missing cells become ''.
comp_sci, neuro_sci, maths = (
    pd.read_csv(csv_name, index_col="hash_id").fillna('')
    for csv_name in ("wiki_comp_sci.csv", "wiki_neurosci.csv", "wiki_maths.csv")
)

# Data Analysis

## Semantic Analysis

In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [169]:
def similarity_matrix(corpus, column=None): 
    """
    Build the pairwise TF-IDF similarity table for the documents in a DataFrame.

    corpus: a pandas DataFrame that contains the documents.
    column: name of the column holding the document text. Despite the None
            default it has to be supplied: the value is used directly to
            select corpus[column].

    Returns a DataFrame indexed by corpus.index with one column per document
    (also labelled by corpus.index) holding the pairwise similarity, plus an
    extra "avg_sim" column with the mean of each document's row. That mean
    includes the document's similarity with itself.
    """

    # missing text is treated as an empty document, matching the fillna('')
    # applied when the articles are read back from CSV
    docs = corpus[column].fillna('').to_numpy()

    # TfidfVectorizer L2-normalises each row by default, so the dot product
    # of two rows is their cosine similarity
    tfidf = TfidfVectorizer().fit_transform(docs)

    # `@` is the matrix product for both SciPy sparse matrices and sparse
    # arrays, whereas `*` is element-wise for sparse arrays
    pairwise_similarity = (tfidf @ tfidf.T).toarray()

    df = pd.DataFrame(
        pairwise_similarity,
        index=corpus.index,
        columns=corpus.index
    )
    # row-wise mean computed on the raw array, before the extra column exists
    df["avg_sim"] = pairwise_similarity.mean(axis=1)
    return df

In [176]:
comp_sci_sim_matrix = similarity_matrix(comp_sci, "content")
neuro_sci_sim_matrix = similarity_matrix(neuro_sci, "content")
maths_sim_matrix = similarity_matrix(maths, "content")

## Graph construction

In [181]:
def create_graph(df, sim_matrix): 
    """
    Build an undirected, fully connected similarity graph of the documents.

    df: a pandas DataFrame containing the metadata and data of the documents;
        its index supplies the node IDs and its "title" column the node titles.
    sim_matrix: a pandas DataFrame containing pairwise comparisons for each
        document in 'df', plus an "avg_sim" column (the layout produced by
        similarity_matrix).

    Returns a networkx Graph with one node per ID in df.index (attributes
    "title" and "avg_sim") and one edge per pair of distinct IDs carrying the
    pair's score as the "similarity" attribute.
    """
    ids = list(df.index)
    g = nx.Graph()

    # add every node first, in index order, with its metadata
    for node in ids:
        g.add_node(
            node,
            title=df.at[node, "title"],
            # cast to built-in float so the attribute does not depend on
            # NumPy scalar support in whatever serialises the graph
            avg_sim=float(sim_matrix.at[node, "avg_sim"])
        )

    # the graph is undirected, so each unordered pair is added exactly once;
    # sim_matrix is expected to be symmetric (as similarity_matrix produces)
    for pos, left_node in enumerate(ids):
        for right_node in ids[pos + 1:]:
            if left_node != right_node:
                sim = float(sim_matrix.at[left_node, right_node])
                g.add_edge(left_node, right_node, similarity=sim)

    return g


In [182]:
# Export one GraphML file per topic from its articles and similarity table.
for articles, sims, out_path in (
    (comp_sci, comp_sci_sim_matrix, "comp_sci.graphml"),
    (neuro_sci, neuro_sci_sim_matrix, "neuro_sci.graphml"),
    (maths, maths_sim_matrix, "maths.graphml"),
):
    nx.write_graphml(create_graph(articles, sims), out_path)

In [153]:
# Inspect a single Computer Science article row, looked up by its hash ID.
comp_sci.loc['7d5536610a']

title      Dynamics
content            
Name: 7d5536610a, dtype: object

In [183]:
# Display the pairwise similarity table (with its avg_sim column) for the
# Computer Science articles.
comp_sci_sim_matrix

hash_id,3bc6836e02,70893819f5,da0ffb854b,45d8304a48,dea998ab8a,d1f729928a,a9ea0f0f7d,b98dd77323,6e9eedfc66,adfad3225d,...,6e51068fde,0759768f6f,d2d0663714,955dd95299,e42f38772c,c6a8e0e4b7,cd7bdd308f,fa5359591c,e0e4f002f6,avg_sim
hash_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3bc6836e02,1.000000,0.351192,0.349443,0.483284,0.375917,0.662862,0.360786,0.380135,0.493992,0.363045,...,0.446702,0.634669,0.755535,0.462521,0.281512,0.526038,0.288456,0.412380,0.487204,0.448877
70893819f5,0.351192,1.000000,0.144511,0.265122,0.173091,0.352756,0.079781,0.208823,0.289779,0.210911,...,0.196689,0.276496,0.349889,0.261699,0.150471,0.280928,0.232536,0.267501,0.219546,0.240193
da0ffb854b,0.349443,0.144511,1.000000,0.206990,0.162012,0.277466,0.141606,0.161025,0.213722,0.153252,...,0.185330,0.254433,0.299567,0.181095,0.122673,0.217079,0.119312,0.177320,0.182966,0.193196
45d8304a48,0.483284,0.265122,0.206990,1.000000,0.245426,0.494974,0.143289,0.288223,0.396612,0.318860,...,0.273930,0.373011,0.453407,0.334134,0.210099,0.400153,0.201094,0.308766,0.262569,0.324241
dea998ab8a,0.375917,0.173091,0.162012,0.245426,1.000000,0.337791,0.132180,0.198647,0.253392,0.189154,...,0.205655,0.318185,0.368221,0.234044,0.139923,0.252796,0.146295,0.203929,0.230741,0.224826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
e42f38772c,0.281512,0.150471,0.122673,0.210099,0.139923,0.282029,0.070695,0.162678,0.232263,0.170293,...,0.153349,0.231027,0.261651,0.193343,1.000000,0.238971,0.112630,0.166159,0.144858,0.193677
c6a8e0e4b7,0.526038,0.280928,0.217079,0.400153,0.252796,0.563126,0.154176,0.303488,0.406035,0.338306,...,0.280984,0.427503,0.479860,0.418285,0.238971,1.000000,0.203475,0.306129,0.290014,0.340466
cd7bdd308f,0.288456,0.232536,0.119312,0.201094,0.146295,0.261350,0.092966,0.146412,0.226329,0.166220,...,0.154483,0.243682,0.265585,0.183444,0.112630,0.203475,1.000000,0.232303,0.163677,0.195202
fa5359591c,0.412380,0.267501,0.177320,0.308766,0.203929,0.402891,0.128867,0.220320,0.332511,0.234769,...,0.238225,0.333706,0.408366,0.282039,0.166159,0.306129,0.232303,1.000000,0.246132,0.273200


19.436731484059344