In [1]:
import pickle
import wikipediaapi

import nltk
from nltk.corpus import stopwords
import wikipediaapi
from nltk.tokenize import RegexpTokenizer
import re
from bs4 import BeautifulSoup
import re

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

import fasttext

# Shared Porter stemmer; later cells use it to compare section titles by stem.
ps = nltk.stem.porter.PorterStemmer()

In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Name of the Wikipedia section this notebook builds a classifier for.
# Its stem is used as the lookup key into `section_dict` further down.
section = "history"

In [4]:
# Load the precomputed section dictionary produced by the "Instance Classification" step.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# pickles that were generated locally / come from a trusted source.
with open('../Instance Classification/section_dict.pkl', 'rb') as f:
    section_dict = pickle.load(f)

## Creating Training Data

In [5]:
# English Wikipedia client that returns section bodies as HTML (get_data parses
# them with BeautifulSoup to pull out <p> paragraphs).
wiki_wiki = wikipediaapi.Wikipedia('en', extract_format = wikipediaapi.ExtractFormat.HTML)
# section_dict is keyed by the *stemmed* section name; each entry unpacks into
# (section_titles, related_keywords). Raises KeyError if the stem is not present.
section_titles, related_keywords = section_dict[ps.stem(section)]

In [6]:
# Accumulators that get_data() fills in place.
data_raw = []   # (keyword, sentence, label) tuples, one per sentence
documents = []  # TaggedDocument objects, one per paragraph
itr = [0]       # one-element list so get_data() can advance the running document tag

def get_data(data_raw, documents, kw, section, itr, max_retries = 3):
    """Download the Wikipedia page for `kw` and append its text to the accumulators.

    Every <p> paragraph of every section returned by `page.sections` becomes one
    TaggedDocument in `documents` (tagged with the running counter `itr[0]`),
    and every sentence of those paragraphs becomes one
    `(kw, sentence, label)` tuple in `data_raw`, where `label` is True when the
    stem of the section title equals the stem of `section`.

    Parameters
    ----------
    data_raw : list
        Extended in place with `(kw, sentence, label)` tuples.
    documents : list
        Extended in place with one TaggedDocument per paragraph.
    kw : str
        Page title to fetch.
    section : str
        Target section name; compared to section titles by Porter stem.
    itr : list
        One-element list holding the next document tag; advanced in place.
    max_retries : int, optional
        Number of additional attempts after the first failure. When all
        attempts fail the keyword is skipped and a message is printed.

    Notes
    -----
    The previous implementation retried through unbounded recursion inside a
    bare `except:`, so a page that kept failing never terminated (and Ctrl-C
    was swallowed too). It also appended paragraphs to `documents` before the
    failure and appended them again on retry. Results are now staged locally
    and committed only after the whole page was processed.
    """
    target_stem = ps.stem(section)
    last_error = None

    for _ in range(max_retries + 1):
        kw_data = []
        kw_paragraphs = []

        try:
            page = wiki_wiki.page(kw)

            for section_ in page.sections:
                title, content = section_.title, section_.text
                is_target = ps.stem(title) == target_stem
                soup = BeautifulSoup(content, "html.parser")

                for p in soup.find_all("p"):
                    p_text = p.text
                    kw_paragraphs.append(p_text.lower())

                    for s in nltk.sent_tokenize(p_text):
                        kw_data.append((kw, s, is_target))

        except Exception as err:
            # Most likely a transient network/API error: try again, but only a
            # bounded number of times. KeyboardInterrupt is deliberately not caught.
            last_error = err
            continue

        # Success: commit the staged results in one go.
        for paragraph in kw_paragraphs:
            # NOTE(review): TaggedDocument normally takes a list of tokens; a raw
            # string is passed here as in the original. Confirm what the
            # downstream Doc2Vec code expects before changing it.
            documents.append(TaggedDocument(paragraph, [itr[0]]))
            itr[0] += 1

        data_raw.extend(kw_data)
        return

    print(f"Skipping {kw!r}: failed after {max_retries + 1} attempts ({last_error!r})")

# Fetch every related keyword's page; get_data() appends to data_raw/documents in place.
for i, kw in enumerate(related_keywords):
    
    # Lightweight progress indicator: print every 100th keyword.
    if i % 100 == 0:
        print(i, kw)

    get_data(data_raw, documents, kw, section, itr)

0 epilepsy
100 backpropagation
200 Algorithmic bias
300 business process modelling
400 composition
500 causality
600 self-organization
700 expert systems


In [7]:
# One row per sentence: the keyword whose page it came from, the sentence text,
# and whether its section title matched the target section (by stem).
df = pd.DataFrame(data_raw, columns = ["keyword", "sentences", "label"])

In [8]:
# Build the training frame keyword by keyword. For each keyword all positive
# sentences are kept; negatives are undersampled to the number of positives
# when they outnumber them (when positives are the majority, everything is kept).
#
# The pieces are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop copies the growing frame on every iteration
# (quadratic) and concatenating onto an empty seed frame is deprecated in
# recent pandas. The sample is seeded so the cell is reproducible on re-run.
balanced_parts = []

for kw in related_keywords:
    df_subset = df[df["keyword"] == kw]

    pos_data = df_subset[df_subset["label"] != False]
    neg_data = df_subset[df_subset["label"] == False]

    num_pos, num_neg = len(pos_data), len(neg_data)

    if num_pos < num_neg:
        neg_data = neg_data.sample(n = num_pos, random_state = 0)

    # Skip empty pieces so they cannot influence the dtypes of the result.
    balanced_parts.extend(part for part in (pos_data, neg_data) if not part.empty)

if balanced_parts:
    data = pd.concat(balanced_parts, axis = 0)
else:
    data = pd.DataFrame(columns = df.columns)

## Training fastText

In [9]:
def clean_text(text):
    """Strip every character that is not an ASCII letter, digit or space, then lowercase.

    Punctuation and other whitespace (tabs, newlines) are removed outright,
    not replaced by a space, so tokens separated only by such characters are
    joined together.
    """
    return re.sub(r'[^A-Za-z0-9 ]+', '', text).lower()
    
def clean_instances(data):
    """Return a new frame equal to `data` but with a cleaned "sentences" column.

    Each entry of `data["sentences"]` is passed through `clean_text`; all other
    columns and the index are carried over unchanged. `data` itself is not
    modified.
    """
    normalised = list(map(clean_text, data["sentences"]))
    return data.assign(sentences = normalised)
    

data_clean = clean_instances(data)

# 80/20 split; seeded so that re-running the notebook yields the same split.
train_text, test_text, train_labels, test_labels = train_test_split(
    data_clean["sentences"], data_clean["label"], test_size = 0.2, random_state = 42
)

# The text splits deliberately remain pandas Series: the next cell calls
# `.to_numpy()` on them. (A bare tuple expression that converted them to
# arrays without assigning the result used to sit here; it was a no-op and
# has been removed.)
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

In [10]:
# fastText's supervised format: one example per line, "__label__<label> <text>".
# np.asarray is applied once per split (instead of calling .to_numpy() for
# every row) and accepts both a Series and an ndarray.
fasttext_data_train = [
    f"__label__{label!s} {text}"
    for label, text in zip(train_labels, np.asarray(train_text))
]
fasttext_data_test = [
    f"__label__{label!s} {text}"
    for label, text in zip(test_labels, np.asarray(test_text))
]

# Context managers guarantee the files are flushed and closed even if a write
# fails; plain "w" is enough because the files are never read back here.
with open("model.train", "w", encoding = "utf-8") as train_file:
    train_file.write("\n".join(fasttext_data_train))

with open("model.valid", "w", encoding = "utf-8") as test_file:
    test_file.write("\n".join(fasttext_data_test))

In [11]:
# Grid search over learning rate (rows) and number of epochs (columns).
# cv_data[i][j] holds the second value returned by model.test("model.valid")
# for learning rate i and epoch count j.
#
# NOTE(review): the grid starts at lr = 0, and the scores are measured on the
# same file that is later reported as held-out performance - consider a
# separate validation split. The grid values are left unchanged here because
# the following cell rebuilds the same ranges to decode the arg-max index.
learning_rates = np.arange(0, 1, 0.05)
epoch_counts = range(5, 35, 5)

cv_data = [
    [
        fasttext.train_supervised(input = "model.train", lr = lr, epoch = epochs).test("model.valid")[1]
        for epochs in epoch_counts
    ]
    for lr in learning_rates
]

In [12]:
# Locate the best cell of the grid and translate its (row, column) position
# back into the learning rate / epoch count that produced it. The ranges
# below must match the ones used to fill cv_data.
cv_grid = np.array(cv_data)
max_idx = np.unravel_index(cv_grid.argmax(), cv_grid.shape)
lr_pos, epoch_pos = max_idx[0], max_idx[1]

best_learning_rate = np.arange(0, 1, 0.05)[lr_pos]
best_epochs = np.arange(5, 35, 5)[epoch_pos]
max_p = cv_data[lr_pos][epoch_pos]

print(f"Best Learning Rate: {best_learning_rate}\nBest Number of Epochs: {best_epochs}\nMax Precision/Recall: {max_p}")

Best Learning Rate: 0.15000000000000002
Best Number of Epochs: 20
Max Precision/Recall: 0.8372932330827068


In [13]:
# Retrain with the selected hyper-parameters, then score the model on the
# training file and on the validation file. Each test() call yields a
# (number of examples, precision, recall) triple.
model_cv = fasttext.train_supervised(
    input = "model.train",
    lr = best_learning_rate,
    epoch = best_epochs,
)
(n1, p1, r1), (n2, p2, r2) = (
    model_cv.test(path) for path in ("model.train", "model.valid")
)

In [14]:
# Count / precision / recall: first line for the training file, second for the validation file.
print(f"{n1} {p1} {r1}")
print(f"{n2} {p2} {r2}")

13300 0.9978947368421053 0.9978947368421053
3325 0.8330827067669173 0.8330827067669173
