In [1]:
import pickle
import wikipediaapi

import nltk
from nltk.corpus import stopwords
import wikipediaapi
from nltk.tokenize import RegexpTokenizer
import re
from bs4 import BeautifulSoup
import re

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

ps = nltk.stem.porter.PorterStemmer()

In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier

In [3]:
section = "definition"

In [4]:
with open('../Instance Classification/section_dict.pkl', 'rb') as f:
    section_dict = pickle.load(f)

## Creating Training Data

In [5]:
wiki_wiki = wikipediaapi.Wikipedia('en', extract_format = wikipediaapi.ExtractFormat.HTML)
section_titles, related_keywords = section_dict[ps.stem(section)]

In [6]:
data_raw = []
documents = []
itr = [0]

def get_data(data_raw, documents, kw, section, itr):

    try:
    
        page = wiki_wiki.page(kw)
        kw_data = []

        for section_ in page.sections:
            title, content = section_.title, section_.text
            soup = BeautifulSoup(content, "html.parser")

            for p in soup.findAll("p"):
                p_text = p.text

                documents.append(TaggedDocument(p_text.lower(), [itr[0]]))
                itr[0] += 1

                for s in nltk.sent_tokenize(p_text):
                    kw_data.append((kw, s, (ps.stem(title) == ps.stem(section))))

        data_raw.extend(kw_data)

    except:
        
        get_data(data_raw, documents, kw, section, itr)

for i, kw in enumerate(related_keywords):
    
    if i % 100 == 0:
        print(i, kw)

    get_data(data_raw, documents, kw, section, itr)

0 generalization error
100 convolutional neural network
200 fuzzy set


In [7]:
df = pd.DataFrame(data_raw, columns = ["keyword", "sentences", "label"])

In [8]:
data = pd.DataFrame(columns = df.columns)
for kw in related_keywords:
    df_subset = df[df["keyword"] == kw]
    
    pos_data = df_subset[df_subset["label"] != False]
    neg_data = df_subset[df_subset["label"] == False]
    
    num_pos, num_neg = len(pos_data), len(neg_data)
    
    if num_pos >= num_neg:
        data = pd.concat((data, pos_data, neg_data), axis = 0)
    else:
        data = pd.concat((data, pos_data, neg_data.sample(n = num_pos)), axis = 0)

## Training Doc2Vec

In [9]:
def clean_text(text):
    cleaned_text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    return cleaned_text.lower()
    
def clean_instances(data):
    instances = data["sentences"]
    cleaned_instances = [clean_text(inst) for inst in instances]
    
    df_new = pd.DataFrame(columns = data.columns)
    df_new[data.columns] = data[data.columns]
    df_new["sentences"] = cleaned_instances
    
    return df_new
    

data_clean = clean_instances(data)

train_text, test_text, train_labels, test_labels = train_test_split(data_clean["sentences"], data_clean["label"], test_size = 0.2)

train_text, test_text, np.array(train_text), np.array(test_text)
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

In [10]:
sent_tokens_train = []
for i, s in enumerate(train_text):    
    sent_tokens_train.append(nltk.tokenize.word_tokenize(s.lower()))
    
sent_tokens_test = []
for i, s in enumerate(test_text):    
    sent_tokens_test.append(nltk.tokenize.word_tokenize(s.lower()))

In [11]:
model = Doc2Vec(documents, vector_size = 100, window = 2, min_count = 1, workers = 4, epochs = 40)

In [12]:
sent_vector_train = []
for tokens in sent_tokens_train:
    sent_vector_train.append(model.infer_vector(tokens))
sent_vector_train = np.array(sent_vector_train)

sent_vector_test = []
for tokens in sent_tokens_test:
    sent_vector_test.append(model.infer_vector(tokens))
sent_vector_test = np.array(sent_vector_test)

In [13]:
best_score = 0
best_depth = 0

for max_depth in np.arange(5, 55, 10):
    
    print(f"Trying depth {max_depth}")
    
    rf = RandomForestClassifier(n_estimators = 1000, max_depth = max_depth)
    rf.fit(sent_vector_train, train_labels.astype(int))
    
    curr_score = rf.score(sent_vector_test, test_labels.astype(int))
    if curr_score > best_score:
        best_score = curr_score
        best_depth = max_depth
        
print(f"Best depth: {best_depth}")

Trying depth 5
Trying depth 15
Trying depth 25
Trying depth 35
Trying depth 45
Best depth: 45


In [14]:
rf.score(sent_vector_train, train_labels.astype(int))

0.9997249724972497

In [15]:
rf.score(sent_vector_test, test_labels.astype(int))

0.6083608360836084