In [1]:
import pandas as pd
import spacy
import re
# Load the spaCy "en_core_web_sm" pipeline once at module level; preprocess()
# below calls this shared `nlp` object on every cleaned text.
nlp = spacy.load("en_core_web_sm")

## Load data

In [3]:
# Read the raw question and answer tables; both files are decoded as ISO-8859-1.
questions = pd.read_csv(
    "datasets/questions.csv",
    encoding="ISO-8859-1",
)
answers = pd.read_csv(
    "datasets/answers.csv",
    encoding="ISO-8859-1",
)

## Define spaCy tokenizers and taggers

In [4]:
# Matches one HTML/XML-style tag such as <p> or </code>. The original pattern
# carried a stray trailing "|" (an empty alternative that matched at every
# position); it is dropped here, which leaves the substitution result unchanged.
_TAG_PATTERN = re.compile(r'<[^<]+?>')


def preprocess(text):
    """Strip markup tags from ``text``, collapse whitespace, and parse with spaCy.

    Parameters
    ----------
    text : str or missing value
        Raw text that may contain HTML-style tags. ``None`` and float NaN
        (what pandas yields for empty CSV cells) are treated as an empty
        string instead of raising ``TypeError`` inside ``re``.

    Returns
    -------
    The object returned by calling the module-level ``nlp`` pipeline on the
    cleaned text.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ""

    # NOTE(review): tags are replaced with nothing, so text on either side of a
    # tag is joined without a space (e.g. "</p><p>") -- confirm this is intended.
    without_tags = _TAG_PATTERN.sub('', text)

    # split() with no argument drops leading/trailing whitespace and treats any
    # run of whitespace (including newlines) as a single separator.
    normalised = " ".join(without_tags.split())
    return nlp(normalised)

def get_tokens(doc):
    """Return a list with the ``text`` attribute of every item yielded by ``doc``."""
    texts = []
    for tok in doc:
        texts.append(tok.text)
    return texts

def get_entities(doc):
    """Return a list with the ``text`` attribute of every item in ``doc.ents``."""
    entity_texts = []
    for entity in doc.ents:
        entity_texts.append(entity.text)
    return entity_texts

def get_entity_labels(doc):
    """Return a list with the ``label_`` attribute of every item in ``doc.ents``."""
    labels = []
    for entity in doc.ents:
        labels.append(entity.label_)
    return labels

## Apply spaCy tokenizers and taggers

In [5]:
# Run each question title and body through preprocess() and keep the results.
questions['title_doc'] = questions['title'].apply(preprocess)
questions['body_doc'] = questions['body'].apply(preprocess)

# Derive token and entity lists from the stored results, title first, then body.
questions['title_tokens'] = questions['title_doc'].apply(get_tokens)
questions['title_entities'] = questions['title_doc'].apply(get_entities)
questions['body_tokens'] = questions['body_doc'].apply(get_tokens)
questions['body_entities'] = questions['body_doc'].apply(get_entities)

In [None]:
# Run each answer body through preprocess() and keep the result.
answers['body_doc'] = answers['body'].apply(preprocess)

# Derive token and entity lists from the stored results.
answers['body_tokens'] = answers['body_doc'].apply(get_tokens)
answers['body_entities'] = answers['body_doc'].apply(get_entities)

## Preview freshly added spaCy columns

In [None]:
# Columns added to `questions` by the spaCy step, in the order they were created.
question_cols = [
    'title_doc',
    'body_doc',
    'title_tokens',
    'title_entities',
    'body_tokens',
    'body_entities',
]
# Columns added to `answers` by the spaCy step.
answer_cols = [
    'body_doc',
    'body_tokens',
    'body_entities',
]

In [None]:
# Show the first five rows of the newly added question columns.
questions[question_cols].head(n=5)

In [None]:
# Show the first five rows of the newly added answer columns.
answers[answer_cols].head(n=5)

## Save to CSV

In [None]:
# Write both enriched frames to disk with pandas' default CSV settings.
questions.to_csv(path_or_buf='datasets/tokenized_questions.csv')
answers.to_csv(path_or_buf='datasets/tokenized_answers.csv')