In [1]:
import pandas as pd
import spacy
import re
nlp = spacy.load("en_core_web_sm")

ModuleNotFoundError: No module named 'spacy'

## Preview data

In [None]:
questions = pd.read_csv("datasets/questions.csv")
answers = pd.read_csv("datasets/answers.csv", encoding="ISO-8859-1")

## Define SpaCy tokenizers and taggers

In [5]:
def preprocess(text):
    text = re.sub('<[^<]+?>|', '', text)
    text = " ".join(text.split())
    return nlp(text)

def get_tokens(doc):
    return [token.text for token in doc]

def get_entities(doc):
    return [ent.text for ent in doc.ents]

def get_entity_labels(doc):
    return [ent.label_ for ent in doc.ents]

## Apply SpaCy tokenizers and taggers

In [6]:
questions['title_doc'] = questions['title'].apply(lambda text: preprocess(text))
questions['body_doc'] = questions['body'].apply(lambda text: preprocess(text))

questions['title_tokens'] = questions['title_doc'].apply(lambda doc: get_tokens(doc))
questions['title_entities'] = questions['title_doc'].apply(lambda doc: get_entities(doc))
questions['body_tokens'] = questions['body_doc'].apply(lambda doc: get_tokens(doc))
questions['body_entities'] = questions['body_doc'].apply(lambda doc: get_entities(doc))

In [7]:
answers['body_doc'] = answers['body'].apply(lambda text: preprocess(text))

answers['body_tokens'] = answers['body_doc'].apply(lambda doc: get_tokens(doc))
answers['body_entities'] = answers['body_doc'].apply(lambda doc: get_entities(doc))

## Preview freshly added SpaCy columns

In [12]:
question_cols = ['title_doc', 'body_doc', 'title_tokens', 'title_entities', 'body_tokens', 'body_entities']
answer_cols = ['body_doc', 'body_tokens', 'body_entities']

In [13]:
questions[question_cols].head()

Unnamed: 0,title_doc,body_doc,title_tokens,title_entities,body_tokens,body_entities
0,"(PHP, includes, vs, OOP)","(I, would, like, to, have, a, reference, for, ...","[PHP, includes, vs, OOP]",[PHP],"[I, would, like, to, have, a, reference, for, ...","[PHP, one, A Simple Example: Certain pages on,..."
1,"(WYSIWYG, editor, gem, for, Rails, ?)","(Is, there, a, good, ruby, gem, for, a, WYSIWY...","[WYSIWYG, editor, gem, for, Rails, ?]",[WYSIWYG],"[Is, there, a, good, ruby, gem, for, a, WYSIWY...",[WYSIWYG]
2,"(How, do, you, automate, a, Visual, Studio, bu...","(How, do, you, turn, a, Visual, Studio, build,...","[How, do, you, automate, a, Visual, Studio, bu...",[Visual Studio],"[How, do, you, turn, a, Visual, Studio, build,...",[IDE]
3,"(Code, Classic, ASP, in, Linux)","(What, should, i, use, to, code, Classic, ASP,...","[Code, Classic, ASP, in, Linux]","[Classic ASP, Linux]","[What, should, i, use, to, code, Classic, ASP,...","[Classic ASP, Linux, Emacs, Vim, Code Browser]"
4,"(What, are, the, pros, and, cons, of, the, ass...","(I, am, considering, creating, my, own, websit...","[What, are, the, pros, and, cons, of, the, ass...",[],"[I, am, considering, creating, my, own, websit...","[Java, Java, more than 50, one, Java]"


In [10]:
answers[answer_cols].head()

Unnamed: 0,body_doc,body_tokens,body_entities
0,"(If, there, are, multiple, records, for, given...","[If, there, are, multiple, records, for, given...","[this[&quot;reportKey&quot, reportKey.toUpperC..."
1,"(If, you, end, up, here, because, you, Googled...","[If, you, end, up, here, because, you, Googled...",[two frustrating hours]
2,"(It, seems, to, be, an, SDL, issue, ., Here, i...","[It, seems, to, be, an, SDL, issue, ., Here, i...","[SDL, SDL, SDL]"
3,"(This, could, be, an, installation, and, setup...","[This, could, be, an, installation, and, setup...","[Files\Java\jdk1.8.0_291&quot, Files\Java\jre1..."
4,"(As, per, java, command, line, documentation, ...","[As, per, java, command, line, documentation, ...",[JAR]


## Save to CSV

In [17]:
questions.to_csv('datasets/tokenized_questions.csv')
answers.to_csv('datasets/tokenized_answers.csv')

In [None]:
questions = pd.read_csv("datasets/tokenized_questions.csv")
answers = pd.read_csv("datasets/tokenized_answers.csv", lineterminator='\n', encoding="ISO-8859-1")

In [None]:
questions = questions.rename(columns={'user_id': 'question_user_id', 'score': 'question_score', 'body': 'question_body', 'body_doc': 'question_body_doc', 'body_tokens': 'question_body_token', 'body_entities': 'question_body_entities'})
answers = answers.rename(columns={'user_id': 'answer_user_id', 'score': 'answer_score', 'body': 'answer_body', 'body_doc': 'answer_body_doc', 'body_tokens': 'answer_body_token', 'body_entities': 'answer_body_entities'})

In [None]:
result = pd.merge(questions, answers, on=["question_id"]).drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'])
result.head()

In [None]:
result.to_csv('datasets/questions_answers_combined.csv')