In [1]:
import pandas as pd
import spacy
import re
nlp = spacy.load("en_core_web_sm")

## Preview data

In [2]:
questions = pd.read_csv("datasets/questions.csv", encoding="ISO-8859-1")
answers = pd.read_csv("datasets/answers.csv", encoding="ISO-8859-1")

## Define SpaCy tokenizers and taggers

In [3]:
def preprocess(text):
    text = re.sub('<[^<]+?>|', '', text)
    text = " ".join(text.split())
    return nlp(text)

def get_tokens(doc):
    return [token.text for token in doc]

def get_entities(doc):
    return [ent.text for ent in doc.ents]

def get_entity_labels(doc):
    return [ent.label_ for ent in doc.ents]

## Apply SpaCy tokenizers and taggers

In [4]:
questions['title_doc'] = questions['title'].apply(lambda text: preprocess(text))
questions['body_doc'] = questions['body'].apply(lambda text: preprocess(text))

questions['title_tokens'] = questions['title_doc'].apply(lambda doc: get_tokens(doc))
questions['title_entities'] = questions['title_doc'].apply(lambda doc: get_entities(doc))
questions['body_tokens'] = questions['body_doc'].apply(lambda doc: get_tokens(doc))
questions['body_entities'] = questions['body_doc'].apply(lambda doc: get_entities(doc))

In [5]:
answers['body_doc'] = answers['body'].apply(lambda text: preprocess(text))

answers['body_tokens'] = answers['body_doc'].apply(lambda doc: get_tokens(doc))
answers['body_entities'] = answers['body_doc'].apply(lambda doc: get_entities(doc))

## Preview freshly added SpaCy columns

In [6]:
question_cols = ['title_doc', 'body_doc', 'title_tokens', 'title_entities', 'body_tokens', 'body_entities']
answer_cols = ['body_doc', 'body_tokens', 'body_entities']

In [7]:
questions[question_cols].head()

Unnamed: 0,title_doc,body_doc,title_tokens,title_entities,body_tokens,body_entities
0,"(PHP, includes, vs, OOP)","(I, would, like, to, have, a, reference, for, ...","[PHP, includes, vs, OOP]",[PHP],"[I, would, like, to, have, a, reference, for, ...","[PHP, one, A Simple Example: Certain pages on,..."
1,"(WYSIWYG, editor, gem, for, Rails, ?)","(Is, there, a, good, ruby, gem, for, a, WYSIWY...","[WYSIWYG, editor, gem, for, Rails, ?]",[WYSIWYG],"[Is, there, a, good, ruby, gem, for, a, WYSIWY...",[WYSIWYG]
2,"(How, do, you, automate, a, Visual, Studio, bu...","(How, do, you, turn, a, Visual, Studio, build,...","[How, do, you, automate, a, Visual, Studio, bu...",[Visual Studio],"[How, do, you, turn, a, Visual, Studio, build,...",[IDE]
3,"(Code, Classic, ASP, in, Linux)","(What, should, i, use, to, code, Classic, ASP,...","[Code, Classic, ASP, in, Linux]","[Classic ASP, Linux]","[What, should, i, use, to, code, Classic, ASP,...","[Classic ASP, Linux, Emacs, Vim, Code Browser]"
4,"(What, are, the, pros, and, cons, of, the, ass...","(I, am, considering, creating, my, own, websit...","[What, are, the, pros, and, cons, of, the, ass...",[],"[I, am, considering, creating, my, own, websit...","[Java, Java, more than 50, one, Java]"


In [8]:
answers[answer_cols].head()

Unnamed: 0,body_doc,body_tokens,body_entities
0,"(If, there, are, multiple, records, for, given...","[If, there, are, multiple, records, for, given...","[this[&quot;reportKey&quot, reportKey.toUpperC..."
1,"(If, you, end, up, here, because, you, Googled...","[If, you, end, up, here, because, you, Googled...",[two frustrating hours]
2,"(It, seems, to, be, an, SDL, issue, ., Here, i...","[It, seems, to, be, an, SDL, issue, ., Here, i...","[SDL, SDL, SDL]"
3,"(This, could, be, an, installation, and, setup...","[This, could, be, an, installation, and, setup...","[Files\Java\jdk1.8.0_291&quot, Files\Java\jre1..."
4,"(As, per, java, command, line, documentation, ...","[As, per, java, command, line, documentation, ...",[JAR]


In [9]:
questions = questions.rename(columns={'user_id': 'question_user_id', 'score': 'question_score', 'body': 'question_body', 'body_doc': 'question_body_doc', 'body_tokens': 'question_body_token', 'body_entities': 'question_body_entities'})
answers = answers.rename(columns={'user_id': 'answer_user_id', 'score': 'answer_score', 'body': 'answer_body', 'body_doc': 'answer_body_doc', 'body_tokens': 'answer_body_token', 'body_entities': 'answer_body_entities'})

In [12]:
questions.columns

Index(['question_user_id', 'accepted_answer_id', 'answer_count',
       'question_score', 'creation_date', 'question_id', 'link', 'title',
       'question_body', 'title_doc', 'question_body_doc', 'title_tokens',
       'title_entities', 'question_body_token', 'question_body_entities'],
      dtype='object')

In [13]:
result = pd.merge(questions, answers, on=["question_id"])
result.head()

Unnamed: 0,question_user_id,accepted_answer_id,answer_count,question_score,creation_date,question_id,link,title,question_body,title_doc,...,question_body_entities,answer_id,is_accepted,answer_body,answer_user_id,reputation,answer_score,answer_body_doc,answer_body_token,answer_body_entities
0,2490.0,22587,6,11,8/22/08,22528,https://stackoverflow.com/questions/22528/php-...,PHP includes vs OOP,<p>I would like to have a reference for the pr...,"(PHP, includes, vs, OOP)",...,"[PHP, one, A Simple Example: Certain pages on,...",22594,False,<p>While the question touches on a couple of v...,2494.0,17497.0,5,"(While, the, question, touches, on, a, couple,...","[While, the, question, touches, on, a, couple,...","[User, second, Konrad, OOP, PHP]"
1,2490.0,22587,6,11,8/22/08,22528,https://stackoverflow.com/questions/22528/php-...,PHP includes vs OOP,<p>I would like to have a reference for the pr...,"(PHP, includes, vs, OOP)",...,"[PHP, one, A Simple Example: Certain pages on,...",22587,True,<p>These are not really opposite choices. You ...,341.0,17787.0,13,"(These, are, not, really, opposite, choices, ....","[These, are, not, really, opposite, choices, ....","[PHP3, eCommerce PHP]"
2,2490.0,22587,6,11,8/22/08,22528,https://stackoverflow.com/questions/22528/php-...,PHP includes vs OOP,<p>I would like to have a reference for the pr...,"(PHP, includes, vs, OOP)",...,"[PHP, one, A Simple Example: Certain pages on,...",22578,False,<p>Whether you do it in classes or in a more p...,1344.0,21216.0,0,"(Whether, you, do, it, in, classes, or, in, a,...","[Whether, you, do, it, in, classes, or, in, a,...","[three, one, Session, Session, 0, 1, 2, PHPSES..."
3,2490.0,22587,6,11,8/22/08,22528,https://stackoverflow.com/questions/22528/php-...,PHP includes vs OOP,<p>I would like to have a reference for the pr...,"(PHP, includes, vs, OOP)",...,"[PHP, one, A Simple Example: Certain pages on,...",22551,False,<p>I've learned never to use <code>include</co...,1968.0,480242.0,1,"(I, 've, learned, never, to, use, include, in,...","[I, 've, learned, never, to, use, include, in,...","[PHP, one]"
4,2490.0,22587,6,11,8/22/08,22528,https://stackoverflow.com/questions/22528/php-...,PHP includes vs OOP,<p>I would like to have a reference for the pr...,"(PHP, includes, vs, OOP)",...,"[PHP, one, A Simple Example: Certain pages on,...",22537,False,<p>Can you be a bit more specific? For the exa...,1830.0,10583.0,0,"(Can, you, be, a, bit, more, specific, ?, For,...","[Can, you, be, a, bit, more, specific, ?, For,...","[1, 2, User]"


In [14]:
result.to_csv('datasets/questions_answers_combined.csv', index=False)