# Lexical Features
- Term frequency for tagged entities
- Number of links
- Number of code snippets
- Measures of politeness

## Politeness

In [47]:
import convokit
from convokit import Corpus, Speaker, Utterance
from convokit import TextParser
from convokit import download
from convokit import PolitenessStrategies
import pandas as pd
import spacy
import re

In [48]:
def preprocess(text):
    text = re.sub('<[^<]+?>|', '', text)
    text = " ".join(text.split())
    return text

In [49]:
from tqdm import tqdm
from convokit import Corpus, Speaker, Utterance
from collections import defaultdict

In [50]:
answers = pd.read_csv("datasets/answers.csv", encoding="ISO-8859-1")
questions = pd.read_csv("datasets/questions.csv", encoding="ISO-8859-1")

In [51]:
answers.head()

Unnamed: 0,question_id,answer_id,is_accepted,body,user_id,reputation,score
0,67234961,67236964,True,<p>If there are multiple records for given key...,11641029.0,5078.0,0
1,35793622,67239365,False,<p>If you end up here because you Googled &quo...,1740065.0,125.0,0
2,35793622,35794243,True,"<p>It seems to be an SDL issue. <a href=""http:...",1458617.0,10998.0,2
3,56035960,67239339,False,<p>This could be an installation and setup pro...,10665896.0,11.0,0
4,56035960,56042011,True,"<p>As per <a href=""https://docs.oracle.com/jav...",2897748.0,116880.0,1


In [52]:
questions['body'] = questions['body'].apply(lambda text: preprocess(text))

In [57]:
answers['body'] = answers['body'].apply(lambda text: preprocess(text))
answers['body']

0       If there are multiple records for given key, y...
1       If you end up here because you Googled &quot;e...
2       It seems to be an SDL issue. Here is a discuss...
3       This could be an installation and setup proble...
4       As per java command line documentation: When y...
                              ...                        
9144    The finally clause executes after the return s...
9145    According to the MSDN Library (Microsoft.Windo...
9146    Without seeing more code, it's hard to guess w...
9147    According to the JavaDoc: It remains valid unt...
9148    Yes. Locks depend on a file descriptor. When t...
Name: body, Length: 9149, dtype: object

In [59]:
answers['body'].to_csv("preprocessed_answers.csv", index=False)

In [7]:
# 1. Creating speakers
    
speaker_meta = {}
for index, row in answers.iterrows():
    speaker_meta[str(row['user_id'])] = {
        "user_id": row['user_id'],
        "reputation":row['reputation'],
        "score": row['score']
    }
    
corpus_speakers = {k: Speaker(id = k, meta = v) for k,v in speaker_meta.items()}
print("number of speakers in the data = {}".format(len(corpus_speakers)))
corpus_speakers['11641029.0'].meta

number of speakers in the data = 4674


{'user_id': 11641029.0, 'reputation': 5078.0, 'score': 0}

In [8]:
# 2. Creating utterance objects   
utterance_corpus = {}
count = 0
for index, row in answers.iterrows():
    utterance_corpus[str(index)] = Utterance(id=str(index), speaker=corpus_speakers[str(row['user_id'])], text=row['body'], meta={'answer_id': str(row['answer_id']), 'question_id': str(row['question_id'])})
    
print("Total number of utterances = {}".format(len(utterance_corpus)))
utterance_corpus[str(0)]

Total number of utterances = 9149


Utterance({'obj_type': 'utterance', 'meta': {'answer_id': '67236964', 'question_id': '67234961'}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {'user_id': 11641029.0, 'reputation': 5078.0, 'score': 0}, 'vectors': [], 'owner': None, 'id': '11641029.0'}), 'conversation_id': None, 'reply_to': None, 'timestamp': None, 'text': "If there are multiple records for given key, you were overriding it everytime. Update your inner loop with following code. for (const [reportKey, reportValue] of Object.entries(value)) { // console.log(reportKey, reportValue); // this[&quot;reportKey&quot;] = reportKey; if ( reportKey.toUpperCase() != horizontal &amp;&amp; reportKey.toUpperCase() != vertical ) { // this[&quot;reportKey&quot;] = this.getMetricsName(this[&quot;reportKey&quot;]); let rK = this.getMetricsName(reportKey); // let dataSet = []; // dataSet.push( let dataset = { x: xAxis, y: yAxis, value: reportValue, &quot;x-axis&quot;: horizontal, &quot;y-axis&quot;: vertical, &quot;x-n

In [18]:
#  Updating root and reply_to information to utterances

import ast

for index, row in answers.iterrows():
    utterance_corpus[str(index)].conversation_id = str(row['answer_id'])
    utterance_corpus[str(index)].reply_to = str(row['question_id'])
                
utterance_corpus[str(0)]

Utterance({'obj_type': 'utterance', 'meta': {'answer_id': '67236964', 'question_id': '67234961'}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {'user_id': 11641029.0, 'reputation': 5078.0, 'score': 0}, 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x1288fd7c0>, 'id': '11641029.0'}), 'conversation_id': '67236964', 'reply_to': '67234961', 'timestamp': None, 'text': "If there are multiple records for given key, you were overriding it everytime. Update your inner loop with following code. for (const [reportKey, reportValue] of Object.entries(value)) { // console.log(reportKey, reportValue); // this[&quot;reportKey&quot;] = reportKey; if ( reportKey.toUpperCase() != horizontal &amp;&amp; reportKey.toUpperCase() != vertical ) { // this[&quot;reportKey&quot;] = this.getMetricsName(this[&quot;reportKey&quot;]); let rK = this.getMetricsName(reportKey); // let dataSet = []; // dataSet.push( let dataset = { x: xAxis, y: yAxis, value: reportValue, &quot;x-axi

In [10]:
# 3. Creating corpus from list of utterances
utterance_list = utterance_corpus.values()
answer_corpus = Corpus(utterances=utterance_list)
convo_ids = answer_corpus.get_conversation_ids()
for i, convo_idx in enumerate(convo_ids[0:5]):
    print("sample conversation {}:".format(i))
    print(answer_corpus.get_conversation(convo_idx).get_utterance_ids())

sample conversation 0:
['0']
sample conversation 1:
['1']
sample conversation 2:
['2']
sample conversation 3:
['3']
sample conversation 4:
['4']
