# Lexical Features
- Term frequency for tagged entities
- Number of links
- Number of code snippets
- Measures of politeness

## Politeness

In [1]:
import convokit
from convokit import Corpus, Speaker, Utterance
from convokit import TextParser
from convokit import download
from convokit import PolitenessStrategies
import pandas as pd
import spacy
import re

In [2]:
def preprocess(text):
    text = re.sub('<[^<]+?>|', '', text)
    text = " ".join(text.split())
    return text

In [3]:
from tqdm import tqdm
from convokit import Corpus, Speaker, Utterance
from collections import defaultdict

In [4]:
answers = pd.read_csv("datasets/answers.csv", encoding="ISO-8859-1")

answers['body'] = answers['body'].apply(lambda text: preprocess(text))
answers.head()

Unnamed: 0,question_id,answer_id,is_accepted,body,user_id,reputation,score
0,67234961,67236964,True,"If there are multiple records for given key, y...",11641029.0,5078.0,0
1,35793622,67239365,False,If you end up here because you Googled &quot;e...,1740065.0,125.0,0
2,35793622,35794243,True,It seems to be an SDL issue. Here is a discuss...,1458617.0,10998.0,2
3,56035960,67239339,False,This could be an installation and setup proble...,10665896.0,11.0,0
4,56035960,56042011,True,As per java command line documentation: When y...,2897748.0,116880.0,1


In [5]:
# 1. Creating speakers
data_dir = "cornell movie-dialogs corpus/"
    
speaker_meta = {}
for index, row in answers.iterrows():
    speaker_meta[str(row['user_id'])] = {
        "user_id": row['user_id'],
        "reputation":row['reputation'],
        "score": row['score']
    }
    
corpus_speakers = {k: Speaker(id = k, meta = v) for k,v in speaker_meta.items()}
print("number of speakers in the data = {}".format(len(corpus_speakers)))
corpus_speakers['11641029.0'].meta

number of speakers in the data = 4674


{'user_id': 11641029.0, 'reputation': 5078.0, 'score': 0}

In [18]:
# 2. Creating utterance objects   
utterance_corpus = {}
count = 0
for index, row in answers.iterrows():
    utterance_corpus[str(index)] = Utterance(id=str(index), speaker=corpus_speakers[str(row['user_id'])], text=row['body'], meta={'answer_id': str(row['answer_id'])})
    
print("Total number of utterances = {}".format(len(utterance_corpus)))
utterance_corpus[str(0)]

Total number of utterances = 9149


Utterance({'obj_type': 'utterance', 'meta': {'answer_id': '67236964'}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {'user_id': 11641029.0, 'reputation': 5078.0, 'score': 0}, 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x136097ca0>, 'id': '11641029.0'}), 'conversation_id': None, 'reply_to': None, 'timestamp': None, 'text': "If there are multiple records for given key, you were overriding it everytime. Update your inner loop with following code. for (const [reportKey, reportValue] of Object.entries(value)) { // console.log(reportKey, reportValue); // this[&quot;reportKey&quot;] = reportKey; if ( reportKey.toUpperCase() != horizontal &amp;&amp; reportKey.toUpperCase() != vertical ) { // this[&quot;reportKey&quot;] = this.getMetricsName(this[&quot;reportKey&quot;]); let rK = this.getMetricsName(reportKey); // let dataSet = []; // dataSet.push( let dataset = { x: xAxis, y: yAxis, value: reportValue, &quot;x-axis&quot;: horizontal, &quot;y-axis&quot;

In [21]:
#  Updating root and reply_to information to utterances
with open(data_dir + "movie_conversations.txt", "r", encoding='utf-8', errors='ignore') as f:
    convo_data = f.readlines()

import ast

for index, row in answers.iterrows():
    utterance_corpus[str(index)].conversation_id = str(row['answer_id'])
    utterance_corpus[str(index)].reply_to = str(row['question_id'])
                
utterance_corpus[str(0)]

Utterance({'obj_type': 'utterance', 'meta': {'answer_id': '67236964'}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {'user_id': 11641029.0, 'reputation': 5078.0, 'score': 0}, 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x137b52af0>, 'id': '11641029.0'}), 'conversation_id': '67236964', 'reply_to': '67234961', 'timestamp': None, 'text': "If there are multiple records for given key, you were overriding it everytime. Update your inner loop with following code. for (const [reportKey, reportValue] of Object.entries(value)) { // console.log(reportKey, reportValue); // this[&quot;reportKey&quot;] = reportKey; if ( reportKey.toUpperCase() != horizontal &amp;&amp; reportKey.toUpperCase() != vertical ) { // this[&quot;reportKey&quot;] = this.getMetricsName(this[&quot;reportKey&quot;]); let rK = this.getMetricsName(reportKey); // let dataSet = []; // dataSet.push( let dataset = { x: xAxis, y: yAxis, value: reportValue, &quot;x-axis&quot;: horizontal, &quot;

In [22]:
# 3. Creating corpus from list of utterances
utterance_list = utterance_corpus.values()
answer_corpus = Corpus(utterances=utterance_list)
convo_ids = answer_corpus.get_conversation_ids()
for i, convo_idx in enumerate(convo_ids[0:5]):
    print("sample conversation {}:".format(i))
    print(answer_corpus.get_conversation(convo_idx).get_utterance_ids())

sample conversation 0:
['0']
sample conversation 1:
['1']
sample conversation 2:
['2']
sample conversation 3:
['3']
sample conversation 4:
['4']


In [26]:
# 4. Updating Conversation and Corpus level metadata
with open(data_dir + "movie_titles_metadata.txt", "r", encoding='utf-8', errors='ignore') as f:
    movie_extra = f.readlines()

movie_meta = defaultdict(dict)
for movie in movie_extra:
    movie_id, title, year, rating, votes, genre  = [info.strip() for info in movie.split("+++$+++")]
    movie_meta[movie_id] = {"movie_name": title,
                            "release_year": year,
                            "rating": rating,
                            "votes": votes,
                            "genre": genre}
    
for convo in movie_corpus.iter_conversations():
    
    # get the movie_id for the conversation by checking from utterance info
    convo_id = convo.get_id()
    movie_idx = movie_corpus.get_utterance(convo_id).meta['movie_id']
    
    # add movie idx as meta, and update meta with additional movie information
    convo.meta['movie_idx'] = movie_idx
    convo.meta.update(movie_meta[movie_idx])
movie_corpus.get_conversation("L609301").meta

with open(data_dir + "raw_script_urls.txt", "r", encoding='utf-8', errors='ignore') as f:
    urls = f.readlines()
    
movie2url = {}
for movie in urls:
    movie_id, _, url = [info.strip() for info in movie.split("+++$+++")]
    movie2url[movie_id] = url
    
movie_corpus.meta['url'] = movie2url
movie_corpus.meta['name'] = "Cornell Movie-Dialogs Corpus"

KeyError: '67236964'

In [8]:


with open(data_dir + "raw_script_urls.txt", "r", encoding='utf-8', errors='ignore') as f:
    urls = f.readlines()
    
movie2url = {}
for movie in urls:
    movie_id, _, url = [info.strip() for info in movie.split("+++$+++")]
    movie2url[movie_id] = url
    
movie_corpus.meta['url'] = movie2url
movie_corpus.meta['name'] = "Cornell Movie-Dialogs Corpus"

# 5. Processing utterance texts
from convokit.text_processing import TextParser
parser = TextParser(verbosity=10000)
movie_corpus = parser.transform(movie_corpus)
movie_corpus.get_utterance('L666499').retrieve_meta('parsed')

# 6. Saving created datasets
movie_corpus.dump("movie-corpus")
from convokit import meta_index
import os.path
meta_index(filename = os.path.join(os.path.expanduser("~"), ".convokit/saved-corpora/movie-corpus"))

  0%|          | 0/304713 [00:00<?, ?it/s]


KeyError: 'u0'

In [None]:
wiki_corpus = Corpus('data_transformation.ipynb')

parser = TextParser(verbosity=1000)

ps = PolitenessStrategies()
wiki_corpus = ps.transform(wiki_corpus, markers=True)
wiki_corpus.get_utterance('434044').meta

In [None]:
pip install politeness==0.1.2

In [None]:
from politeness.helpers import set_corenlp_url
set_corenlp_url('some-url.org:1234')