# Lexical Features
- Term frequency for tagged entities
- Number of links
- Number of code snippets
- Measures of politeness

## Politeness

In [2]:
import convokit
from convokit import Corpus, Speaker, Utterance
from convokit import TextParser
from convokit import download
from convokit import PolitenessStrategies
import pandas as pd

In [3]:
from tqdm import tqdm
from convokit import Corpus, Speaker, Utterance
from collections import defaultdict

In [1]:
# Load the answers table; the file is decoded as ISO-8859-1 instead of the
# pandas default encoding.
answers = pd.read_csv(
    "datasets/answers.csv",
    encoding="ISO-8859-1",
)
# Last expression of the cell: preview the first rows.
answers.head()

NameError: name 'pd' is not defined

In [18]:
# 1. Creating speakers
# Each line of the characters file is split on the "+++$+++" separator; the
# first field becomes the speaker id and the next three become speaker metadata.
data_dir = "cornell movie-dialogs corpus/"
with open(data_dir + "movie_characters_metadata.txt", "r", encoding='utf-8', errors='ignore') as f:
    speaker_data = list(f)

# NOTE(review): the recorded output for 'u0' shows 'reputation' holding 'm0'
# and 'score' holding a movie title, so these key names look mismatched with
# columns 2 and 3 of this file -- confirm the intended labels.
speaker_meta = {}
for raw_line in speaker_data:
    fields = [part.strip() for part in raw_line.split("+++$+++")]
    speaker_meta[fields[0]] = dict(
        user_name=fields[1],
        reputation=fields[2],
        score=fields[3],
    )

# One Speaker object per id, carrying the metadata dict built above.
corpus_speakers = dict(
    (speaker_id, Speaker(id=speaker_id, meta=meta_fields))
    for speaker_id, meta_fields in speaker_meta.items()
)
print("number of speakers in the data = {}".format(len(corpus_speakers)))
corpus_speakers['u0'].meta

number of speakers in the data = 9035


{'user_name': 'BIANCA',
 'reputation': 'm0',
 'score': '10 things i hate about you'}

In [20]:
# 2. Creating utterance objects
# Reads the movie lines file, builds one Utterance per well-formed line (keyed
# by line id in `utterance_corpus`), then walks the conversations file to set
# `conversation_id` and `reply_to` on each utterance.
import ast

with open(data_dir + "movie_lines.txt", "r", encoding='utf-8', errors='ignore') as f:
    utterance_data = f.readlines()

utterance_corpus = {}
for utterance in tqdm(utterance_data):
    utterance_info = [field.strip() for field in utterance.split("+++$+++")]
    # The text is read from index 4, so a usable line needs at least five
    # fields. Report and skip anything shorter instead of falling through and
    # re-using the values left over from the previous iteration.
    if len(utterance_info) < 5:
        print(utterance_info)
        continue
    idx, speaker, movie_id, text = utterance_info[0], utterance_info[1], utterance_info[2], utterance_info[4]
    meta = {'movie_id': movie_id}
    # root & reply_to will be updated later, timestamp is not applicable
    utterance_corpus[idx] = Utterance(id=idx, speaker=corpus_speakers[speaker], text=text, meta=meta)
print("Total number of utterances = {}".format(len(utterance_corpus)))

#  Updating root and reply_to information to utterances
with open(data_dir + "movie_conversations.txt", "r", encoding='utf-8', errors='ignore') as f:
    convo_data = f.readlines()

for convo_line in tqdm(convo_data):
    convo_info = [field.strip() for field in convo_line.split("+++$+++")]
    # Same best-effort policy as above: report and skip lines that do not have
    # exactly the four fields unpacked below (e.g. a blank trailing line).
    if len(convo_info) != 4:
        print(convo_info)
        continue
    speaker1, speaker2, _movie, convo = convo_info
    # The last field is a Python-literal list of utterance ids.
    convo_seq = ast.literal_eval(convo)
    # The first utterance id of the sequence doubles as the conversation id.
    conversation_id = convo_seq[0]
    # convo_seq is a list of utterances ids, arranged in conversational order
    for i, line in enumerate(convo_seq):
        # sanity checking: speaker giving the utterance is indeed in the pair of characters provided
        if utterance_corpus[line].speaker.id not in [speaker1, speaker2]:
            print("speaker mismatch in line {0}".format(i))
        utterance_corpus[line].conversation_id = conversation_id
        # The opening utterance replies to nothing; every later one replies to
        # its immediate predecessor in the sequence.
        if i == 0:
            utterance_corpus[line].reply_to = None
        else:
            utterance_corpus[line].reply_to = convo_seq[i-1]

utterance_corpus['L666499']

100%|██████████| 304713/304713 [00:02<00:00, 152262.02it/s]
  6%|▌         | 4625/83097 [00:00<00:01, 46243.43it/s]

Total number of utterances = 304713


100%|██████████| 83097/83097 [00:01<00:00, 44867.99it/s]


Utterance({'obj_type': 'utterance', 'meta': {'movie_id': 'm616'}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {'user_name': 'COGHILL', 'reputation': 'm616', 'score': 'zulu dawn'}, 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x134f6f070>, 'id': 'u9028'}), 'conversation_id': 'L666497', 'reply_to': 'L666498', 'timestamp': None, 'text': 'How quickly can you move your artillery forward?', 'owner': None, 'id': 'L666499'})

In [19]:


# 3. Creating corpus from list of utterances
utterance_list = utterance_corpus.values()
movie_corpus = Corpus(utterances=utterance_list)
convo_ids = movie_corpus.get_conversation_ids()
# Show the utterance ids of the first five conversations as a quick check.
for sample_no, sample_id in enumerate(convo_ids[:5]):
    print("sample conversation {}:".format(sample_no))
    sample_convo = movie_corpus.get_conversation(sample_id)
    print(sample_convo.get_utterance_ids())

# 4. Updating Conversation and Corpus level metadata
with open(data_dir + "movie_titles_metadata.txt", "r", encoding='utf-8', errors='ignore') as f:
    movie_extra = list(f)

# defaultdict so that looking up a movie id absent from the titles file yields
# an empty dict (and therefore an empty metadata update) instead of a KeyError.
movie_meta = defaultdict(dict)
for title_line in movie_extra:
    movie_id, title, year, rating, votes, genre = map(str.strip, title_line.split("+++$+++"))
    movie_meta[movie_id] = dict(
        movie_name=title,
        release_year=year,
        rating=rating,
        votes=votes,
        genre=genre,
    )

for convo in movie_corpus.iter_conversations():
    # The conversation id is also the id of an utterance; that utterance's
    # metadata supplies the movie id for the whole conversation.
    movie_idx = movie_corpus.get_utterance(convo.get_id()).meta['movie_id']

    # Record the movie id, then merge in the per-movie fields collected above.
    convo.meta['movie_idx'] = movie_idx
    convo.meta.update(movie_meta[movie_idx])
# Mid-cell expression: evaluated, but its value is not displayed.
movie_corpus.get_conversation("L609301").meta

with open(data_dir + "raw_script_urls.txt", "r", encoding='utf-8', errors='ignore') as f:
    urls = list(f)

# Map each movie id to the third field of its line; the middle field is ignored.
movie2url = {}
for url_line in urls:
    movie_id, _, url = map(str.strip, url_line.split("+++$+++"))
    movie2url[movie_id] = url

movie_corpus.meta['name'] = "Cornell Movie-Dialogs Corpus"
movie_corpus.meta['url'] = movie2url

# 5. Processing utterance texts
from convokit.text_processing import TextParser
parser = TextParser(verbosity=10000)
movie_corpus = parser.transform(movie_corpus)
# Mid-cell expression: evaluated, but its value is not displayed.
movie_corpus.get_utterance('L666499').retrieve_meta('parsed')

# 6. Saving created datasets
movie_corpus.dump("movie-corpus")
from convokit import meta_index
import os.path
# presumably the directory dump() wrote to -- confirm against ConvoKit's default save location
saved_corpus_path = os.path.join(os.path.expanduser("~"), ".convokit/saved-corpora/movie-corpus")
meta_index(filename=saved_corpus_path)

100%|██████████| 304713/304713 [00:15<00:00, 19795.28it/s] 
  5%|▌         | 4485/83097 [00:00<00:01, 44842.67it/s]

Total number of utterances = 304713


100%|██████████| 83097/83097 [00:01<00:00, 45936.79it/s]


sample conversation 0:
['L1045', 'L1044']
sample conversation 1:
['L985', 'L984']
sample conversation 2:
['L925', 'L924']
sample conversation 3:
['L872', 'L871', 'L870']
sample conversation 4:
['L869', 'L868', 'L867', 'L866']
10000/304713 utterances processed


KeyboardInterrupt: 

In [3]:
# Politeness features on the Wikipedia politeness corpus.
# The corpus is fetched with ConvoKit's `download` helper and loaded from the
# path it returns; the previous argument named a notebook file, which is not a
# corpus location.
# NOTE(review): corpus name inferred from the recorded output (utterance
# '434044' with 'Normalized Score' / 'Binary' / 'Annotations') -- confirm.
wiki_corpus = Corpus(filename=download("wikipedia-politeness-corpus"))

# The parser is constructed here but not applied in this cell.
# NOTE(review): the recorded output already contains a 'parsed' entry,
# presumably shipped with the corpus; if it is missing, run
# `wiki_corpus = parser.transform(wiki_corpus)` before the politeness step.
parser = TextParser(verbosity=1000)

# Annotate every utterance with politeness strategies (markers included), then
# display the metadata of one utterance as the cell's result.
ps = PolitenessStrategies()
wiki_corpus = ps.transform(wiki_corpus, markers=True)
wiki_corpus.get_utterance('434044').meta

{'Normalized Score': 0.6945444785369653,
 'Binary': 1,
 'Annotations': {'A233ONYNWKDIYF': 17,
  'A1QV3X9YMQQ3OQ': 22,
  'A3OW54MEVDKXJL': 17,
  'A1JK9DYKWYZZEK': 17,
  'AZ1RJVNOZFIWV': 18},
 'parsed': [{'rt': 7,
   'toks': [{'tok': 'hey', 'tag': 'UH', 'dep': 'intj', 'up': 1, 'dn': []},
    {'tok': 'mbk', 'tag': 'NNP', 'dep': 'dep', 'up': 7, 'dn': [0]},
    {'tok': ';', 'tag': ':', 'dep': 'punct', 'up': 7, 'dn': []},
    {'tok': 'well', 'tag': 'UH', 'dep': 'intj', 'up': 7, 'dn': []},
    {'tok': ',', 'tag': ',', 'dep': 'punct', 'up': 7, 'dn': []},
    {'tok': 'i', 'tag': 'PRP', 'dep': 'nsubj', 'up': 7, 'dn': []},
    {'tok': "'ve", 'tag': 'VB', 'dep': 'aux', 'up': 7, 'dn': []},
    {'tok': 'got', 'tag': 'VBN', 'dep': 'ROOT', 'dn': [1, 2, 3, 4, 5, 6, 9]},
    {'tok': 'to', 'tag': 'TO', 'dep': 'aux', 'up': 9, 'dn': []},
    {'tok': 'be', 'tag': 'VB', 'dep': 'xcomp', 'up': 7, 'dn': [8, 10]},
    {'tok': 'honest', 'tag': 'JJ', 'dep': 'acomp', 'up': 9, 'dn': []}]},
  {'rt': 3,
   'toks': [{'