### Politeness prediction with ConvoKit

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict

In [2]:
try:
    import convokit
except ModuleNotFoundError:
    !pip install convokit

In [3]:
import convokit

In [4]:
from convokit import Corpus, Speaker, Utterance
from convokit import download

In [5]:
from pandas import DataFrame
from typing import List, Dict, Set

In [24]:
import random
from sklearn import svm
from scipy.sparse import csr_matrix
from sklearn.metrics import classification_report

In [27]:
from convokit import Classifier

### Train the initial classifer on the Wikipedia politeness data

In [30]:
# Downloading the wikipedia portion of annotated data
wiki_corpus = Corpus(download("wikipedia-politeness-corpus"))

Dataset already exists at /root/.convokit/downloads/wikipedia-politeness-corpus


In [31]:
parser = TextParser(verbosity=1000)
wiki_corpus = parser.transform(wiki_corpus)

ps = PolitenessStrategies()
wiki_corpus = ps.transform(wiki_corpus, markers=True)

1000/4353 utterances processed
2000/4353 utterances processed
3000/4353 utterances processed
4000/4353 utterances processed
4353/4353 utterances processed


As a prestep, we subset the corpus as we will only consider the polite vs. impolite class for prediction (i.e., those with "Binary" field being either +1 or -1).

In [32]:
binary_corpus = Corpus(utterances=[utt for utt in wiki_corpus.iter_utterances()])

##### Training the ConvoKit classifier

In [40]:
test_ids = binary_corpus.get_utterance_ids()[-100:]
train_corpus = Corpus(utterances=[utt for utt in binary_corpus.iter_utterances() if utt.id not in test_ids])
test_corpus = Corpus(utterances=[utt for utt in binary_corpus.iter_utterances() if utt.id in test_ids])
print("train size = {}, test size = {}".format(len(train_corpus.get_utterance_ids()),
                                               len(test_corpus.get_utterance_ids())))

train size = 4253, test size = 100


In [41]:
clf = Classifier(obj_type="utterance",
                        pred_feats=["politeness_strategies"],
                        labeller=lambda utt: utt.meta['Binary'] == 1)
clf.fit(train_corpus)

Initialized default classification model (standard scaled logistic regression).


<convokit.classifier.classifier.Classifier at 0x78b7698ce710>

In [42]:
test_pred = clf.transform(test_corpus)

### Predict on the emails

In [None]:
# get email data
file_folder = "normal_emails"
file_path = "/content/drive/MyDrive/COMP550/convokit_corpora"

email_corpus = Corpus(f"{file_path}/{file_folder}")

In [None]:
parser = TextParser(verbosity=1000)
email_corpus = parser.transform(email_corpus)

ps = PolitenessStrategies()
email_corpus = ps.transform(email_corpus)

In [162]:
# get politeness predictions for emails
email_pred = clf.transform(email_corpus)

In [163]:
politeness_df = clf.summarize(email_pred).sort_index().rename(columns={'prediction': "Politeness", 'pred_score': "Politeness Score"})
politeness_df

Unnamed: 0_level_0,Politeness,Politeness Score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2016,1,0.737915
5221,0,0.220352
5291,0,0.085171
8072,0,0.131991
8074,0,0.298256
...,...,...
512652,0,0.092979
512719,0,0.220352
514115,0,0.315406
514641,0,0.117411


In [164]:
orig_data_filepath = "/content/drive/MyDrive/COMP550/data"
file_name = f"{file_folder}.csv"

df = pd.read_csv(f"{orig_data_filepath}/{file_name}", index_col='Original Index')

In [165]:
df = df.join(politeness_df)

In [167]:
politeness_data_filepath = "/content/drive/MyDrive/COMP550/data_with_politeness"
df.to_csv(f"{politeness_data_filepath}/{file_name}")