In [None]:
import requests, os
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk import bigrams, trigrams
from sklearn.feature_extraction.text import CountVectorizer
from textstat import flesch_reading_ease # exploring text complixity
# for sentiment analysis
from textblob import TextBlob
import spacy

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
folder_path = 'data'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

for i in ['train.jsonl', 'test.jsonl', 'validation.jsonl']:
    filename = os.path.join(folder_path, i)
    if os.path.exists(filename):
        print(f" File {filename} already exists.")
    else:
        url = "https://github.com/DenisPeskov/2020_acl_diplomacy/raw/master/data/"+i
        response = requests.get(url)
        if response.status_code == 200:
            with open(filename, 'wb') as file:
                file.write(response.content)
            print(f"File '{filename}' has been downloaded and saved to '{folder_path}'.")
            # join all three files into a file for feature extraction, EDA and cleaning 
            with open(folder_path + '/all_data.jsonl', 'ab') as file:
                file.write(response.content)
            print(f"File '{filename}' added to all_data file.")
        else:
            print(f"Failed to download {url}. Status code: {response.status_code}")
    

In [None]:
df=pd.read_json('data/all_data.jsonl', lines=True)
df.head(2)

In [None]:
game_ids = []
senders = []
receivers = []
messages = []
sender_labels = []
receiver_labels = []
scores = []


for i, j in df.iterrows():       
    for c, d in enumerate(j['messages']):
        #print(c)
        game_ids.append(j['game_id'])
        senders.append(j['speakers'][c])
        receivers.append(j['receivers'][c])
        messages.append(j['messages'][c])
        sender_labels.append(j['sender_labels'][c])
        receiver_labels.append(j['receiver_labels'][c])
        scores.append(j['game_score'][c])
        
new_df = pd.DataFrame({'game_id': game_ids, 'sender': senders, 'receiver' : receivers, 'message': messages ,
                       'sender_label': sender_labels, 'receiver_label': receiver_labels, 'score': scores })
new_df

In [None]:
new_df[new_df['game_id'] == 5 ]

In [None]:
def text_preprocess(message):
    processed = []   
    for text in message:
        # replaace URLs
        text = re.sub(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)",'URL',text)  
        
        # Replace all non alphabets.
        #text = re.sub( "[^a-zA-Z0-9]", " ", text)  # this will replace emojies as well!
        
        # Remove HTML/XML tags (if any)
        text = re.sub(r'<.*?>', '', text)

        # Remove punctuation and symbols -- (not for now)
        # text = re.sub(r'[^\w\s]', '', text) 

        # Remove numbers -- not for now
        #text = re.sub(r'\d+', '', text)

        # Remove whitespaces (including new lines and tabs)
        text = text.strip().replace("\n", " ").replace("\r", " ").replace("\t", " ")
        
        processed.append(text)       
    return processed

new_df['processed_message'] = text_preprocess(list(new_df['message']))

In [None]:
new_df[new_df['processed_message'] == 'URL']

---
#### Feature Extraction:

+ number of words in the sentence
+ frequency of functional words in the sentence
+ frequency of certain Parts-of-Speech (PoS) tags in the sentence such as:
+ + frequency of pronouns (total, first person, second person, third person)
+ + negations (e.g. use of "not"-type words)
+ + frequency of articles
+ + frequency of prepositions
+ frequency of certain N-grams in the sentence
+ sentence complexity.

+ frequency of words used found in some curated, potentially domain-specific dictionary
+ frequency of words of length > 6
+ sentiment
+ certainty (from self-doubting to absolutely certain)
+ implied causality
+ inclusivity/exclusivity
+ frequency of motion verbs
+ frequency of past tense verbs
+ frequency of future tense verbs
+ word/sentence embeddings (from e.g. bert-base encoder).

In [None]:
# number of words in the sentence
new_df['num_words_sentence'] = new_df['processed_message'].apply(lambda x: len(x.split()))
new_df[['processed_message' , 'num_words_sentence']]

---
#### frequency of functional words in the sentence --> (stop words!?)

In [None]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def count_stop_words(sentence):
    words = sentence.split()
    stop_words_in_sentence = [word for word in words if word.lower() in stop_words]
    return len(stop_words_in_sentence)

new_df['stop_words_count'] = new_df['processed_message'].apply(count_stop_words)
new_df[['processed_message' , 'stop_words_count']]

---
#### frequency of certain Parts-of-Speech (PoS) tags in the sentence

In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#Noun (NN)
#Verb (VB)
#Adjective(JJ)
#Adverb(RB)
#Preposition (IN)
#Conjunction (CC)
#Pronoun(PRP)
#Interjection (INT)

# here i get all pos!
# the result of this function would be json which wont be really useful as a feature
def pos_tag_frequency(sentence):
    words = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    #select the second element in each tuple
    tags = [tag for word, tag in pos_tags if tag == 'PRP']
    tag_freq = Counter(tags)
    return tag_freq

#new_df['pos_tag_freq'] = new_df['processed_message'].apply(pos_tag_frequency)
#new_df[['processed_message' , 'pos_tag_freq']]

pron_freq = []
article_freq = []
prep_freq = []
adj_freq = []

for i, j in new_df.iterrows():       
    words = word_tokenize(j['processed_message'])
    pos_tags = nltk.pos_tag(words)
    pron_freq.append(len([tag for word, tag in pos_tags if tag == 'PRP']))
    article_freq.append(sum([1 for word in words if word.lower() in ['the', 'a', 'an']]))
    prep_freq.append(len([tag for word, tag in pos_tags if tag == 'IN']))
    adj_freq.append(len([tag for word, tag in pos_tags if tag == 'JJ']))
        
new_df['pron_freq'] = pron_freq
new_df['article_freq'] = article_freq
new_df['prep_freq'] = prep_freq
new_df['adj_freq'] = adj_freq


In [None]:
new_df[['processed_message' , 'pron_freq', 'article_freq', 'prep_freq', 'adj_freq']]

---
#### frequency of certain N-grams in the sentence

In [None]:
# should actually check how many of top n-grams each sentenc contains?

In [105]:
#identify the top N n-grams from text
def get_top_ngram(corpus, n=None, m=20):
    vec = CountVectorizer(ngram_range=(n, n)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]       
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:m]

In [106]:
top_n_bigrams=get_top_ngram(new_df['processed_message'],2)
top_n_bigrams

[('if you', 1320),
 ('in the', 1175),
 ('going to', 891),
 ('you re', 761),
 ('to be', 691),
 ('want to', 667),
 ('do you', 623),
 ('you can', 570),
 ('we can', 553),
 ('would be', 515),
 ('need to', 469),
 ('to do', 458),
 ('to get', 454),
 ('are you', 447),
 ('of the', 442),
 ('that you', 437),
 ('you and', 431),
 ('you have', 419),
 ('me to', 412),
 ('on the', 411)]

In [107]:
top_n_trigrams=get_top_ngram(new_df['processed_message'],3)
top_n_trigrams

[('be able to', 239),
 ('do you think', 201),
 ('you want to', 199),
 ('what do you', 190),
 ('let me know', 162),
 ('if you re', 158),
 ('in the fall', 144),
 ('support you into', 144),
 ('to work with', 142),
 ('at this point', 137),
 ('if you want', 131),
 ('in the north', 127),
 ('going to be', 119),
 ('do you want', 115),
 ('don want to', 112),
 ('the north sea', 111),
 ('it would be', 110),
 ('we need to', 110),
 ('what are your', 106),
 ('is going to', 104)]

In [135]:
# check if the sentence includes top bigrams or trigrams:
top_bigrams = [i[0] for i in top_n_bigrams]
top_trigrams =  [i[0] for i in top_n_trigrams]

def count_ngrams(sentence, n):
    words = word_tokenize(sentence)
    separator = ' '
    if n == 2:
        n_grams = sum([1 if separator.join(i) in top_bigrams else 0 for i in list(bigrams(words))])
    elif n == 3:
        n_grams = sum([1 if separator.join(i) in top_trigrams else 0 for i in list(trigrams(words))])
    else:
        return 0
    return n_grams

new_df['top_bigram_freq'] = new_df['processed_message'].apply(lambda x: count_ngrams(x, 2))
new_df['top_trigram_freq'] = new_df['processed_message'].apply(lambda x: count_ngrams(x, 3))

In [136]:
new_df[['processed_message' , 'top_bigram_freq' , 'top_trigram_freq']]

Unnamed: 0,processed_message,top_bigram_freq,top_trigram_freq
0,Germany! Just the person I want to speak with...,1,0
1,"You've whet my appetite, Italy. What's the sug...",0,0
2,👍,0,0
3,It seems like there are a lot of ways that cou...,0,0
4,"Yeah, I can’t say I’ve tried it and it works, ...",2,0
...,...,...,...
17284,You and Austria are the most importand. Italy ...,6,1
17285,"Hello, Turkey?",0,0
17286,Hello???,0,0
17287,"Helloooo, turkey",0,0


In [137]:
new_df[new_df['top_trigram_freq'] > 0]

Unnamed: 0,game_id,sender,receiver,message,sender_label,receiver_label,score,processed_message,num_words_sentence,stop_words_count,...,sentiment,motion_verb_count,num_past_tense,num_future_tense,pron_freq,article_freq,prep_freq,adj_freq,top_bigram_freq,top_trigram_freq
12,1,italy,germany,"Well, at least I have an idea of who to trust....",True,True,3,"Well, at least I have an idea of who to trust....",116,57,...,0.089216,7,0,0,10,10,13,8,1,1
25,1,italy,germany,How are things going with England? I think tha...,True,True,4,How are things going with England? I think tha...,20,11,...,0.083333,1,0,0,3,1,3,1,0,1
47,1,italy,germany,Two bits of advice: #1 I suggest you tell Russ...,True,NOANNOTATION,6,Two bits of advice: #1 I suggest you tell Russ...,88,42,...,0.050000,5,1,0,15,1,8,3,2,2
48,1,italy,germany,#2 Here is the move set I would suggest right ...,True,NOANNOTATION,6,#2 Here is the move set I would suggest right ...,82,37,...,0.261905,3,0,0,9,1,7,1,1,1
51,1,germany,italy,I think me and England are really on the same ...,True,True,4,I think me and England are really on the same ...,66,33,...,0.129563,2,1,0,7,3,11,4,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17228,11,turkey,russia,"Well, I have no issue with you guaranteeing yo...",True,NOANNOTATION,4,"Well, I have no issue with you guaranteeing yo...",58,31,...,0.000000,3,1,0,11,3,5,2,2,1
17251,11,france,russia,Germany is going to move into sweden with engl...,True,NOANNOTATION,6,Germany is going to move into sweden with engl...,17,5,...,0.000000,2,0,1,0,0,2,2,1,1
17256,11,france,england,"Hey England, how are you? I would like to disc...",False,True,3,"Hey England, how are you? I would like to disc...",81,39,...,0.270833,3,0,0,11,5,6,6,4,1
17265,11,england,france,how do you feel about supporting me into the n...,True,True,2,how do you feel about supporting me into the n...,11,7,...,0.250000,0,0,0,2,1,2,1,1,1


---
#### sentence complexity

In [None]:
# flesch_reading_ease => 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (total_syllables / total_words)
new_df['text_ease'] = new_df['processed_message'].apply(lambda x : flesch_reading_ease(x))
new_df[new_df['text_ease']== new_df['text_ease'].max()]

linguistic complexity is a broad field and there are many other factors could be considered, like the use of passive voice, nominalizations, advanced punctuation, lexical density, and so on.

In [None]:
#frequency of words used found in some curated, potentially domain-specific dictionary
# This is challenging, which domain should be explored??
# wordnet from nltk.corpus might be useful if know which domain we are exploring!

In [None]:
#frequency of words of length > 6
#get the number of words longer than 6 in each message
new_df['num_long_words'] = new_df['processed_message'].str.split().apply(lambda x : [1 if len(i) > 6 else 0 for i in x]).map(lambda x: sum(x))
new_df[['processed_message' , 'num_long_words']]

In [None]:
#sentiment
# textblob is more sensitive to negative sentiments than nltk
new_df['sentiment'] = new_df['processed_message'].apply(lambda x: TextBlob(x).sentiment.polarity)  
new_df[['processed_message' , 'sentiment']]

---
#### certainty (from self-doubting to absolutely certain)

This one cannot be implemented simply using nlp tools like nltk or by ML models eaither fine-tunned pre-traiined ones or those traind on a labled dataset.

---
#### implied causality

---
#### inclusivity/exclusivity

---
#### frequency of motion verbs

In [138]:
# get the list of all verbs in the dataset
nlp = spacy.load("en_core_web_sm")
def get_unique_verbs(texts):
    unique_verbs = set()  #  set to store unique verbs
    for text in texts:
        doc = nlp(text)
        # add the lemmatized form of the word to the set if it is a verb.
        unique_verbs.update(token.lemma_ for token in doc if token.pos_ == "VERB")
    return unique_verbs

verbs = list(get_unique_verbs(new_df['processed_message']))

In [139]:
print(verbs)

['copy', 'accept', 'havta', 'apply', 'approach', 'view', 'protect', 'wack', 'would', 'invite', 'live', 'trample', 'sus', 'lower', 'plug', 'raise', 'identify', 'insult', 'debate', 'hardcappe', 'stillcgold', 'seek', 'hatch', 'go', 'crumble', 'unapose', 'ease', 'decline', 'oblige', 'vindicate', 'sit', 'starve', 'takey', 'take', 'schedule', 'pivot', 'dispose', 'board', 'outta', 'neighbor', 'find', 'bait', 'return', 'finish', 'bank', 'propose-', 'garunteee', 'offset', 'relish', 'spare', 'observe', 'beef', 'volunteer', 'reread', 'cast', 'scan', 'exceed', 'gain', 'say', 'prefer', 'begrudge', 'side', 'yank', 'sil', 'announce', 'break', 'see-', 'dunno', 'doxxe', 'hesitate', 'dip', 'die', 'solve', 'waffle', 'gear', 'flower', 'breakup', 'CANT', 'command', 'wanna', 'laugh', 'read', 'extract', 'know', 'accord', 'bar', 'leverage', 'induce', 'devote', 'present', 'alarm', 'devour', 'pile', 'court', 'stomp', 'last', 'crack', 'recall', 'establish', 'prod', 'compound', 'win', 'disentangle', 'reserve', 'f

In [140]:
# extract motion verbs from the list of all verbs using Chat-Gpt
motion_verbs = ['pitch', 'run', 'shudder', 'chase', 'track', 'arise', 'shrink', 'ride', 'roll', 'fly', 'bounce', 'mow', 'cut',
                'regroup', 'chill', 'overrun', 'descend', 'explode', 'switch', 'hug', 'eat', 'jiggle', 'dance', 'park', 'sit', 
                'seek', 'return', 'gallivant', 'transit', 'dive', 'pop', 'bump', 'send', 'steal', 'wash', 'walk', 'swing', 
                'float', 'squeeze', 'scroll', 'stick', 'poke', 'catch', 'evacuate', 'melt', 'raise', 'board', 'pour', 'move', 
                'scramble', 'hang', 'trigger', 'fling', 'crash', 'start', 'dry', 'answer', 'stall', 'peel', 'capture', 'run', 
                'box', 'end', 'look', 'try', 'hover', 'jump', 'acquire', 'tie', 'dip', 'recover', 'explore', 'fish', 'hop',
                'take', 'execute', 'rob', 'launch', 'arrive', 'collect', 'harm', 'do', 'hand', 'gather', 'give', 'split', 
                'trip', 'close', 'part', 'snap', 'bury', 'rush', 'greet', 'weigh', 'swing', 'pay', 'enter', 'put', 'separate',
                'unite', 'fix', 'wrap', 'skate', 'tell', 'lead', 'rescue', 'eject', 'perceive', 'rum', 'loosen', 'welcome',
                'break', 'tread', 'speak', 'freak', 'protect', 'impose', 'explain', 'attend', 'prepare', 'tend', 'upgrade', 
                'advance', 'sneak', 'kick', 'consume', 'continue', 'train', 'drag', 'hint', 'barge', 'volunteer', 'swoop',
                'rotate', 'head', 'live', 'match', 'exit', 'ward', 'string', 'tease', 'side', 'offer', 'pull', 'turn', 'shut', 
                'laugh', 'extract', 'click', 'overcome', 'race', 'grill', 'lay', 'declare', 'ski', 'face', 'snag', 'deceive',
                'survive', 'design', 'spin', 'build', 'discard', 'slam', 'reconsider', 'have', 'pin', 'realize', 'foreclose', 
                'emerge', 'provide', 'picture', 'ascertain', 'pass', 'work', 'cascade', 'emerge', 'contact', 'alter', 'receive',
                'dig', 'interpret', 'mop', 'spread', 'need', 'plan', 'smell', 'hit', 'object', 'block', 'pretend', 'empty', 
                'blow', 'bind', 'stab', 'reason', 'watch', 'standby', 'crave', 'verify', 'corner', "approach","arrive","ascend",
                "backpedal","balance","bounce","climb","crawl","creep","dance","descend","dodge","drag","drift","drive","fall",
                "float","fly","gallop","glide","hop","jump","kick","leap","march","move","paddle","pedal","pull","push","run",
                "sail","scoot","skate","slide","spin","split","sprint","stand","stop","stroll","swim","swing","trot","walk",
                "waltz","wiggle","zoom",]

In [141]:
nlp = spacy.load("en_core_web_sm")
def count_motion_verbs(text):
    doc = nlp(text)
    # count motion verbs
    count = sum(1 for token in doc if token.pos_ == "VERB" and token.lemma_ in motion_verbs)
    return count

new_df['motion_verb_count'] = new_df['processed_message'].apply(count_motion_verbs)

In [142]:
new_df[['processed_message', 'motion_verb_count']]

Unnamed: 0,processed_message,motion_verb_count
0,Germany! Just the person I want to speak with...,4
1,"You've whet my appetite, Italy. What's the sug...",0
2,👍,0
3,It seems like there are a lot of ways that cou...,3
4,"Yeah, I can’t say I’ve tried it and it works, ...",11
...,...,...
17284,You and Austria are the most importand. Italy ...,5
17285,"Hello, Turkey?",0
17286,Hello???,0
17287,"Helloooo, turkey",0


---
#### frequency of past tense verbs

In [143]:
nltk.download('averaged_perceptron_tagger')
def get_past_tense_verbs(text):
    word_tokens = word_tokenize(text)
    tagged = nltk.pos_tag(word_tokens)
    past_tense_verbs = [word for word, pos in tagged if pos == 'VBD']
    return len(past_tense_verbs)

new_df['num_past_tense'] = new_df['processed_message'].apply(get_past_tense_verbs)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\766619\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [144]:
new_df[['processed_message', 'num_past_tense']]

Unnamed: 0,processed_message,num_past_tense
0,Germany! Just the person I want to speak with...,2
1,"You've whet my appetite, Italy. What's the sug...",0
2,👍,0
3,It seems like there are a lot of ways that cou...,0
4,"Yeah, I can’t say I’ve tried it and it works, ...",3
...,...,...
17284,You and Austria are the most importand. Italy ...,0
17285,"Hello, Turkey?",0
17286,Hello???,0
17287,"Helloooo, turkey",0


---
#### frequency of future tense verbs

In [145]:
# Load the English language model
nlp = spacy.load('en_core_web_sm')

In [146]:
def get_future_tense_verbs(text):
    doc = nlp(text)
    future_tense_verbs = []
    for token in doc:
        # check if the token is not the first or the last in the doc
        if token.i > 0 and token.i < len(doc) - 1:
            # handling the case for "will" and "shall"
            if token.lower_ in ["will", "shall"]:
                next_token = token.nbor()  # The token after "will" or "shall"
                if next_token.pos_ == "VERB":
                    future_tense_verbs.append(next_token.text)
            # handling the case for "is going to" --------------- should also handle "are going to" ------------
            elif token.lower_ == "going" and token.nbor(-1).lower_ == "is" and token.nbor(1).lower_ == "to":
                next_token = token.nbor(1)  # the token after "going", expected "to"
                next_next_token = next_token.nbor()  # the token after "to"
                if next_next_token.pos_ == "VERB":
                    future_tense_verbs.append(next_next_token.text)
    # for the case where "will" or "shall" is the last word in a sentence
    for sent in doc.sents:
        last_word = sent[-1]
        if last_word.lower_ in ["will", "shall"]:
            # handle cases or add context-specific actions
            pass  # or add to list if contextually correct in sentences like "Tomorrow, I will."
    return len(future_tense_verbs)
 

new_df['num_future_tense'] = new_df['processed_message'].apply(get_future_tense_verbs)

In [147]:
new_df[['processed_message', 'num_future_tense']]

Unnamed: 0,processed_message,num_future_tense
0,Germany! Just the person I want to speak with...,0
1,"You've whet my appetite, Italy. What's the sug...",0
2,👍,0
3,It seems like there are a lot of ways that cou...,0
4,"Yeah, I can’t say I’ve tried it and it works, ...",0
...,...,...
17284,You and Austria are the most importand. Italy ...,0
17285,"Hello, Turkey?",0
17286,Hello???,0
17287,"Helloooo, turkey",0


In [148]:
new_df[new_df['num_future_tense'] > 0]

Unnamed: 0,game_id,sender,receiver,message,sender_label,receiver_label,score,processed_message,num_words_sentence,stop_words_count,...,sentiment,motion_verb_count,num_past_tense,num_future_tense,pron_freq,article_freq,prep_freq,adj_freq,top_bigram_freq,top_trigram_freq
13,1,italy,germany,Just an FYI: I’ve now had both England and Fra...,True,True,3,Just an FYI: I’ve now had both England and Fra...,80,44,...,-0.053571,4,1,1,11,4,7,2,3,0
63,1,germany,italy,Okay—sorry for being nosy! I will try for bur ...,True,True,4,Okay—sorry for being nosy! I will try for bur ...,18,11,...,0.000000,1,0,1,2,1,3,1,1,0
80,1,germany,italy,"You know italy, I think we *do* need to coordi...",True,True,5,"You know italy, I think we *do* need to coordi...",50,25,...,0.183333,3,0,1,4,1,7,2,1,0
84,1,germany,italy,It looks like England's not willing to try for...,True,True,5,It looks like England's not willing to try for...,48,19,...,-0.031250,5,0,1,6,2,7,3,2,0
98,1,italy,germany,I mean it sincerely. I think that England will...,True,True,7,I mean it sincerely. I think that England will...,40,18,...,0.177500,2,0,1,8,0,5,4,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17221,11,russia,turkey,Germany will help take care of Austria if we w...,True,True,4,Germany will help take care of Austria if we w...,13,6,...,0.000000,1,0,1,1,0,2,0,1,0
17235,11,turkey,russia,Sometimes that will happen when no one hears f...,True,True,4,Sometimes that will happen when no one hears f...,19,10,...,0.000000,0,0,1,0,1,3,1,0,0
17251,11,france,russia,Germany is going to move into sweden with engl...,True,NOANNOTATION,6,Germany is going to move into sweden with engl...,17,5,...,0.000000,2,0,1,0,0,2,2,1,1
17262,11,france,england,"Ok, I didn't think Germany would betray me but...",False,True,8,"Ok, I didn't think Germany would betray me but...",31,16,...,0.650000,5,2,1,5,0,1,0,0,0


---
#### word/sentence embeddings (from e.g. bert-base encoder).