In [51]:
import requests, os
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk import bigrams, trigrams
from sklearn.feature_extraction.text import CountVectorizer
from textstat import flesch_reading_ease # exploring text complixity
# for sentiment analysis
from textblob import TextBlob
import spacy

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [15]:
folder_path = 'data'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    
if os.path.exists(folder_path + '/train.jsonl'):
    print(f"File already exists.")
else:
    url = "https://github.com/DenisPeskov/2020_acl_diplomacy/raw/master/data/train.jsonl"
    response = requests.get(url)

    if response.status_code == 200:
        filename = os.path.join(folder_path, os.path.basename(url))
        with open(filename, 'wb') as file:
            file.write(response.content)
        print(f"File '{filename}' has been downloaded and saved to '{folder_path}'.")
    else:
        print(f"Failed to download {url}. Status code: {response.status_code}")

File already exists.


In [16]:
df=pd.read_json('data/train.jsonl', lines=True)
df.head(2)

Unnamed: 0,messages,sender_labels,receiver_labels,speakers,receivers,absolute_message_index,relative_message_index,seasons,years,game_score,game_score_delta,players,game_id
0,[Germany!\n\nJust the person I want to speak w...,"[True, True, True, True, True, True, True, Tru...","[True, True, True, True, NOANNOTATION, NOANNOT...","[italy, germany, italy, germany, italy, italy,...","[germany, italy, germany, italy, germany, germ...","[74, 76, 86, 87, 89, 92, 97, 117, 119, 121, 12...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[Spring, Spring, Spring, Spring, Spring, Sprin...","[1901, 1901, 1901, 1901, 1901, 1901, 1901, 190...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[italy, germany]",1
1,[Hello there! What's your general plan for thi...,"[True, False, True, False, True, True, True, T...","[True, True, True, True, True, NOANNOTATION, T...","[austria, italy, austria, italy, italy, austri...","[italy, austria, italy, austria, austria, ital...","[1, 67, 71, 73, 98, 99, 101, 179, 181, 185, 18...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[Spring, Spring, Spring, Spring, Spring, Sprin...","[1901, 1901, 1901, 1901, 1901, 1901, 1901, 190...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 4, 4, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1, -1, -...","[italy, austria]",1


In [17]:
game_ids = []
senders = []
receivers = []
messages = []
sender_labels = []
receiver_labels = []
scores = []


for i, j in df.iterrows():       
    for c, d in enumerate(j['messages']):
        #print(c)
        game_ids.append(j['game_id'])
        senders.append(j['speakers'][c])
        receivers.append(j['receivers'][c])
        messages.append(j['messages'][c])
        sender_labels.append(j['sender_labels'][c])
        receiver_labels.append(j['receiver_labels'][c])
        scores.append(j['game_score'][c])
        
new_df = pd.DataFrame({'game_id': game_ids, 'sender': senders, 'receiver' : receivers, 'message': messages ,
                       'sender_label': sender_labels, 'receiver_label': receiver_labels, 'score': scores })
new_df

Unnamed: 0,game_id,sender,receiver,message,sender_label,receiver_label,score
0,1,italy,germany,Germany!\n\nJust the person I want to speak wi...,True,True,3
1,1,germany,italy,"You've whet my appetite, Italy. What's the sug...",True,True,3
2,1,italy,germany,👍,True,True,3
3,1,germany,italy,It seems like there are a lot of ways that cou...,True,True,3
4,1,italy,germany,"Yeah, I can’t say I’ve tried it and it works, ...",True,NOANNOTATION,3
...,...,...,...,...,...,...,...
13127,10,france,england,Is there any way of me actually ending this co...,True,True,4
13128,10,france,england,Can we agree on peace? What are your demands?,True,True,4
13129,10,england,france,"Neutrality in exchange for current holdings, S...",True,False,5
13130,10,france,england,"Thats a bit too much, can I keep Spain and i h...",True,True,4


In [18]:
new_df[new_df['game_id'] == 5 ]

Unnamed: 0,game_id,sender,receiver,message,sender_label,receiver_label,score
7834,5,germany,italy,"So, Austria crossed some lines we'd negotiated...",True,True,4
7835,5,austria,italy,Hello Italy! I chose you as the first country ...,True,True,3
7836,5,austria,italy,"All right I see how it works now :).\nAnyways,...",True,True,3
7837,5,austria,italy,I firmly believe that Italy/Austria is one of ...,True,True,3
7838,5,austria,italy,I happily await your response :),True,True,3
...,...,...,...,...,...,...,...
8256,5,turkey,france,Of course I can choose to develop north instea...,True,True,10
8257,5,france,turkey,How can I entice you to do that?,True,True,12
8258,5,france,turkey,Are you on board for a 3 way draw?,True,True,10
8259,5,turkey,france,I'm confused. What is there to gain through a ...,True,True,11


In [37]:
def text_preprocess(message):
    processed = []   
    for text in message:
        # replaace URLs
        text = re.sub(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)",' <URL>',text)  
        
        # Replace all non alphabets.
        #text = re.sub( "[^a-zA-Z0-9]", " ", text)  # this will replace emojies as well!
        
        # Remove HTML/XML tags (if any)
        text = re.sub(r'<.*?>', '', text)

        # Remove punctuation and symbols (not for now)
        # text = re.sub(r'[^\w\s]', '', text) 

        # Remove numbers
        text = re.sub(r'\d+', '', text)

        # Remove whitespaces (including new lines and tabs)
        text = text.strip().replace("\n", " ").replace("\r", " ").replace("\t", " ")
        
        processed.append(text)       
    return processed

new_df['processed_message'] = text_preprocess(list(new_df['message']))

In [41]:
new_df[new_df['processed_message'] == '']

Unnamed: 0,game_id,sender,receiver,message,sender_label,receiver_label,score,processed_message,num_words_sentence,stop_words_count,pos_tag_freq,bigram_freq,trigram_freq,text_complexity
3112,2,italy,austria,https://www.stitcher.com/podcast/diplomacy-gam...,True,True,3,,0,0,{},{},{},206.84
3137,2,austria,italy,https://diplomacy.fandom.com/wiki/The_Blue_Wat...,True,True,3,,0,0,{},{},{},206.84
3164,2,austria,italy,https://discord.gg/P5vyvz8,True,True,3,,0,0,{},{},{},206.84
4323,2,austria,germany,https://discord.gg/P5vyvz8,True,True,5,,0,0,{},{},{},206.84
5491,2,austria,russia,https://discord.gg/P5vyvz8,True,True,5,,0,0,{},{},{},206.84
6819,3,germany,england,https://m.youtube.com/watch?v=VBmMU_iwe6U,True,True,3,,0,0,{},{},{},206.84
8328,6,germany,england,https://www.backstabbr.com/sandbox/62031255626...,True,True,3,,0,0,{},{},{},206.84
8600,6,austria,turkey,https://tenor.com/view/turkey-slice-cut-thanks...,True,True,3,,0,0,{},{},{},206.84
9891,8,russia,italy,https://tenor.com/view/why-huh-but-why-gif-131...,True,NOANNOTATION,5,,0,0,{},{},{},206.84
10353,8,russia,turkey,https://media.tenor.com/images/d74df5ad5b63cc9...,True,NOANNOTATION,5,,0,0,{},{},{},206.84


---
#### Feature Extraction:

+ number of words in the sentence
+ frequency of functional words in the sentence
+ frequency of certain Parts-of-Speech (PoS) tags in the sentence such as:
+ + frequency of pronouns (total, first person, second person, third person)
+ + negations (e.g. use of "not"-type words)
+ + frequency of articles
+ + frequency of prepositions
+ frequency of certain N-grams in the sentence
+ sentence complexity.

+ frequency of words used found in some curated, potentially domain-specific dictionary
+ frequency of words of length > 6
+ sentiment
+ certainty (from self-doubting to absolutely certain)
+ implied causality
+ inclusivity/exclusivity
+ frequency of motion verbs
+ frequency of past tense verbs
+ frequency of future tense verbs
+ word/sentence embeddings (from e.g. bert-base encoder).

In [20]:
# number of words in the sentence
new_df['num_words_sentence'] = new_df['processed_message'].apply(lambda x: len(x.split()))
new_df[['processed_message' , 'num_words_sentence']]

Unnamed: 0,processed_message,num_words_sentence
0,Germany! Just the person I want to speak with...,87
1,"You've whet my appetite, Italy. What's the sug...",8
2,👍,1
3,It seems like there are a lot of ways that cou...,31
4,"Yeah, I can’t say I’ve tried it and it works, ...",127
...,...,...
13127,Is there any way of me actually ending this co...,10
13128,Can we agree on peace? What are your demands?,9
13129,"Neutrality in exchange for current holdings, S...",22
13130,"Thats a bit too much, can I keep Spain and i h...",17


In [21]:
# frequency of functional words in the sentence --> (stop words)

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def count_stop_words(sentence):
    words = sentence.split()
    stop_words_in_sentence = [word for word in words if word.lower() in stop_words]
    return len(stop_words_in_sentence)

new_df['stop_words_count'] = new_df['processed_message'].apply(count_stop_words)
new_df[['processed_message' , 'stop_words_count']]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\766619\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,processed_message,stop_words_count
0,Germany! Just the person I want to speak with...,36
1,"You've whet my appetite, Italy. What's the sug...",3
2,👍,0
3,It seems like there are a lot of ways that cou...,13
4,"Yeah, I can’t say I’ve tried it and it works, ...",58
...,...,...
13127,Is there any way of me actually ending this co...,6
13128,Can we agree on peace? What are your demands?,6
13129,"Neutrality in exchange for current holdings, S...",10
13130,"Thats a bit too much, can I keep Spain and i h...",8


In [22]:
#frequency of certain Parts-of-Speech (PoS) tags in the sentence such as:
# here i get all pos nnot just most commons ones!

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def pos_tag_frequency(sentence):
    words = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    # tags only (not the words), so select the second element in each tuple
    tags = [tag for word, tag in pos_tags]
    tag_freq = Counter(tags)
    return tag_freq

new_df['pos_tag_freq'] = new_df['processed_message'].apply(pos_tag_frequency)
new_df[['processed_message' , 'pos_tag_freq']]

#Noun (NN)
#Verb (VB)
#Adjective(JJ)
#Adverb(RB)
#Preposition (IN)
#Conjunction (CC)
#Pronoun(PRP)
#Interjection (INT)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\766619\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\766619\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,processed_message,pos_tag_freq
0,Germany! Just the person I want to speak with...,"{'NNP': 3, '.': 8, 'DT': 8, 'NN': 11, 'PRP': 1..."
1,"You've whet my appetite, Italy. What's the sug...","{'PRP': 1, 'VBP': 1, 'VB': 1, 'PRP$': 1, 'NN':..."
2,👍,{'NN': 1}
3,It seems like there are a lot of ways that cou...,"{'PRP': 5, 'VBZ': 1, 'IN': 4, 'EX': 1, 'VBP': ..."
4,"Yeah, I can’t say I’ve tried it and it works, ...","{'UH': 1, ',': 7, 'PRP': 25, 'MD': 4, 'VB': 9,..."
...,...,...
13127,Is there any way of me actually ending this co...,"{'VBZ': 1, 'EX': 1, 'DT': 2, 'NN': 2, 'IN': 1,..."
13128,Can we agree on peace? What are your demands?,"{'MD': 1, 'PRP': 1, 'VB': 1, 'IN': 1, 'NN': 1,..."
13129,"Neutrality in exchange for current holdings, S...","{'NNP': 5, 'IN': 4, 'NN': 1, 'JJ': 1, 'NNS': 3..."
13130,"Thats a bit too much, can I keep Spain and i h...","{'VB': 3, 'DT': 2, 'NN': 2, 'RB': 1, 'JJ': 2, ..."


In [25]:
# frequency of certain N-grams in the sentence
def count_ngrams(sentence, n):
    words = word_tokenize(sentence)
    if n == 2:
        n_grams = list(bigrams(words))
    elif n == 3:
        n_grams = list(trigrams(words))
    else:
        return Counter()
    n_gram_freq = Counter(n_grams)
    return n_gram_freq

new_df['bigram_freq'] = new_df['processed_message'].apply(lambda x: count_ngrams(x, 2))
new_df['trigram_freq'] = new_df['processed_message'].apply(lambda x: count_ngrams(x, 3))
new_df[['processed_message' , 'bigram_freq' , 'trigram_freq']]

Unnamed: 0,processed_message,bigram_freq,trigram_freq
0,Germany! Just the person I want to speak with...,"{('Germany', '!'): 1, ('!', 'Just'): 1, ('Just...","{('Germany', '!', 'Just'): 1, ('!', 'Just', 't..."
1,"You've whet my appetite, Italy. What's the sug...","{('You', ''ve'): 1, (''ve', 'whet'): 1, ('whet...","{('You', ''ve', 'whet'): 1, (''ve', 'whet', 'm..."
2,👍,{},{}
3,It seems like there are a lot of ways that cou...,"{('It', 'seems'): 1, ('seems', 'like'): 1, ('l...","{('It', 'seems', 'like'): 1, ('seems', 'like',..."
4,"Yeah, I can’t say I’ve tried it and it works, ...","{('Yeah', ','): 1, (',', 'I'): 2, ('I', 'can')...","{('Yeah', ',', 'I'): 1, (',', 'I', 'can'): 1, ..."
...,...,...,...
13127,Is there any way of me actually ending this co...,"{('Is', 'there'): 1, ('there', 'any'): 1, ('an...","{('Is', 'there', 'any'): 1, ('there', 'any', '..."
13128,Can we agree on peace? What are your demands?,"{('Can', 'we'): 1, ('we', 'agree'): 1, ('agree...","{('Can', 'we', 'agree'): 1, ('we', 'agree', 'o..."
13129,"Neutrality in exchange for current holdings, S...","{('Neutrality', 'in'): 1, ('in', 'exchange'): ...","{('Neutrality', 'in', 'exchange'): 1, ('in', '..."
13130,"Thats a bit too much, can I keep Spain and i h...","{('Thats', 'a'): 1, ('a', 'bit'): 1, ('bit', '...","{('Thats', 'a', 'bit'): 1, ('a', 'bit', 'too')..."


In [28]:
#identify the top N n-grams from text
def get_top_ngram(corpus, n=None, m=10):
    vec = CountVectorizer(ngram_range=(n, n)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]       
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:m]

top_n_bigrams=get_top_ngram(new_df['processed_message'],2)
top_n_bigrams

[('if you', 991),
 ('in the', 906),
 ('going to', 723),
 ('you re', 613),
 ('to be', 538),
 ('want to', 536),
 ('do you', 473),
 ('you can', 423),
 ('we can', 418),
 ('need to', 365)]

In [None]:
# should actually check how many of top n-grams each sentenc contains?

In [43]:
# sentence complexity
# flesch_reading_ease => 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (total_syllables / total_words)
new_df['text_ease'] = new_df['processed_message'].apply(lambda x : flesch_reading_ease(x))
new_df[new_df['text_ease']== new_df['text_ease'].max()]

Unnamed: 0,game_id,sender,receiver,message,sender_label,receiver_label,score,processed_message,num_words_sentence,stop_words_count,pos_tag_freq,bigram_freq,trigram_freq,text_complexity,text_ease
2,1,italy,germany,👍,True,True,3,👍,1,0,{'NN': 1},{},{},206.84,206.84
143,1,italy,germany,🤗,True,True,8,🤗,1,0,{'NN': 1},{},{},206.84,206.84
197,1,italy,germany,👍,True,True,10,👍,1,0,{'NN': 1},{},{},206.84,206.84
232,1,italy,germany,👍,True,True,10,👍,1,0,{'NN': 1},{},{},206.84,206.84
349,1,italy,austria,🤗,True,True,5,🤗,1,0,{'NN': 1},{},{},206.84,206.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11957,9,turkey,russia,👍,True,True,4,👍,1,0,{'NN': 1},{},{},206.84,206.84
12455,10,england,italy,http://www.diplomacy-archive.com/resources/str...,True,NOANNOTATION,3,,0,0,{},{},{},206.84,206.84
12697,10,england,germany,?,True,True,6,?,1,0,{'.': 1},{},{},206.84,206.84
12958,10,england,austria,http://www.diplomacy-archive.com/resources/str...,True,True,3,,0,0,{},{},{},206.84,206.84


linguistic complexity is a broad field and there are many other factors could be considered, like the use of passive voice, nominalizations, advanced punctuation, lexical density, and so on.

In [None]:
#frequency of words used found in some curated, potentially domain-specific dictionary
# This is challenging, which domain should be explored??
# wordnet from nltk.corpus might be useful if know which domain we are exploring!

In [45]:
#frequency of words of length > 6
#get the number of words longer than 6 in each message
new_df['num_long_words'] = new_df['processed_message'].str.split().apply(lambda x : [1 if len(i) > 6 else 0 for i in x]).map(lambda x: sum(x))
new_df[['processed_message' , 'num_long_words']]

Unnamed: 0,processed_message,num_long_words
0,Germany! Just the person I want to speak with...,15
1,"You've whet my appetite, Italy. What's the sug...",2
2,👍,0
3,It seems like there are a lot of ways that cou...,7
4,"Yeah, I can’t say I’ve tried it and it works, ...",12
...,...,...
13127,Is there any way of me actually ending this co...,2
13128,Can we agree on peace? What are your demands?,1
13129,"Neutrality in exchange for current holdings, S...",6
13130,"Thats a bit too much, can I keep Spain and i h...",1


In [48]:
#sentiment
# textblob is more sensitive to negative sentiments than nltk
new_df['sentiment'] = new_df['processed_message'].apply(lambda x: TextBlob(x).sentiment.polarity)  
new_df[['processed_message' , 'sentiment']]

Unnamed: 0,processed_message,sentiment
0,Germany! Just the person I want to speak with...,-0.035417
1,"You've whet my appetite, Italy. What's the sug...",0.000000
2,👍,0.000000
3,It seems like there are a lot of ways that cou...,0.700000
4,"Yeah, I can’t say I’ve tried it and it works, ...",0.326190
...,...,...
13127,Is there any way of me actually ending this co...,0.000000
13128,Can we agree on peace? What are your demands?,0.000000
13129,"Neutrality in exchange for current holdings, S...",0.000000
13130,"Thats a bit too much, can I keep Spain and i h...",0.100000


---
#### certainty (from self-doubting to absolutely certain)

This one cannot be implemented simply using nlp tools like nltk and needs ML models eaither fine-tunned pre-traiined ones or those traind on alabled dataset.

In [None]:
# find the certinity verbs among the list of all verbs using chatgpt and then find the frequency of those verbs in the data

In [None]:
# get the list of all verbs in the dataset
nlp = spacy.load("en_core_web_sm")
def get_unique_verbs(texts):
    unique_verbs = set()  #  set to store unique verbs
    for text in texts:
        doc = nlp(text)
        # add the lemmatized form of the word to the set if it is a verb.
        unique_verbs.update(token.lemma_ for token in doc if token.pos_ == "VERB")
    return unique_verbs

verbs = list(get_unique_verbs(new_df['processed_message']))

In [63]:
print(verbs)

['pitch', 'run', 'firm', 'shudder', 'chase', 'like', 'listen', 'track', 'manufacture', 'in', 'coax', 'remove', 'recuse', 'gang', 'sup', 'arise', 'show', 'shrink', 'oppose', 'judge', 'ride', 'voice', 'roll', 'piece', 'evidence', 'to@assure', 'talk', 'station', 'grab', 'fake', 'circle', 'prompt', 'reclaim', 'order', 'fly', 'lower', 'devote', 'signal', 'attempt', 'engage', 'chivalry', 'steal', 'wash', 'discuss', 'bounce', 'entice', 'mow', 'cut', 'giggle', 'regroup', 'chill', 'blush', 'wanna', 'check', 'scream', 'lmk', 'spite', 'mark', 'kinda', 'overrun', 'starve', 'dominate', 'ruh', 'court', 'dupe', 'google', 'ukr', 'descend', 'explode', 'hedgehog', 'strongarm', 'vindicate', 'unguarde', 'switch', 'exist', 'entertain', 'tyn', 'hug', 'hinge', 'interfere', 'claim', 'eat', 'consist', 'committed', 'percieve', 'jiggle', "havn't", 'compensate', 'doing', 'fixate', 'thumb', 'treat', 'estimate', 'last', 'instruct', 'park', 'loan', 'refrain', 'is-', 'sit', 'whet', 'tho', 'paw', 'seek', 'believe', 'u

---
#### implied causality

---
#### inclusivity/exclusivity

---
#### frequency of motion verbs

In [59]:
# extract motion verbs from the list of all verbs using Chat-Gpt
motion_verbs = ['pitch', 'run', 'shudder', 'chase', 'track', 'arise', 'shrink', 'ride', 'roll', 'fly', 'bounce', 'mow', 'cut',
                'regroup', 'chill', 'overrun', 'descend', 'explode', 'switch', 'hug', 'eat', 'jiggle', 'dance', 'park', 'sit', 
                'seek', 'return', 'gallivant', 'transit', 'dive', 'pop', 'bump', 'send', 'steal', 'wash', 'walk', 'swing', 
                'float', 'squeeze', 'scroll', 'stick', 'poke', 'catch', 'evacuate', 'melt', 'raise', 'board', 'pour', 'move', 
                'scramble', 'hang', 'trigger', 'fling', 'crash', 'start', 'dry', 'answer', 'stall', 'peel', 'capture', 'run', 
                'box', 'end', 'look', 'try', 'hover', 'jump', 'acquire', 'tie', 'dip', 'recover', 'explore', 'fish', 'hop',
                'take', 'execute', 'rob', 'launch', 'arrive', 'collect', 'harm', 'do', 'hand', 'gather', 'give', 'split', 
                'trip', 'close', 'part', 'snap', 'bury', 'rush', 'greet', 'weigh', 'swing', 'pay', 'enter', 'put', 'separate',
                'unite', 'fix', 'wrap', 'skate', 'tell', 'lead', 'rescue', 'eject', 'perceive', 'rum', 'loosen', 'welcome',
                'break', 'tread', 'speak', 'freak', 'protect', 'impose', 'explain', 'attend', 'prepare', 'tend', 'upgrade', 
                'advance', 'sneak', 'kick', 'consume', 'continue', 'train', 'drag', 'hint', 'barge', 'volunteer', 'swoop',
                'rotate', 'head', 'live', 'match', 'exit', 'ward', 'string', 'tease', 'side', 'offer', 'pull', 'turn', 'shut', 
                'laugh', 'extract', 'click', 'overcome', 'race', 'grill', 'lay', 'declare', 'ski', 'face', 'snag', 'deceive',
                'survive', 'design', 'spin', 'build', 'discard', 'slam', 'reconsider', 'have', 'pin', 'realize', 'foreclose', 
                'emerge', 'provide', 'picture', 'ascertain', 'pass', 'work', 'cascade', 'emerge', 'contact', 'alter', 'receive',
                'dig', 'interpret', 'mop', 'spread', 'need', 'plan', 'smell', 'hit', 'object', 'block', 'pretend', 'empty', 
                'blow', 'bind', 'stab', 'reason', 'watch', 'standby', 'crave', 'verify', 'corner', "approach","arrive","ascend",
                "backpedal","balance","bounce","climb","crawl","creep","dance","descend","dodge","drag","drift","drive","fall",
                "float","fly","gallop","glide","hop","jump","kick","leap","march","move","paddle","pedal","pull","push","run",
                "sail","scoot","skate","slide","spin","split","sprint","stand","stop","stroll","swim","swing","trot","walk",
                "waltz","wiggle","zoom",]

In [60]:
nlp = spacy.load("en_core_web_sm")
def count_motion_verbs(text):
    doc = nlp(text)
    # count motion verbs
    count = sum(1 for token in doc if token.pos_ == "VERB" and token.lemma_ in motion_verbs)
    return count

new_df['motion_verb_count'] = new_df['processed_message'].apply(count_motion_verbs)

In [62]:
new_df[['processed_message', 'motion_verb_count']]

Unnamed: 0,processed_message,motion_verb_count
0,Germany! Just the person I want to speak with...,4
1,"You've whet my appetite, Italy. What's the sug...",0
2,👍,0
3,It seems like there are a lot of ways that cou...,3
4,"Yeah, I can’t say I’ve tried it and it works, ...",11
...,...,...
13127,Is there any way of me actually ending this co...,1
13128,Can we agree on peace? What are your demands?,0
13129,"Neutrality in exchange for current holdings, S...",0
13130,"Thats a bit too much, can I keep Spain and i h...",0


---
#### frequency of past tense verbs

In [66]:
nltk.download('averaged_perceptron_tagger')
def get_past_tense_verbs(text):
    word_tokens = word_tokenize(text)
    tagged = nltk.pos_tag(word_tokens)
    past_tense_verbs = [word for word, pos in tagged if pos == 'VBD']
    return len(past_tense_verbs)

new_df['num_past_tense'] = new_df['processed_message'].apply(get_past_tense_verbs)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\766619\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [67]:
new_df[['processed_message', 'num_past_tense']]

Unnamed: 0,processed_message,num_past_tense
0,Germany! Just the person I want to speak with...,2
1,"You've whet my appetite, Italy. What's the sug...",0
2,👍,0
3,It seems like there are a lot of ways that cou...,0
4,"Yeah, I can’t say I’ve tried it and it works, ...",3
...,...,...
13127,Is there any way of me actually ending this co...,0
13128,Can we agree on peace? What are your demands?,0
13129,"Neutrality in exchange for current holdings, S...",0
13130,"Thats a bit too much, can I keep Spain and i h...",0


---
#### frequency of future tense verbs

In [68]:
# Load the English language model
nlp = spacy.load('en_core_web_sm')

In [73]:
def get_future_tense_verbs(text):
    doc = nlp(text)
    future_tense_verbs = []
    for token in doc:
        # check if the token is not the first or the last in the doc
        if token.i > 0 and token.i < len(doc) - 1:
            # handling the case for "will" and "shall"
            if token.lower_ in ["will", "shall"]:
                next_token = token.nbor()  # The token after "will" or "shall"
                if next_token.pos_ == "VERB":
                    future_tense_verbs.append(next_token.text)
            # handling the case for "is going to"
            elif token.lower_ == "going" and token.nbor(-1).lower_ == "is" and token.nbor(1).lower_ == "to":
                next_token = token.nbor(1)  # the token after "going", expected "to"
                next_next_token = next_token.nbor()  # the token after "to"
                if next_next_token.pos_ == "VERB":
                    future_tense_verbs.append(next_next_token.text)
    # for the case where "will" or "shall" is the last word in a sentence
    for sent in doc.sents:
        last_word = sent[-1]
        if last_word.lower_ in ["will", "shall"]:
            # handle cases or add context-specific actions
            pass  # or add to list if contextually correct in sentences like "Tomorrow, I will."
    return len(future_tense_verbs)
 

new_df['num_future_tense'] = new_df['processed_message'].apply(get_future_tense_verbs)

In [74]:
new_df[['processed_message', 'num_future_tense']]

Unnamed: 0,processed_message,num_future_tense
0,Germany! Just the person I want to speak with...,0
1,"You've whet my appetite, Italy. What's the sug...",0
2,👍,0
3,It seems like there are a lot of ways that cou...,0
4,"Yeah, I can’t say I’ve tried it and it works, ...",0
...,...,...
13127,Is there any way of me actually ending this co...,0
13128,Can we agree on peace? What are your demands?,0
13129,"Neutrality in exchange for current holdings, S...",0
13130,"Thats a bit too much, can I keep Spain and i h...",0


---
#### word/sentence embeddings (from e.g. bert-base encoder).