In [7]:
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import collections

sys.path.append("../src/")
sys.path.append("../src/utils")

In [8]:
%load_ext autoreload
%autoreload 1
%aimport data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from general import *

In [10]:
discord_path = '../data/social/discord/'
facebook_paths = ['../data/social/facebook1/',
                  '../data/social/facebook2/']
instagram_path = ['../data/social/instagram/']

In [11]:
def load_all(loader, dir):
    buff = []
    if not os.path.isdir(dir): return buff
    for i in os.listdir(dir):
        buff.append(loader(dir + i))
    return buff
        

chats = []
chats += load_all(data.load_discord, discord_path)

for fb in facebook_paths:
    chats += load_all(data.load_messenger, fb + 'messages/inbox/')
    chats += load_all(data.load_messenger, fb + 'messages/archived_threads/')
    chats += load_all(data.load_messenger, fb + 'messages/e2ee_cutover/')
    chats += load_all(data.load_messenger, fb + 'messages/filtered_threads/')

original_chats = chats

In [12]:
len(original_chats)

171

In [13]:
data.view_excerpt(original_chats[9])

0,1
jasaf3,source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5332116/
jasaf3,https://sciencebasedlife.wordpress.com/2012/03/30/why-do-witches-ride-broomsticks/
jasaf3,[image.png]  ❤️
patztablook22,based
patztablook22,we both know mathematicians are the divine race supreme 👍
jasaf3,yes
jasaf3,this conversation is actually
jasaf3,about some dude putting maths under philosophy
jasaf3,and me disagreeing
jasaf3,[image.png]  ❤️


In [14]:
def info(chats):
    all = [m for chat in chats for m in chat]
    sample = np.random.choice(all, size=1000)
    
    lengths = [len(m.body) for m in all if len(m.body) != 0]
    languages = collections.Counter(filter(lambda l: l is not None, data.get_languages(sample, languages=['en', 'cs', 'sk', 'ru', 'de'], threshold=0)))
    users = collections.Counter(map(lambda m: m.user, all))
    
    print("Total messages  ", len(all))
    print("Length/message  ", f"mean: {np.mean(lengths):.1f}, quantiles 10-50-90: {np.quantile(lengths, 0.10)} {np.quantile(lengths, 0.50)} {np.quantile(lengths, 0.95)}")
    print("With attachment ", f"{len([m for m in all if m.attachments]) / len(all) * 100:.1f}%")
    print("With reaction   ", f"{len([m for m in all if m.reactions]) / len(all) * 100:.1f}%")
    print("Languages       ", ', '.join([f"{l}: {round(f/languages.total()*100)}%" for l,f in languages.most_common()]))
    print("Users           ", ', '.join([f"{u}: {round(f/users.total()*100)}%" for u,f in users.most_common(8)]))

info(original_chats)

Total messages   516948
Length/message   mean: 33.1, quantiles 10-50-90: 3.0 21.0 93.0
With attachment  6.3%
With reaction    14.2%
Languages        en: 65%, cs: 26%, sk: 6%, de: 2%, ru: 0%
Users            Patztablook TwentyTwo: 43%, Scanthning Brot: 14%, Marie Holá: 13%, jasaf3: 9%, patztablook22: 6%, Drew Mac: 3%, Jan Zasadil: 2%, : 2%


In [16]:
agents = [{'Patztablook TwentyTwo', 'patztablook22', 'patrik zavoral', 'patz'},
          {'Scanthning Brot', 'jasaf3'},
          {'Drew Mac'}]

additional = ['Mank', 'Mari', 'Jáš', 'Sběratel']

def rename(chats):
    all = [m for chat in chats for m in chat]
    users = data.get_users(all) + additional

    result = []
    for chat in chats:
        users = data.get_users(chat)
        for aa in agents:
            relevant = len(aa.intersection(users)) > 0
            if not relevant: continue
            mapper = data.make_name_mapper(users, fixed={agent: 'p' for agent in aa})
            result.append([data.change_names(m, mapper) for m in chat])
            if False:
                print('  ', 'users ', users)
                print('  ', 'agents', aa)
                print('=>', 'users ', data.get_users(result[-1]))
                print()
            
    return result

renamed_chats = rename(original_chats)

In [17]:
def censor(message):
    def c(tag, pattern, replacer=None):
        nonlocal message
        message = data.censor(message, pattern, replacer=None, tag=tag)

    def blacklist(pattern):
        nonlocal message
        message = data.censor(message, fr'(?s).*{pattern}.*', replacer=None, tag=pattern)

    c('email', r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
    c('url',   r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b|ftp:\/\/[A-Za-z0-9._%+-]+:[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
    c('url',   r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&#+~]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    c('bank',  r'(\d{1,6}-)?\d{1,10}/\d{4}')
    c('phone', r'\+?\d[\d -]{8,}\d')
    c('address', ['Třešňová', '914'])
    c('birthday', ['23 dec', 'dec 23', 'december 23', '23 december', '23.', '23/'])

    blacklist('satisfied with your genes')
    blacklist('satisfied with ur genes')
    blacklist('se svymi geny')
    blacklist('se svými geny')
    blacklist('popsal by ses')
    blacklist('would you describe urself')
    blacklist('left the group.')
    blacklist('changed the group photo.')
    blacklist('created the group.')
    blacklist('started a call')
    blacklist('the call ended')
    blacklist('joined the call')
    
    return message

censored_chats = llmap(censor, renamed_chats[:])

eta 1052 s
eta 512 s
eta 253 s
eta 126 s
eta 61 s
eta 30 s
eta 14 s


In [18]:
i = range(len(censored_chats))
i = 3
if isinstance(i, int): i = [i]
for j in i:
    data.view_excerpt(censored_chats[j], 
                      # where=lambda m: 'would you describe urself' in m.body
                      where=lambda m: m.issues
                      # where=lambda m: 'group' in m.body
                      )

0,1
lori,Do you see botz channel?
p,https://drive.google.com/file/d/1QHHjEAih13eGxsGOsvI4h637zp1XefNz/view
lori,https://tenor.com/view/epic-sax-guy-dance-thrust-gif-3538569
p,xd
lori,Hodně nice
p,:3
lori,Budu poslouchat místo spotify 😄 😮
lori,Chill af
lori,[unknown.png]
lori,Aspoň mi nehrábne z práce 😄


In [19]:
def censorship_info(chats):
    all = [m for chat in chats for m in chat]
    censorships = []
    for m in all:
        for issue in m.issues:
            if issue.type == 'censor': censorships.append(issue.data['tag'])

    counter = collections.Counter(censorships)
    print("Censorship rate:\n" + f"{round(len(censorships) / len(all), 3):.3f}")
    print()
    print("Censorship tags:\n" + ', '.join([f"{tag}: {round(100*count/counter.total(), 2)}%" for tag, count in counter.most_common(20)]))

censorship_info(censored_chats)

Censorship rate:
0.030

Censorship tags:
url: 96.58%, phone: 1.6%, bank: 0.41%, left the group.: 0.37%, birthday: 0.25%, email: 0.18%, changed the group photo.: 0.14%, started a call: 0.13%, would you describe urself: 0.09%, satisfied with ur genes: 0.09%, created the group.: 0.06%, address: 0.03%, joined the call: 0.02%, the call ended: 0.02%, se svymi geny: 0.01%, popsal by ses: 0.0%


In [21]:
agents = ['p']

def enhance_actions(a):
    a = data.add_control_actions(a, agents=agents, duration_limit=300, pause_limit=90, count_limit=7, idle_rate=1)
    a = data.mask_actions(a, lambda i: i.user not in agents)
    a = data.mask_actions(a, lambda i: i.type == 'attachment')
    return a

actions = lmap(data.chat_to_actions, censored_chats)
actions = lmap(enhance_actions, actions)

eta 55 s
eta 27 s
eta 13 s


In [42]:
def info(actions):
    agents = ['p']
    all = [a for aa in actions for a in aa]
    agent = lfilter(lambda a: a.user in agents, all)
    msg = lfilter(lambda a: a.type == 'message', agent)
    react = lfilter(lambda a: a.type == 'reaction', agent)
    sample = np.random.choice(msg, size=1000)
    
    lengths = [len(m.data['body']) for m in msg if len(m.data['body']) != 0]
    languages = collections.Counter(datasets.get_languages(datasets.actions_to_chat(sample), languages=['en', 'cs', 'sk', 'ru', 'de'], threshold=0))
    agent_types = collections.Counter([a.type for a in agent])
    
    print("Total actions       ", len(all))
    print("Agent actions       ", len(agent))
    print()
    print("Agent action types  ", ', '.join([f"{t}: {round(100 * count/agent_types.total())}%" for t, count in agent_types.most_common(len(agent_types))]))
    print()
    print("Agent messages details:")
    print("  Length/message    ", f"mean: {np.mean(lengths):.1f}, quantiles 10-50-90: {np.quantile(lengths, 0.10)} {np.quantile(lengths, 0.50)} {np.quantile(lengths, 0.95)}")
    print("  Languages         ", ', '.join([f"{l}: {round(f/languages.total()*100)}%" for l,f in languages.most_common()]))
    # print("Users           ", ', '.join([f"{u}: {round(f*100)}%" for u,f in sorted(users.items(), key=lambda i: -i[1])]))
    

info(actions)

Total actions        1064757
Agent actions        651609

Agent action types   message: 55%, idle: 34%, reaction: 6%, attachment: 4%

Agent messages details:
  Length/message     mean: 33.9, quantiles 10-50-90: 3.0 21.0 96.0
  Languages          en: 71%, cs: 16%, None: 7%, sk: 4%, de: 2%, ru: 0%


In [89]:
i = np.random.choice(len(actions))
a = actions[i]
j = np.random.choice(len(a))
a = a[j:j+45]
# datasets.simulate(a, sleep=0.1)

for i in a:
    s, m = datasets.action_to_string(i, return_mask=True)
    datasets.view_masked(s, m)

In [90]:
actions_path = '../data/actions.pkl'
# pickle good enough, only around 50 MB

import pickle
with open(actions_path, 'wb') as f:
    pickle.dump(actions, f)