In [17]:
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

sys.path.append("../src/utils")

In [18]:
%load_ext autoreload
%autoreload 1
%aimport datasets

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
from general import *

In [20]:
discord_path = '../data/social/discord/'
facebook_paths = ['../data/social/facebook1/',
                  '../data/social/facebook2/']
instagram_path = ['../data/social/instagram/']

In [21]:
def load_all(loader, dir):
    buff = []
    if not os.path.isdir(dir): return buff
    for i in os.listdir(dir):
        buff.append(loader(dir + i))
    return buff
        

chats = []
chats += load_all(datasets.load_discord, discord_path)

for fb in facebook_paths:
    chats += load_all(datasets.load_messenger, fb + 'messages/inbox/')
    chats += load_all(datasets.load_messenger, fb + 'messages/archived_threads/')
    chats += load_all(datasets.load_messenger, fb + 'messages/e2ee_cutover/')
    chats += load_all(datasets.load_messenger, fb + 'messages/filtered_threads/')

original_chats = chats

In [22]:
len(original_chats)

171

In [23]:
datasets.view_excerpt(original_chats[9])

0,1
jasaf3,But Bachelard works with this a bit better
jasaf3,also nope ❤️
patztablook22,elaborate \;)
jasaf3,[image.png]
jasaf3,not that deep
jasaf3,Essentially
jasaf3,That's wrong
jasaf3,because
patztablook22,do me a favour and write the line of reasoning that connects that picture depicting which people in average happen to work in which dsciplines with this
patztablook22,"also fuck, it's 2 AM (BST)"


In [24]:
def info(chats):
    all = [m for chat in chats for m in chat]
    sample = np.random.choice(all, size=1000)
    
    lengths = [len(m.body) for m in all if len(m.body) != 0]
    languages = datasets.get_languages(sample, languages=['en', 'cs', 'sk', 'ru', 'de'], threshold=0)
    users = datasets.get_users(sample, threshold=0.03)
    
    print("Total messages  ", len(all))
    print("Length/message  ", f"mean: {np.mean(lengths):.1f}, quantiles 10-50-90: {np.quantile(lengths, 0.10)} {np.quantile(lengths, 0.50)} {np.quantile(lengths, 0.95)}")
    print("With attachment ", f"{len([m for m in all if m.attachments]) / len(all) * 100:.1f}%")
    print("With reaction   ", f"{len([m for m in all if m.reactions]) / len(all) * 100:.1f}%")
    print("Languages       ", ', '.join([f"{l}: {round(f*100)}%" for l,f in sorted(languages.items(), key=lambda i: -i[1])]))
    print("Users           ", ', '.join([f"{u}: {round(f*100)}%" for u,f in sorted(users.items(), key=lambda i: -i[1])]))

info(original_chats)

Total messages   516948
Length/message   mean: 33.1, quantiles 10-50-90: 3.0 21.0 93.0
With attachment  6.3%
With reaction    14.2%
Languages        en: 66%, cs: 26%, sk: 5%, de: 4%, ru: 0%
Users            Patztablook TwentyTwo: 44%, Marie Holá: 14%, Scanthning Brot: 13%, jasaf3: 9%, patztablook22: 7%


In [25]:
fixed = {'Patztablook TwentyTwo': 'p',
         'patztablook22': 'p',
         'patrik zavoral': 'p',
         'patz': 'p'}

additional = ['Mank', 'Mari', 'Jáš']

def rename(chats):
    all = [m for chat in chats for m in chat]
    users = datasets.get_users_list(all) + additional
    mapper = datasets.make_name_mapper(users, fixed=fixed)
    # print(mapper)
    
    def renamer(chat):
        return [datasets.change_names(m, mapper) for m in chat]
    return lmap(renamer, chats)

renamed_chats = rename(original_chats)

eta 210 s
eta 104 s
eta 51 s
eta 26 s
eta 13 s


In [26]:
def censor(message):
    def c(tag, pattern, replacer=None):
        nonlocal message
        message = datasets.censor(message, pattern, replacer=None, tag=tag)

    c('email', r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
    c('url',   r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b|ftp:\/\/[A-Za-z0-9._%+-]+:[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
    c('url',   r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&#+~]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    c('bank',  r'(\d{1,6}-)?\d{1,10}/\d{4}')
    c('phone', r'\+?\d[\d -]{8,}\d')
    c('address', ['Třešňová', '914'])
    c('birthday', ['23 dec', 'dec 23', 'december 23', '23 december', '23.', '23/'])
    
    return message

censored_chats = llmap(censor, renamed_chats[:])

eta 80 s
eta 40 s
eta 20 s


In [27]:
i = 1
datasets.view_excerpt(censored_chats[i], where=lambda m: m.issues)

0,1
patrick,koncene poradne prochazim chaoticky systemy a uz ten pojem pobiram 😄
patrick,krasne to souvisi s https://www.youtube.com/watch?v=lveOu7jLNh0
patrick,ktere zmensuji dimensionality
p,"nice, to neznam"
p,tohle mi zni iffy
p,imo obecne psychedelika zpusobuji abnormalni domnenky
p,"casto ""uzaviraji kruhy"" a jine ""oteviraji"""
p,rozdily mezi konkretnimj psychedeliky rozhodne jsou tho
p,btw uz 4-5 mesicu jsem sober
p,a porad mam cas od casu flashbacks na tripy


In [28]:
chats = censored_chats
i = np.random.choice(len(chats))
print(i)
datasets.view_excerpt(chats[i], n=20)

0


0,1
stepsanderse,[Duino_54.m4a]
p,[mari.mp3]
p,[image.png]
p,[image.png]
p,[image.png]
p,[image.png]
p,Chill Beats to Eat Spaghetti Aglio Olio to
stepsanderse,Zapomněl sis tu řasy 😦
p,Neee
p,Přineseš do Prahy?


In [29]:
def enhance_actions(a):
    a = datasets.add_control_actions(a, users=['p'], duration_limit=300, pause_limit=90, count_limit=7)
    a = datasets.mask_actions(a, lambda i: i.user in ['p'])
    return a

actions = lmap(datasets.chat_to_actions, censored_chats)
actions = lmap(enhance_actions, actions)

eta 21 s
eta 10 s


In [30]:
def info(actions):
    all = [a for aa in actions for a in aa]
    masked = lfilter(lambda a: a.data['mask_action'], all)
    msg = lfilter(lambda a: a.type == 'message', masked)
    react = lfilter(lambda a: a.type == 'reaction', masked)
    sample = np.random.choice(msg, size=1000)
    print(set([a.type for a in all]))
    
    lengths = [len(m.data['body']) for m in msg if len(m.data['body']) != 0]
    languages = datasets.get_languages(datasets.actions_to_chat(sample), languages=['en', 'cs', 'sk', 'ru', 'de'], threshold=0)
    # users = datasets.get_users(sample, threshold=0.03)
    
    print("Total actions     ", len(all))
    print("Masked actions    ", len(masked))
    print("Masked messages   ", len(msg))
    print()
    print("Masked messages:")
    print("  Length/message  ", f"mean: {np.mean(lengths):.1f}, quantiles 10-50-90: {np.quantile(lengths, 0.10)} {np.quantile(lengths, 0.50)} {np.quantile(lengths, 0.95)}")
    print("  Languages       ", ', '.join([f"{l}: {round(f*100)}%" for l,f in sorted(languages.items(), key=lambda i: -i[1])]))
    print("  Reactions       ", f"{len(react) / len(msg) * 100:.0f}%")
    # print("Users           ", ', '.join([f"{u}: {round(f*100)}%" for u,f in sorted(users.items(), key=lambda i: -i[1])]))
    

info(actions)

{'message', 'idle', 'reaction'}
Total actions      728634
Masked actions     422696
Masked messages    253254

Masked messages:
  Length/message   mean: 27.7, quantiles 10-50-90: 3.0 19.0 80.0
  Languages        en: 63%, cs: 26%, sk: 6%, de: 4%, ru: 0%
  Reactions        13%


In [31]:
i = np.random.choice(len(actions))
a = actions[i]
j = np.random.choice(len(a))
a = a[j:j+5]
# datasets.simulate(a, sleep=0.1)

for i in a:
    print()
    s, m = datasets.action_to_string(i, return_mask=True)
    datasets.view_masked(s, m)
















In [32]:
actions_path = '../data/actions.pkl'
# pickle good enough, only around 50 MB

import pickle
with open(actions_path, 'wb') as f:
    pickle.dump(actions, f)