In [97]:
!pip install unidecode
!pip install emojis
!pip install spacy



In [169]:
import os, json
import pandas as pd
from pathlib import Path
from multiprocessing import Pool
import re
from functools import partial
import emojis
from unidecode import unidecode
from spacy.lang.pl import Polish

path_to_json = "../data/messages/inbox/"
# One entry per conversation folder. Plain files sitting next to the folders
# are skipped because the workers call os.listdir() on every entry and would
# raise on a non-directory; sorting makes the processing order reproducible
# (os.listdir returns entries in arbitrary order).
directories = sorted(
    entry
    for entry in os.listdir(path_to_json)
    if os.path.isdir(os.path.join(path_to_json, entry))
)


In [170]:
# Display name of the account being analysed: do_dir_words compares it with
# each message's "sender_name", and it prefixes the names of the output CSVs.
SENDER_NAME = "Kacper Trębacz"

In [148]:
def get_emo_list(text):
    """Return every emoji yielded by ``emojis.iter(text)`` as a list, in order."""
    return list(emojis.iter(text))


In [149]:
# Repairs JSON in which every UTF-8 byte of a non-ASCII character was written
# as its own "\u00XX" escape: those escapes are turned back into the raw bytes
# so that a later .decode('utf8') reassembles the real characters.
#
# The pattern has two alternatives, tried in this order at each backslash:
#   1. \u00XX with XX >= 0x80  -> replaced by the single byte 0xXX.
#   2. any other escape pair   -> consumed and emitted unchanged.
# Alternative 2 keeps ASCII-range escapes such as \u0022 (quote) or \u005c
# (backslash) escaped, since un-escaping them would corrupt the JSON, and it
# swallows "\\" so that a literal backslash followed by the text "u00e9" is
# never mistaken for an escape.
fix_mojibake_escapes = partial(
    re.compile(rb'\\u00([89a-fA-F][\da-fA-F])|\\.').sub,
    lambda m: bytes.fromhex(m.group(1).decode()) if m.group(1) else m.group(0))

def do_dir_emo(i):
    """Collect every emoji occurrence found in one conversation folder.

    Parameters
    ----------
    i : str
        Name of a sub-directory of ``path_to_json``.

    Returns
    -------
    pandas.DataFrame
        One row per emoji occurrence, with the emoji in ``Emoji`` and the
        message's ``timestamp_ms`` in ``Timestamp``. The frame is empty
        (no columns) when the folder contains no emoji.
    """
    # NOTE(review): unlike do_dir_words, messages are not filtered by
    # SENDER_NAME here, although the result is saved under that name -
    # confirm whether emojis from all participants are intended.
    chat_dir = Path(path_to_json, i)
    entries = []
    for file_name in os.listdir(chat_dir):
        if not file_name.endswith('.json'):
            continue
        # Undo the per-byte \u00XX escaping before decoding the file as UTF-8.
        repaired = fix_mojibake_escapes((chat_dir / file_name).read_bytes())
        data = json.loads(repaired.decode('utf8'))
        for msg in data["messages"]:
            # Messages without a "content" key carry no text to scan.
            if "content" not in msg:
                continue
            entries.extend(
                {"Emoji": emoji, "Timestamp": msg["timestamp_ms"]}
                for emoji in get_emo_list(msg["content"])
            )
    return pd.DataFrame(entries)

In [150]:
# Fan the conversation folders out over 12 worker processes; pool.map returns
# one DataFrame per folder, in the same order as `directories`.
arr = list(directories)
with Pool(processes=12) as pool:
    results = pool.map(do_dir_emo, arr)

In [151]:
# ignore_index=True gives one clean 0..n-1 index. A bare reset_index() would
# instead keep each folder's own row numbers as a meaningless "index" column.
emoji_result = pd.concat(results, ignore_index=True)

In [152]:
# Saved relative to the notebook's working directory; with to_csv's defaults
# the DataFrame index is written as an unnamed first column.
emoji_result.to_csv(f"{SENDER_NAME}_emojis.csv")

In [171]:
# Shared spaCy Polish language object; do_dir_words calls it on each message
# to split the text into tokens.
nlp = Polish()
def do_dir_words(i):
    """Count adjacent word pairs written by ``SENDER_NAME`` in one folder.

    Parameters
    ----------
    i : str
        Name of a sub-directory of ``path_to_json``.

    Returns
    -------
    dict[str, dict[str, int]]
        ``entries[before][after]`` is how many times the word ``after``
        directly followed ``before`` inside a single message. Pairs never
        span two messages. Plain dicts are used so the result can be sent
        back from a ``Pool`` worker.
    """
    chat_dir = Path(path_to_json, i)
    entries = {}
    for file_name in os.listdir(chat_dir):
        if not file_name.endswith('.json'):
            continue
        # Undo the per-byte \u00XX escaping before decoding the file as UTF-8.
        repaired = fix_mojibake_escapes((chat_dir / file_name).read_bytes())
        data = json.loads(repaired.decode('utf8'))
        for msg in data["messages"]:
            if "content" not in msg or msg["sender_name"] != SENDER_NAME:
                continue
            # unidecode() transliterates the text to ASCII before tokenising.
            doc = nlp(unidecode(msg["content"]))
            # Keep purely alphabetic tokens only, lower-cased.
            words = [token.text.lower() for token in doc if token.text.isalpha()]
            for before, after in zip(words, words[1:]):
                followers = entries.setdefault(before, {})
                followers[after] = followers.get(after, 0) + 1
    return entries

In [172]:
# Fan the conversation folders out over 12 worker processes; pool.map returns
# one nested count dict per folder, in the same order as `directories`.
arr = list(directories)
with Pool(processes=12) as pool:
    results = pool.map(do_dir_words, arr)

In [173]:
# Merge the per-folder counts into one table: final[before][after] = count.
final = {}
for folder_counts in results:
    for before, followers in folder_counts.items():
        merged = final.setdefault(before, {})
        for after, count in followers.items():
            merged[after] = merged.get(after, 0) + count

# Turn the counts into relative frequencies: for each `before` word, divide
# every follower count by the total number of followers seen for that word.
for before_dict in final.values():
    before_sum = sum(before_dict.values())
    for after in before_dict:
        before_dict[after] /= before_sum

In [174]:
# Flatten the nested table into one record per (before, after) pair.
entries = [
    {"Before": before, "After": after, "Probability": value}
    for before, before_dict in final.items()
    for after, value in before_dict.items()
]

In [175]:
# Explicit columns keep the schema stable even when no word pairs were found
# (an empty record list would otherwise produce a frame with no columns).
word_order = pd.DataFrame(entries, columns=["Before", "After", "Probability"])
word_order

Unnamed: 0,Before,After,Probability
0,a,przez,0.000684
1,a,zmiescilaby,0.000137
2,a,czemu,0.006295
3,a,masz,0.014096
4,a,nawet,0.002463
...,...,...,...
146622,zaby,idziemy,1.000000
146623,gelar,to,1.000000
146624,potwierdz,zaproszenie,1.000000
146625,wpisalas,dane,1.000000


In [168]:
# Saved relative to the notebook's working directory; with to_csv's defaults
# the DataFrame index is written as an unnamed first column.
word_order.to_csv(f"{SENDER_NAME}WordOrder.csv")