## scraping part

In [None]:
import json
from datetime import datetime
from BAScraper import Pushpull
from os import path

# Everything the scraper produces is kept under ../results.
cwd = '../results'
data_path = path.join(cwd, 'data.json')

pp = Pushpull(sleepsec=3, threads=4, cwd=cwd)

# Submissions (with their comments) from the `bluearchive` subreddit,
# restricted to the window 2024-01-01 .. 2024-01-02.
data = pp.get_submissions(
    after=datetime(2024, 1, 1),
    before=datetime(2024, 1, 2),
    subreddit='bluearchive',
    get_comments=True,
    duplicate_action='keep_original',
)

# Persist the raw result as pretty-printed JSON so later cells can reload it.
with open(data_path, "w", encoding='utf-8') as outfile:
    json.dump(data, outfile, indent=4)

# EDA

In [None]:
import pandas
from collections import Counter
from os import path
import json
from datetime import datetime

cwd = '../results'
data_path = path.join(cwd, 'data.json')
student_path = path.join(cwd, 'students.txt')

# Comments written by these accounts are dropped from every post.
exclude_author = ['AutoModerator', 'BlueArchive-ModTeam']

with open(data_path, "r", encoding='utf-8') as infile:
    data = json.load(infile)

data_df = pandas.DataFrame.from_dict(data, orient='index')


def _comment_bodies(comments):
    """Return the `body` of every comment whose author is not in `exclude_author`.

    Anything that is not a list (e.g. NaN for a post that has no `comments`
    entry) is treated as "no comments" instead of raising.
    """
    if not isinstance(comments, list):
        return []
    return [c['body'] for c in comments if c['author'] not in exclude_author]


# `.copy()` makes filtered_df an independent frame, so the column assignments
# below do not write into a slice of data_df (SettingWithCopyWarning).
filtered_df = data_df[['title', 'link_flair_text', 'ups', 'created_utc', 'comments', 'num_comments']].copy()

# The epoch seconds are converted as UTC so the result does not depend on the
# timezone of the machine running the notebook.
filtered_df['created_utc'] = pandas.to_datetime(filtered_df['created_utc'], unit='s', utc=True)
filtered_df['comments'] = filtered_df['comments'].apply(_comment_bodies)
# Recount after filtering so the number matches the comments actually kept.
filtered_df['num_comments'] = filtered_df['comments'].apply(len)
filtered_df = filtered_df.loc[filtered_df['link_flair_text'].isin(['OC ART ', 'NON OC ART ', 'Comic/TL'])]

display(filtered_df.nlargest(10, 'num_comments'))
Counter(filtered_df.link_flair_text)

**Note that the title for the post is appended to the start of the comment doc group**

In [None]:
import re
import html

# Subreddit emote markup, e.g. ![img](emote|t5_xxxxx|1234)
sub_emote_pattern = r"!\[img\]\(emote\|t5_[a-z0-9]+\|\d+\)"
# http(s) URLs
url_pattern = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"

patterns = [sub_emote_pattern, url_pattern]

def clean(text):
    """Normalise one block of post/comment text.

    Steps, in order:
      1. remove every match of the regexes in `patterns` (emote markup, URLs),
      2. decode HTML entities,
      3. lowercase,
      4. rewrite the plural "senseis" to "sensei".

    Returns the cleaned string.
    """
    for pattern in patterns:
        text = re.sub(pattern, '', text)

    text = html.unescape(text).lower()

    # The plural form is a problem for the model, so it is folded into the
    # singular. This runs after lowercasing so "Senseis" is covered too, and
    # the result is assigned back (str methods never modify in place).
    return re.sub(r'\bsenseis\b', 'sensei', text)

# One cleaned text per post: the title first, then every kept comment body,
# each on its own line. Keys are the index values of filtered_df.
data_text = {
    post_id: clean('\n'.join([row['title']] + row['comments']))
    for post_id, row in filtered_df.iterrows()
}

# Peek at a single post group.
data_text['18vokb1']

In [None]:
import spacy
from spacy.tokens import Token
from spacymoji import Emoji

print(spacy.require_gpu())

# One name per line in the students file; surrounding whitespace is removed.
student_list = []
with open(student_path, 'r', encoding='utf-8') as f:
    student_list.extend(line.strip() for line in f.readlines())

# alias groups - (main name, alias-1, alias-2, alias-3, ...)
alias_groups = [
    ('aris', 'alice', 'arisu', 'fridge', 'refrigerator'),
    ('hoshino', 'oji-san', 'ojisan', 'oji san'),
    ('yuuka', 'yuka', 'calculator', '100kg', '100 kg'),
    # ('sensei', 'senseis'), # cannot be detected as plural
]

# Every alias (including the main names) is itself searchable.
student_list += [alias for group in alias_groups for alias in group]

print('student_list:', len(student_list))
print(student_list)


def BA_char_finder(tk):
    """Getter for the `ba_characters` token extension.

    Looks up the lowercased token text in `student_list`. Returns None when it
    is not listed; otherwise returns the name, replaced by the first entry of
    any alias group in `alias_groups` that contains it.
    """
    name = tk.text.lower()
    if name not in student_list:
        return None

    for aliases in alias_groups:
        if name in aliases:
            name = aliases[0]
    return name


def get_nbors_(tk: spacy.tokens.token.Token, step: int, N: int):
    """Return the N-th qualifying neighbour of `tk` inside its sentence.

    tk:   token to start from.
    step: offset added on every iteration; usually -1 (scan left) or 1 (scan right).
    N:    which qualifying neighbour to return (1 = the nearest one).

    A neighbour qualifies when it is alphabetic and is neither a stop word nor
    an emoji. Returns None when `tk` already sits on the sentence edge in the
    scan direction, or when a sentence boundary is reached before the N-th
    qualifying neighbour is found.
    """
    # Already at the edge of the sentence in the scan direction: nothing to search.
    if (tk.is_sent_start and step < 0) or (tk.is_sent_end and step > 0):
        return None

    hits = 0
    offset = step
    while True:
        candidate = tk.nbor(offset)

        qualifies = not candidate.is_stop and not candidate._.is_emoji and candidate.is_alpha
        if qualifies:
            hits += 1
            if hits == N:
                return candidate

        # The boundary token itself was still examined above; do not walk past it.
        if candidate.is_sent_end or candidate.is_sent_start:
            return None

        offset += step


def get_nbors(tk):
    """Getter for the `nbors` token extension.

    Returns ((l1, r1), (l2, r2)): the first and the second qualifying
    neighbour on the left and on the right of `tk`, as found by `get_nbors_`.
    Entries are None where no such neighbour exists.
    """
    return tuple(
        (get_nbors_(tk, -1, rank), get_nbors_(tk, 1, rank))
        for rank in (1, 2)
    )

# for splitting sentences
@spacy.language.Language.component("custom_boundary")
def custom_boundary(doc):
    """Pipeline component that forces a sentence start after a delimiter token.

    A delimiter is an ellipsis token ('...') or any token containing a line
    break. The token following a delimiter is marked as a sentence start.
    Returns the same `doc`.
    """
    delimiters = ['...']

    # The last token is skipped because nothing follows it.
    for token in doc[:-1]:
        # The tokenizer merges consecutive whitespace into one token, so a
        # paragraph break arrives as '\n\n' (or similar) rather than '\n';
        # checking for containment catches every line-break token.
        if token.text in delimiters or '\n' in token.text:
            doc[token.i + 1].is_sent_start = True

    return doc


# Load the `en_core_web_lg` pipeline and insert two extra components:
# "emoji" as the very first step (presumably registered by the spacymoji import —
# confirm), and the custom sentence splitter ahead of the parser.
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe("emoji", first=True)
nlp.add_pipe('custom_boundary', before='parser')

# Expose the helper functions as computed token attributes
# (`tk._.nbors`, `tk._.ba_characters`). force=True overwrites an existing
# registration, so this cell can be re-run.
Token.set_extension("nbors", getter=get_nbors, force=True)
Token.set_extension("ba_characters", getter=BA_char_finder, force=True)

# Summary of the assembled pipeline.
analysis = nlp.analyze_pipes(pretty=True)

### planned pipeline

First, use `set_extension` to register the student attribute on tokens (this might later become a custom pipeline component). Students have alternate names and nicknames, so aliases need to be taken into account.

**for getting co-occurrence matrix for students**
1. iter through the tokens
2. if `student` extension returns a student: start the below, else: continue
3. get the neighboring words (maybe a range of 2 on each side, and give the closer one higher score)
4. if the neighbor token is a stopword (`is_stop`) or an emoji (`is_emoji`): skip it
5. expand the search range until the required number of token(2 on each side) is found
6. lowercase (and probably clean up a bit) the token and add to list with required metadata(such as score)
7. score for each descriptive token is determined using TF-IDF

### things to use

**token**
- get/set_extension
- nbor
- similarity

### testing pipeline

In [None]:
# Run the pipeline on the cleaned text of a single post group as a smoke test.
doc = nlp(data_text['18vnj59'])

In [None]:
# Print each token's lemma with its sentence-start / sentence-end flags and its
# position, to eyeball how the text was split into sentences.
for i, tk in enumerate(doc):
    print(f'{tk.lemma_} \t{tk.is_sent_start} / {tk.is_sent_end} - <{i}>')

In [None]:
# Ten most frequent emoji in the test document, keyed by elements 0 and 2 of each `doc._.emoji` entry.
Counter((entry[0], entry[2]) for entry in doc._.emoji).most_common(10)

In [None]:
# How often each character is mentioned in the test document (tokens that resolve to no character are dropped).
Counter(filter(None, (tk._.ba_characters for tk in doc)))

In [None]:
# Neighbour tuples (from the `nbors` extension) of every token that resolves to 'sensei'.
[tk._.nbors for tk in doc if tk._.ba_characters == 'sensei']

## scoring (default embedding)


In [None]:
# Drop the single test document, if there is one, before parsing the whole corpus.
# Only "name not defined" is expected here; any other error should surface.
try:
    del doc
except NameError:
    pass

from tqdm.notebook import tqdm

# Parse every cleaned post group; the order follows `data_text`.
docs = [nlp(text) for text in tqdm(data_text.values())]

In [None]:
# Ten most mentioned characters over the whole corpus.
Counter(filter(None, (tk._.ba_characters for doc in docs for tk in doc))).most_common(10)

In [None]:
target_character = 'sensei'

# `cm` is the author's shorthand for "correlation matrix"; what is stored is
# one neighbour tuple ((l1, r1), (l2, r2)) per mention of the target character.
cm_original = [
    token._.nbors
    for parsed in docs
    for token in parsed
    if token._.ba_characters == target_character
]

# Flatten and drop the empty slots.
cm_original_1 = [nbor for pairs in cm_original for nbor in pairs[0] if nbor]  # bigram range
cm_original_2 = [nbor for pairs in cm_original for nbor in pairs[1] if nbor]  # trigram range

print(Counter(nbor.text for nbor in cm_original_1).most_common(10))
print(Counter(nbor.text for nbor in cm_original_2).most_common(10))

**hard to do vector similarity because `en_core_web_lg` doesn't have vector values for some of the BA_characters**

also, not sure if the vector values will actually represent the character

to note:
>In spaCy's native models, such as en_core_web_lg, the position of a word in a sentence does not affect its vector representation. These models use static word embeddings, where each word is assigned a fixed vector based on the word itself, regardless of its position or the context in which it appears.

so it's okay to de-duplicate all the words with the same `Token.text` value

In [None]:
targets = [tk for doc in docs for tk in doc if tk._.ba_characters == target_character]
print(Counter([t.text for t in targets]))

# Keep one token per distinct surface text: with static vectors, tokens that
# share a text share a vector, so the duplicates add nothing.
targets = list({target.text: target for target in targets}.values())

for t in targets:
    assert t.has_vector, f"{t.text} doesn't have a vector value"

print('`cm_original` no vector:')
for nbor in cm_original_1:
    if not nbor.has_vector:
        print(nbor)

# De-duplicate the neighbours the same way before scoring, so each distinct
# word is compared once per target instead of once per occurrence.
nbor_by_text = {nbor.text: nbor for nbor in cm_original_1 if nbor.has_vector}

# vec_sim[target text][neighbour text] -> similarity score
vec_sim = {
    target.text: {text: nbor.similarity(target) for text, nbor in nbor_by_text.items()}
    for target in tqdm(targets)
    if target.has_vector
}

In [None]:
# Which target spellings were scored, and how many.
print(list(vec_sim))
print(len(vec_sim))

# Neighbours of 'sensei', most similar first.
dict(sorted(vec_sim['sensei'].items(), key=lambda pair: pair[1], reverse=True))

# Using FastText to deal with OOV tokens
The scoring above and text2vec have a limited vocabulary, which limits how well the similarity between words can be determined, so FastText (unsupervised training) will be used to make the word vectors.

1. combine the preprocessed Doc objects into a single text file
2. feed that into the FastText
3. replace the vector values of the previous Docs with FastText's vectors
4. do similarity tests 

## preprocess

In [None]:
def is_cleanable(tk: spacy.tokens.token.Token):
    """Decide whether a token is passed on to the FastText corpus.

    True for a token that is alphabetic, not a stop word and not an emoji, and
    also for any token that ends a sentence; False otherwise.
    """
    # might(?) try and add emoji if that seems plausible
    is_word = tk.is_alpha and not tk.is_stop and not tk._.is_emoji
    return bool(is_word or tk.is_sent_end)

def preprocess_tk(tk: spacy.tokens.token.Token):
    """Turn one token into its piece of the FastText training text.

    Returns:
      - '\\n' for a token that is not a content word (not alphabetic, a stop
        word, or an emoji); such a token contributes only a line break,
      - the lowercased, stripped lemma followed by '\\n' for a content word
        that ends its sentence,
      - the lowercased, stripped lemma followed by ' ' for any other content word.
    """
    # Same content-word test as the corpus filter. Without the stop-word and
    # emoji checks here, a stop word that happens to end a sentence would be
    # written into the corpus even though stop words are otherwise excluded.
    is_content = tk.is_alpha and not tk.is_stop and not tk._.is_emoji
    if not is_content:
        return '\n'

    lemma = tk.lemma_.lower().strip()
    separator = '\n' if tk.is_sent_end else ' '
    return lemma + separator


# preprocessed text for FastText
# Text pieces for every token that passes the filter, in corpus order.
ft_preprocessed = [
    preprocess_tk(token)
    for parsed in docs
    for token in parsed
    if is_cleanable(token)
]
print(len(ft_preprocessed))

# Glue the pieces together, then collapse runs of spaces and strip anything
# that looks like an html tag.
ft_preprocessed = re.sub(
    "<[^>]*>",
    "",
    re.sub(" +", " ", ''.join(ft_preprocessed)),
)

# Number of line breaks in the final text.
print(ft_preprocessed.count('\n'))

ft_path = path.join(cwd, 'ft_preprocessed.txt')
with open(ft_path, 'w', encoding='utf-8') as f:
    f.write(ft_preprocessed)

In [None]:
import fasttext
print(ft_path)
# Train unsupervised FastText word vectors on the corpus file written above
# (no hyper-parameters are passed, so the library defaults apply).
model = fasttext.train_unsupervised(ft_path)

In [None]:
# Vocabulary of the trained model.
model.words

In [None]:
# Words the trained FastText model places closest to 'sensei'.
model.get_nearest_neighbors('sensei')