In [None]:
#!jupyter nbextension enable --py widgetsnbextension

In [None]:
# 09 April 2023
# nrobot
# Run the acronym expansion on the full dataset!

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import os
import pandas as pd

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

import gensim
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_short, remove_stopwords, strip_multiple_whitespaces
from nltk.corpus import stopwords

import gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import Word2Vec

from nltk import PorterStemmer

In [None]:
# Column groups of the scraped forum CSV, one name per line.

# Per-post columns; load_data() narrows the loaded frame to exactly these.
columns_page_info = [
    'time_downloaded',
    'author',
    'posted_date_readable',
    'post_ordinal',
    'thread_page_name',
    'thread_page_num',
    'thread_page_url',
    'post_text',
]

# Thread-level columns.
columns_thread_info = [
    'src_category_name',
    'thread_page_name',
    'thread_page_num',
    'thread_max_pages',
    'thread_page_url',
]

# "Like" columns.
columns_likes = [
    'num_likers',
    'likers',
]
# Quote columns.
columns_quotes = [
    'num_quotes',
    'quoted_post_ids',
    'quoted_authors',
    'quoted_contents',
]
# Author profile columns.
columns_authors = [
    'author',
    'author_title',
    'author_num_posts',
    'author_num_reviews',
    'author_url',
    'join_date_readable',
    'join_date_data',
]

In [None]:
def load_data(infile='list_of_post_contents.csv', nrows=None):
    """Load scraped forum posts from ``<cwd>/nogit_data/<infile>`` and clean them.

    Rows with a missing ``post_text`` and rows whose ``post_text`` repeats an
    earlier row are dropped, the frame is narrowed to ``columns_page_info``
    and a parsed ``posted_date_datetime`` column is added.

    Progress is reported by printing the columns and the shape after each
    filtering step.

    Args:
        infile: CSV file name, resolved relative to ``<cwd>/nogit_data``.
        nrows: Maximum number of rows to read; ``None`` reads the whole file.

    Returns:
        pandas.DataFrame with the ``columns_page_info`` columns plus
        ``posted_date_datetime``.
    """
    # Honour the caller's `infile`; it must not be reset to the default here,
    # otherwise no other file can ever be loaded.
    df = pd.read_csv(Path(os.getcwd(), 'nogit_data', infile), nrows=nrows)
    print(f'{df.columns=}')
    print(f'{df.shape=}')

    df = df.dropna(subset=['post_text'])
    print(f'{df.shape=}')
    df = df.drop_duplicates(subset=['post_text'])
    print(f'{df.shape=}')

    print(f'{columns_page_info=}')
    # Explicit copy so that adding a column below writes to an independent
    # frame instead of a column subset of the filtered data.
    df = df[columns_page_info].copy()
    # Parsed element by element (in parallel via pandarallel) so that each
    # date string is interpreted independently of the others.
    df['posted_date_datetime'] = df.posted_date_readable.parallel_apply(
        lambda x: pd.to_datetime(x))
    return df

def get_discussions_only(df):
    """Keep only posts that come from discussions (as opposed to reviews).

    Args:
        df: DataFrame with a string column ``src_category_name`` and a column
            ``author_num_posts`` holding either integers or strings that may
            use a thousands separator (e.g. ``'1,000'``).

    Returns:
        A new DataFrame (the input is left untouched) with the rows whose
        ``src_category_name`` contains ``'Discussion'`` and with
        ``author_num_posts`` as integers.

    Raises:
        ValueError: if a non-integer ``author_num_posts`` value cannot be
            parsed as an integer once the commas are removed.
    """
    # na=False: rows without a category count as "not a discussion" instead
    # of yielding a NaN mask that cannot be used for boolean indexing.
    is_discussion = df.src_category_name.str.contains('Discussion', na=False)
    # Copy so the assignment below targets an independent frame, not a slice.
    discussions = df[is_discussion].copy()
    # Any integer dtype is already clean; comparing the dtype to the builtin
    # `int` would only recognise the platform default integer width.
    if not pd.api.types.is_integer_dtype(discussions.author_num_posts):
        # reformat 1,000 to 1000
        discussions['author_num_posts'] = (
            discussions.author_num_posts
            .astype(str)
            .str.replace(',', '', regex=False)
            .astype(int)
        )
    return discussions

# preprocess

In [None]:
def nltk_preprocess(df):
    """Clean, tokenise and stem every post without lower-casing it.

    Each ``post_text`` is passed through gensim string filters (strip tags,
    strip punctuation, drop one-letter words, drop stop words), split into
    tokens, and each token is then Porter-stemmed with its case preserved.

    Args:
        df: DataFrame with a string column ``post_text``.

    Returns:
        The same DataFrame object, with a new ``preprocessed_posts`` column
        holding one list of tokens per post (the input is modified in place).
    """
    my_stopwords = stopwords.words('english')
    # The text is never lower-cased in this pipeline, so Title-case variants
    # are added to catch stop words at the start of a sentence as well.
    my_stopwords.extend([s.title() for s in my_stopwords])
    print(f'{my_stopwords=}')
    print(f'{df.columns=}')
    # Set membership keeps the per-token stop-word lookup constant time.
    stopword_set = frozenset(my_stopwords)

    porter = PorterStemmer()

    # String-level filters: each one receives the whole post as one string.
    CUSTOM_FILTERS = [
        strip_tags, strip_punctuation,
        lambda x: strip_short(x, minsize=2),  # remove only 1 letter words
        lambda y: remove_stopwords(y, stopwords=stopword_set),
    ]

    def tokenize_and_stem(post):
        """Run the string filters, then stem token by token."""
        tokens = preprocess_string(post, CUSTOM_FILTERS)
        # The stemmer works on single words, so it has to run after the
        # split; as a string filter it would see the whole post as one word.
        return [porter.stem(token, to_lowercase=False) for token in tokens]

    df['preprocessed_posts'] = df['post_text'].parallel_apply(tokenize_and_stem)
    return df

def create_word2vec(df, overwrite=False, outfile='nogit_data/Case_1/word2vec.bigrams.model'):
    """Detect bigrams in the preprocessed posts and train Word2Vec on them.

    Args:
        df: DataFrame with a ``preprocessed_posts`` column holding one list of
            tokens per post.
        overwrite: When true, save the trained model to ``outfile``.
        outfile: Destination path for the saved model; missing parent folders
            are created.

    Returns:
        Tuple ``(df, model)``: the same DataFrame object with an added
        ``bigrammed_posts`` column (the input is modified in place), and the
        trained ``Word2Vec`` model.
    """
    posts = df.preprocessed_posts.to_list()
    # Learn which adjacent token pairs should be merged into one token.
    my_phrases = gensim.models.Phrases(posts, min_count=1, threshold=10)
    bigram_ifier = Phraser(my_phrases)

    df['bigrammed_posts'] = df['preprocessed_posts'].parallel_apply(
        lambda post: bigram_ifier[post])

    bigrammed_corpus = df.bigrammed_posts.to_list()
    print(f'Creating word vectors for corpus size {len(bigrammed_corpus)=}, '
          f'example post {bigrammed_corpus[0]=}')

    # NOTE(review): gensim documents that a fixed seed only gives a fully
    # reproducible model with a single worker thread; with workers=10 the
    # vectors may differ between runs - confirm whether that matters here.
    model = Word2Vec(bigrammed_corpus, seed=42, workers=10)

    if overwrite:
        # The data folder is normally ignored by git, so it may not exist on
        # a fresh checkout; create it rather than failing on save.
        Path(outfile).parent.mkdir(parents=True, exist_ok=True)
        model.save(outfile)
        # Escaped backslash: same printed text, without the invalid '\-'
        # escape sequence.
        print(f'\\-- {overwrite=}, Saved model to {outfile=}')
    return df, model

In [None]:
# Load the full dataset; pass e.g. nrows=100000 for a quicker partial run.
df = load_data()

In [None]:
# Alternative loads for quick experiments (uncomment one to replace `df`):
#df = load_data(nrows=1000)
#df = load_data(nrows=10000)
#df = load_data(nrows=None)
# Tokenise the posts, then detect bigrams and train the Word2Vec model.
df = nltk_preprocess(df)
df, model = create_word2vec(df)

# query

In [None]:
# Print the 20 nearest neighbours of each query term with their similarity.
for query in ['SO']:  # other candidates: 'MMS', 'HJ', 'HE', 'BJ', 'full_menu'
    neighbours = model.wv.most_similar(query, topn=20)
    formatted = [f'{word} = {score:.2f}' for word, score in neighbours]
    print(f'{query=}\t ', ', '.join(formatted))
    print('-')

def find_abbreviation(query, model):
    """Print the first nearby phrase whose initials spell ``query``.

    The 50 entries most similar to ``query`` are scanned in similarity order.
    Only entries containing ``_`` are considered; the first one whose parts'
    initials, upper-cased, equal ``query.upper()`` is printed with the
    underscores replaced by spaces.

    Args:
        query: Abbreviation to expand, e.g. ``'SO'``.
        model: Object exposing ``model.wv.most_similar(query, topn=...)`` that
            returns ``(word, similarity)`` pairs.

    Returns:
        None. The outcome is reported with ``print``.
    """
    try:
        similar_words = model.wv.most_similar(query, topn=50)  # get other similar words
    except KeyError:
        # The query is not in the vocabulary (easy to hit on a small sample);
        # report it instead of aborting the remaining lookups.
        print(f'{query=}, not in model vocabulary')
        return

    target = query.upper()
    for phrase, _similarity in similar_words:
        if '_' not in phrase:
            continue
        # Skip empty pieces (leading, trailing or doubled underscores) so that
        # taking the first letter cannot fail.
        pieces = [piece for piece in phrase.split('_') if piece]
        candidate = ''.join(piece[0] for piece in pieces).upper()
        if candidate == target:
            print(f'{query=}, {phrase=}, Candidate: {phrase.replace("_", " ")}')
            return
    print(f'{query=}, no candidate found')

# Try to expand each abbreviation of interest, in this order.
for abbreviation in ['TS', 'ST', 'BJ', 'HJ', 'FS', 'SO']:
    find_abbreviation(abbreviation, model)

# test