In [None]:
%matplotlib widget

In [None]:
#!jupyter nbextension enable --py widgetsnbextension

In [None]:
# 09 April 2023
# nrobot
# Run the acronym expansion on the full dataset!

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import os

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

import gensim
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_short, remove_stopwords, strip_multiple_whitespaces
from nltk.corpus import stopwords

import gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import Word2Vec

from nltk import PorterStemmer

In [None]:
# Column-name groups for the scraped post table.
# Post-level fields: load_data() keeps exactly these columns.
columns_page_info = ['time_downloaded', 'author', 'posted_date_readable',  'post_ordinal', 'thread_page_name', 'thread_page_num', 'thread_page_url', 'post_text']

# Thread-level fields (not referenced by the other visible cells).
columns_thread_info = ['src_category_name', 'thread_page_name', 'thread_page_num', 'thread_max_pages', 'thread_page_url']

# Like, quote and author fields (not referenced by the other visible cells).
columns_likes = ['num_likers', 'likers']
columns_quotes = ['num_quotes', 'quoted_post_ids', 'quoted_authors', 'quoted_contents']
columns_authors = ['author', 'author_title', 'author_num_posts', 'author_num_reviews', 'author_url', 'join_date_readable', 'join_date_data']

In [None]:
def load_data(infile='list_of_post_contents.csv', nrows=None):
    """Load the scraped posts CSV and return the de-duplicated post table.

    Parameters
    ----------
    infile : str
        Name of the CSV file inside ``<current working dir>/nogit_data``.
    nrows : int or None
        Forwarded to ``pd.read_csv``; ``None`` reads every row.

    Returns
    -------
    pandas.DataFrame
        The ``columns_page_info`` columns of all rows whose ``post_text`` is
        non-null and unique, plus a ``posted_date_datetime`` column parsed
        from ``posted_date_readable``.
    """
    # Fix: the original re-assigned `infile` to the default here, so the
    # argument passed by the caller was silently ignored.
    df = pd.read_csv(Path(os.getcwd(), 'nogit_data', infile), nrows=nrows)
    print(f'{df.columns=}')
    print(f'{df.shape=}')

    # Re-bind instead of inplace=True so each step yields a new frame.
    df = df.dropna(subset=['post_text'])
    print(f'{df.shape=}')
    df = df.drop_duplicates(subset=['post_text'])
    print(f'{df.shape=}')

    print(f'{columns_page_info=}')
    # .copy() so the new column below is written to an independent frame
    # rather than to a slice of the frame read from disk.
    df = df[columns_page_info].copy()
    df['posted_date_datetime'] = df.posted_date_readable.parallel_apply(
        lambda x: pd.to_datetime(x))
    return df

def get_discussions_only(df):
    """Return the rows that come from discussion threads, with integer post counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``src_category_name`` and ``author_num_posts``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the rows whose
        ``src_category_name`` contains ``'Discussion'``; ``author_num_posts``
        is converted to int, with thousands separators removed.
    """
    # remove posts that come from reviews (vs. discussions);
    # na=False treats rows with a missing category as non-discussion
    is_discussion = df.src_category_name.str.contains('Discussion', na=False)
    # copy so the assignment below never writes through a slice of `df`
    discussions = df[is_discussion].copy()

    counts = discussions.author_num_posts
    if not pd.api.types.is_integer_dtype(counts):
        if not pd.api.types.is_numeric_dtype(counts):
            # reformat 1,000 to 1000 (astype(str) tolerates mixed str/int values)
            counts = counts.astype(str).str.replace(',', '', regex=False)
        discussions['author_num_posts'] = counts.astype(int)
    return discussions

# preprocess

In [None]:
def nltk_preprocess(df):
    """Tokenise, de-stopword and stem every post into ``df['preprocessed_posts']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``post_text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with a new ``preprocessed_posts`` column that
        holds one list of tokens per post.
    """
    # NLTK English stopwords plus their Title-case forms; the text is never
    # lower-cased, so "The" has to be listed alongside "the".
    base_stopwords = stopwords.words('english')
    # frozenset: remove_stopwords tests membership once per token
    my_stopwords = frozenset(base_stopwords) | {s.title() for s in base_stopwords}
    print(f'{len(my_stopwords)=}')
    print(f'{df.columns=}')
    porter = PorterStemmer()

    def stem_tokens(text):
        # Fix: preprocess_string passes the whole string to every filter, but
        # PorterStemmer.stem works on a single word, so the original stemmed
        # only the end of the post. Stem word by word instead.
        return ' '.join(porter.stem(token, to_lowercase=False) for token in text.split())

    CUSTOM_FILTERS = [
        strip_tags, strip_punctuation,
        lambda x: strip_short(x, minsize=2),  # remove only 1 letter words
        lambda y: remove_stopwords(y, stopwords=my_stopwords),
        stem_tokens,
    ]

    df['preprocessed_posts'] = df['post_text'].parallel_apply(
        lambda x: preprocess_string(x, CUSTOM_FILTERS))
    return df

    #stop_nltk.extend([s.title() for s in stop_nltk])

In [None]:
# Sanity check: run the preprocessing filters on inflected forms of "marry".
my_stopwords = stopwords.words('english')
my_stopwords.extend([s.title() for s in my_stopwords])
porter = PorterStemmer()
#snowball = SnowballStemmer()

CUSTOM_FILTERS = [
    strip_tags, strip_punctuation,
    lambda x: strip_short(x, minsize=2),  # remove only 1 letter words
    lambda y: remove_stopwords(y, stopwords=my_stopwords),
    # Fix: each filter receives the whole string, while PorterStemmer.stem
    # works on one word; stem word by word so every form is reduced,
    # not only the last word of the string.
    lambda z: ' '.join(porter.stem(w, to_lowercase=False) for w in z.split()),
]
preprocess_string('marry married marriage Marrying Marriage Married Marry', CUSTOM_FILTERS)

In [None]:
def create_bigram_corpus(df, min_count=2, threshold=10): # TODO: consider taking in a phraser model directly, instead of params to pass to one
    """Detect bigram phrases and return the posts with those bigrams merged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``preprocessed_posts`` (one token list per row). A
        ``bigrammed_posts`` column is added to it in place.
    min_count : int
        Forwarded to ``gensim.models.Phrases``.
    threshold : float
        Forwarded to ``gensim.models.Phrases``.

    Returns
    -------
    list
        The ``bigrammed_posts`` column as a Python list, one entry per post.
    """
    posts = df.preprocessed_posts.to_list()
    # Fix: min_count was hard-coded to 2, so the argument had no effect.
    my_phrases = gensim.models.Phrases(posts, min_count=min_count, threshold=threshold)
    bigram_ifier = Phraser(my_phrases)

    df['bigrammed_posts'] = df['preprocessed_posts'].parallel_apply(
        lambda post: bigram_ifier[post])

    bigrammed_corpus = df.bigrammed_posts.to_list()
    # Message corrected: this step builds the bigrammed corpus, not word vectors.
    print(f'Created bigrammed corpus, size {len(bigrammed_corpus)=}, '
          f'example post {bigrammed_corpus[0]=}')
    return bigrammed_corpus
    
def create_word2vec(corpus, overwrite=False, outfile='nogit_data/Case_1/word2vec.bigrams.model', seed=None):
    """Train a Word2Vec model on ``corpus`` and optionally save it to disk.

    Parameters
    ----------
    corpus : iterable of token lists
        Passed straight to ``Word2Vec``.
    overwrite : bool
        When true, the trained model is saved to ``outfile``.
    outfile : str or Path
        Destination of the saved model. When ``seed`` is given, the file name
        gets a ``_seed-<seed>`` suffix before the extension.
    seed : int or None
        Forwarded unchanged to ``Word2Vec``.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    # NOTE(review): gensim documents that a fully reproducible run also needs
    # workers=1 (and a fixed PYTHONHASHSEED); with workers=10 a fixed seed
    # alone may not give identical models. Confirm whether that matters here.
    model = Word2Vec(corpus, seed=seed, workers=10)

    if overwrite:
        outfile = Path(outfile)
        # `is not None` so that seed=0 is also recorded in the file name
        if seed is not None:
            # add in seed to filename
            outfile = outfile.with_name(f'{outfile.stem}_seed-{seed}{outfile.suffix}')
        # nogit_data is normally ignored by git, so the folder may not exist yet
        outfile.parent.mkdir(parents=True, exist_ok=True)
        # '\\--' prints the same text as before without the invalid '\-' escape
        print(f'\\-- {overwrite=}, saved model to {outfile=}')
        model.save(str(outfile))
    return model

In [None]:
%%time 
# Load the full CSV; pass nrows=... to load_data for a quicker partial run.
df = load_data()#nrows=100000)

In [None]:
%%time
# Display the loaded frame.
df
# Smaller loads for quick experiments:
#df = load_data(nrows=1000)
#df = load_data(nrows=10000)
#df = load_data(None)


In [None]:
%%time 
# Adds the 'preprocessed_posts' column (one token list per post) to df.
df = nltk_preprocess(df)

In [None]:
!mkdir nogit_data/Case_1

In [None]:
%%time 
# Cache the preprocessed frame; the file name records the pandas version that wrote the pickle.
df.to_pickle(f'nogit_data/Case_1/df.pd_{pd.__version__}.pkl')

In [None]:
%%time 
# Adds 'bigrammed_posts' to df and keeps the same posts as a plain list.
bigrammed_corpus = create_bigram_corpus(df)

In [None]:
# List the columns currently present on df.
df.columns

In [None]:
%%time
# Train one Word2Vec model per seed on the preprocessed (non-bigram) posts;
# nothing is saved because overwrite is False.
seeds = [1, 42, 100, 12345, 888]
models = []

for seed in seeds:
    print('-' * 40)
    print(f'{seed=}\n')

    model = create_word2vec(df.preprocessed_posts, seed=seed, overwrite=False)
    models.append(model)

In [None]:
# Select the ipympl backend for matplotlib, both through matplotlib.use
# and through the IPython magic.
import matplotlib
matplotlib.use('module://ipympl.backend_nbagg')
%matplotlib ipympl

In [None]:
# for non bigram
# Nearest neighbours of a few probe words, one table per trained model.
words = ['worries', 'worrying', 'guilty', 'family', 'Wife', 'SO', 'partner', 'married', 'LEO', 'law', 'police', 'trafficking', 'arrest']

# Iterate over the models themselves instead of a hard-coded range(5), so the
# cell keeps working when the number of seeds changes.
for model in models:
    # Fresh table per model, so no column can be left over from a previous model.
    results = {}
    for query in words:
        if query not in model.wv:
            # most_similar raises KeyError for out-of-vocabulary words
            print(f'{query!r} is not in the vocabulary, skipping')
            continue
        sims = model.wv.most_similar(query, topn=10)  # get other similar words
        results[query] = [f'{word} = {score:.2f}' for word, score in sims]

    display(pd.DataFrame(results))

In [None]:
# Print the 30 nearest neighbours of a few probe words, using the fourth trained model.
model = models[3]
for query in ['paper', 'laptop', 'plant']:  #, 'MMS', 'HJ', 'HE', 'BJ', 'full_menu']:
    neighbours = model.wv.most_similar(query, topn=30)  # (word, similarity) pairs
    formatted = ', '.join(f'{word} = {score:.2f}' for word, score in neighbours)
    print(f'{query=}\t ', formatted)
    print('-')

In [None]:
----> model seed 42 --->
         word 1    word 2
word 1   top    results   
word 2   second similar

so... that's ....
for each model... have the word
{model: word1: []
 
or .... 
 
 result 1 - which word - which model
 

In [None]:
i
...

In [None]:
from umap import UMAP


In [None]:
import matplotlib

In [None]:
int(

In [None]:
# Build one cluster per key word (the key plus its nearest neighbours) and
# project all of their vectors to 2-D with UMAP.
# rng is only referenced by the commented-out label offset in the plot helper below.
rng = np.random.default_rng(12345)

embedding_clusters = []
word_clusters = []
keys= ['wife', 'worry', 'SO', 'table', 'parking', 'covid', 'LEO', 'MMS', 'herpes']
# NOTE(review): the original comment here said "NUMBER IS 10", but keys and
# colors each hold 9 entries. The plot helper zips them together, so they
# must stay the same length or the extra entries are dropped.
colors = ['red', 'blue', 'gold', 'purple', 'pink', 'gray', 'teal', 'orange', 'hotpink']

SEED = 88   # random_state for UMAP
WORDS = 8   # neighbours fetched per key

# Uses whichever `model` is currently bound (models[3] when the cells are run in order).
for word in keys:
    print(f"Key = {word}")
    embeddings = [model.wv[word]]
    words = [word]
    for similar_word, _ in model.wv.most_similar(word, topn=WORDS):
        words.append(similar_word)
        embeddings.append(model.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

# n = number of keys, m = WORDS + 1 (key plus neighbours), k = vector length
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
print(n, m, k)
#tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32, n_jobs=-1)
umap_2d = UMAP(n_components=2, init='spectral', random_state=SEED)
# Note on n_neighbors (not set here, so UMAP's default applies): low values of
# n_neighbors force UMAP to concentrate on very local structure (potentially to
# the detriment of the big picture), while large values push UMAP to look at
# larger neighborhoods of each point.
# NOTE(review): the original note ended with "default 10"; umap-learn documents a default of 15. Confirm.

#umap_3d = UMAP(n_components=3, init='spectral', random_state=0)
# Flatten to (n * m, k) for UMAP, then restore the per-key grouping as (n, m, 2).
embeddings_en_2d = np.array(umap_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)
#embeddings_en_3d = np.array(umap_3d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 3)

def legend_without_duplicate_labels(ax):
    """Draw the legend of ``ax`` with each label listed once.

    For every label, the first handle that carries it is kept; the order of
    first appearance is preserved.
    """
    handles, labels = ax.get_legend_handles_labels()
    first_handle = {}
    for handle, label in zip(handles, labels):
        # setdefault keeps the earliest handle seen for a label
        first_handle.setdefault(label, handle)
    pairs = [(handle, label) for label, handle in first_handle.items()]
    ax.legend(*zip(*pairs))

# hack - why are embeddings a bit different ... 
def umap_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D word embeddings, one colour per cluster, and show the figure.

    Parameters
    ----------
    title : str
        Figure title.
    labels : sequence of str
        One legend label per cluster.
    embedding_clusters : array-like
        Indexed as ``[cluster][word, component]``; components 0 and 1 are plotted.
    word_clusters : sequence of sequence of str
        The words of each cluster, aligned with ``embedding_clusters``.
    a : float
        Alpha (opacity) of the scatter points.
    filename : str or None
        When given, the figure is saved there before it is shown.

    Notes
    -----
    Colours come from the module-level ``colors`` list, paired positionally
    with ``labels``. A word that appears in several clusters is drawn only
    once, in the first cluster that contains it.
    """
    figsize = (9.5, 6) if (matplotlib.get_backend() == 'nbAgg') else (12, 12)  # interactive plot should be smaller
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)

    plotted_words = set()  # set: O(1) "already drawn?" checks
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        for i, word in enumerate(words):
            if word in plotted_words:
                continue
            x = embeddings[i, 0]
            y = embeddings[i, 1]
            ax.scatter(x, y, c=[color], alpha=a, label=label, edgecolors='gray')
            if word == 'worry':
                # debugging aid kept from the original: report where 'worry' lands
                print(f'{x=}, {y=}')
            ax.annotate(word, alpha=.8, xy=(x, y), xytext=(0, 0),
                        textcoords='offset points', ha='right', va='bottom', size=10)
            plotted_words.add(word)

    legend_without_duplicate_labels(ax)
    ax.set_title(title)
    # Fix: the labels were swapped; x shows component 0 and y shows component 1.
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    # Fix: `filename` used to be ignored. Save before show(), while the
    # figure is guaranteed to still hold the drawing.
    if filename is not None:
        fig.savefig(filename)
    plt.show()

# Plot the key-word clusters computed above with 0.8 point opacity.
umap_plot_similar_words('UMAP Similar words', keys, embeddings_en_2d, word_clusters, 0.8,
                        'similar_words.png')
# NOTE(review): the helper has already called plt.show() by this point; depending
# on the backend the current figure may be empty when saved here. Verify the PNG.
plt.savefig('umap_similar_words.png')

from itertools import permutations

In [None]:
# feeling -> {noun: similarity}; filled in by get_affinities()
all_feelings = {}
def get_affinities(feeling, nouns):
    """Compute the similarity between ``feeling`` and each noun with the current ``model``.

    Parameters
    ----------
    feeling : str
        Word compared against every noun.
    nouns : iterable of str
        Words to compare with ``feeling``.

    Returns
    -------
    dict
        ``noun -> similarity`` rounded to two decimals, stored as floats. The
        same dict is also recorded in the module-level ``all_feelings`` under
        ``feeling``.
    """
    scores = {}
    print()
    for noun in nouns:
        pair = [feeling, noun]
        # compute the similarity once and reuse it for printing and storing
        similarity = float(model.wv.similarity(*pair))
        print('---', ', '.join(pair))
        print('\t\t', f'{similarity:.2f}')
        # Fix: keep a number rather than a formatted string, so that sorting
        # and quantitative chart encodings treat the scores numerically
        # (as strings, '-0.05' would sort before '0.10' by character).
        scores[noun] = round(similarity, 2)
    all_feelings[feeling] = scores
    return scores

# Score three feeling words against the same nouns; each call also records
# its scores in all_feelings.
nouns = ['SO', 'wife', 'marriage','friends', 'family', 'LEO', 'police', 'COVID', 'herpes', 'C19', 'arrested', 'kids']
get_affinities('worry', nouns)
get_affinities('afraid', nouns)
scores= get_affinities('anxious', nouns)
# One column per feeling, one row per noun.
my_data = pd.DataFrame(all_feelings)

In [None]:
import altair as alt

In [None]:
# Move the noun labels out of the index into an 'index' column.
my_data.reset_index()

In [None]:
# Same table ordered by the 'worry' column, descending.
my_data.reset_index().sort_values(by='worry', ascending=False)

In [None]:
# Bar chart of the 'worry' scores with a fixed colour per noun.
_domain = nouns
# The 9 plot colours plus three more, giving one colour for each of the 12 nouns.
_range = colors + ['gray', 'black', 'hotpink']
print(list(zip(_range, _domain)))
alt.Chart(my_data.reset_index()).mark_bar().encode(
    # '-y' sorts the bars by descending y value
    x=alt.X('index:N').sort('-y'),
    y=alt.Y('worry:Q'),
        color=alt.Color('index').scale(domain=_domain, range=_range)

).properties(height=alt.Step(20))

In [None]:
# Bar chart of the 'afraid' scores, bars sorted by ascending y, default colours.
alt.Chart(my_data.reset_index()).mark_bar().encode(
    x=alt.X('index:N').sort('y'),
    y=alt.Y('afraid:Q')
).properties(height=alt.Step(20))

In [None]:
# Bar chart of the 'anxious' scores, bars sorted by ascending y, default colours.
alt.Chart(my_data.reset_index()).mark_bar().encode(
    x=alt.X('index:N').sort('y'),
    y=alt.Y('anxious:Q')
).properties(height=alt.Step(20))

In [None]:
!pip install wordcloud

In [None]:
import wordcloud

In [None]:
# Join each token list back into one space-separated string per post.
df['preprocessed_str'] = df.preprocessed_posts.parallel_apply(lambda x: ' '.join(x))

In [None]:
# Preview the joined strings.
df.preprocessed_str

In [None]:
from collections import Counter
from wordcloud import WordCloud
# Fix: Counter(df.preprocessed_str) counted whole posts, so every "word" in
# the cloud was an entire post. Count the individual tokens instead.
word_cloud_dict = Counter(
    token for post in df.preprocessed_posts for token in post)
# NOTE: this variable shadows the `wordcloud` module imported in an earlier
# cell; the name is kept because the plotting cell below reads it.
wordcloud = WordCloud(width=800, height=800,
                      background_color='white').generate_from_frequencies(word_cloud_dict)


In [None]:
# Render the word cloud image in a new figure, without axes.
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")

In [None]:
# Show the installed altair version.
alt.__version__
#!pip install altair==5.0.0rc1

In [None]:
# NOTE(review): pair_list is not defined in any visible cell, so this loop
# raises NameError on a fresh kernel. Presumably it was meant to be built with
# the itertools.permutations import above; confirm and define it first.
for pair in pair_list:
    print(pair)