In [None]:
# -*- coding: utf-8 -*-
# %%
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %% [markdown]
# ### Install packages and import
# %%
# #################################### PLEASE INSTALL LATEST CHROME WEBDRIVER #####################################
# Uncomment to run as required
# #     --install-option="--chromedriver-version= *.**" \
#   --install-option="--chromedriver-checksums=4fecc99b066cb1a346035bf022607104,058cd8b7b4b9688507701b5e648fd821"
# %%
# ##### COPY THE LINES IN THIS COMMENT TO THE TOP OF NEW SCRIPTS #####
# # Function to import this package to other files
# import os
# import sys
# from pathlib import Path

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# main_dir = str(Path(code_dir).parents[0])
# scraped_data = f'{code_dir}/scraped_data'
# sys.path.append(code_dir)

# from setup_module.imports import *
# from setup_module.params import *
# from setup_module.scraping import *
# from setup_module.classification import *
# from setup_module.vectorizers_classifiers import *

# warnings.filterwarnings("ignore", category=DeprecationWarning)
# %matplotlib notebook
# %matplotlib inline

# %%
import os
import sys
from pathlib import Path

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through (at most) the five nearest ancestors of the working directory
# for a folder whose name contains `code_dir_name` but not `unwanted_subdir_name`.
# Slicing the ancestor list avoids an IndexError when the working directory is
# fewer than five levels deep, and `.name` works with any path separator.
for parent in list(Path.cwd().parents)[:5]:
    if (code_dir_name in parent.name) and (unwanted_subdir_name not in parent.name):
        code_dir = str(parent)
        break

# Fail early with a readable message instead of the TypeError that
# `Path(None)` would raise below.
if code_dir is None:
    raise FileNotFoundError(
        f'No parent directory of {Path.cwd()} contains "{code_dir_name}" '
        f'(excluding names containing "{unwanted_subdir_name}").'
    )

main_dir = str(Path(code_dir).parents[0])
scraped_data = f'{code_dir}/scraped_data'
# Make the project's `setup_module` package importable.
sys.path.append(code_dir)

from setup_module.imports import *
from setup_module.scraping import *
from setup_module.post_collection_processing import *
from setup_module.params import *
from setup_module.classification import *
# from setup_module.vectorizers_classifiers import *

warnings.filterwarnings("ignore")

%matplotlib notebook
%matplotlib widget
%matplotlib inline

In [None]:
# code_dir
# Normalise to exactly one trailing slash so that re-running this cell does
# not keep appending '/' to the path (the cell is now idempotent).
code_dir = f"{str(code_dir).rstrip('/')}/"
# Only register the path once, even if the cell is executed several times.
if code_dir not in sys.path:
    sys.path.append(code_dir)

# MAIN DIR (parent of the code directory, with trailing slash)
main_dir = f'{str(Path(code_dir).parents[0])}/'

# scraping dir
scraped_data = f'{code_dir}scraped_data/'

# data dir
data_dir = f'{code_dir}data/'

# lang models dir
llm_path = f'{data_dir}Language Models'

# sites
site_list=['Indeed', 'Glassdoor', 'LinkedIn']

# Values treated as "missing" elsewhere in the project.
nan_list = [None, 'None', '', ' ', [], -1, '-1', 0, '0', 'nan', np.nan, 'Nan']
# Split pattern: newline runs, repeated commas/pipes, lowercase-period followed
# by a capital letter, and a position before colon(s) followed by a capital.
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [None]:
def build_train_word2vec(
    df, ngram_number, embedding_library, size = 300,
    words = None,
    t = None, cores = None,
):
    """Train a skip-gram Word2Vec model on one tokenised column of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain the column
        ``'Job Description {embedding_library}_{ngram_number}grams_original_list'``
        (presumably one list of tokens per row -- confirm against the data).
    ngram_number, embedding_library
        Used only to build the column name above.
    size : int
        Dimensionality of the word vectors.
    words : list of str, optional
        Probe words whose nearest/farthest neighbours are printed after
        training. Defaults to a fixed list of twelve words.
    t : float, optional
        Start timestamp used for the elapsed-time messages. Defaults to the
        time of the call.
    cores : int, optional
        Number of CPU cores; ``cores - 1`` (at least 1) workers are used.
        Defaults to ``multiprocessing.cpu_count()``.

    Returns
    -------
    (list, Word2Vec)
        The vocabulary (``wv.index_to_key`` as a list) and the trained model.
    """
    # Defaults are resolved per call: a `time.time()` default in the signature
    # is evaluated once at definition time and would make every timing wrong,
    # and a list default would be shared between calls.
    if words is None:
        words = ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
    if t is None:
        t = time.time()
    if cores is None:
        cores = multiprocessing.cpu_count()

    sentences = df[f'Job Description {embedding_library}_{ngram_number}grams_original_list'].values

    # The corpus is deliberately NOT passed to the constructor: doing so makes
    # gensim build the vocabulary and train immediately, and the explicit
    # build_vocab() call below would then throw that work away.
    w2v_model = Word2Vec(
        vector_size=size,
        min_count=0,
        window=2,
        sample=6e-5,
        alpha=0.03,
        min_alpha=0.0007,
        negative=20,
        workers=max(1, cores - 1),  # never ask for zero worker threads
        sg = 1,
    )

    w2v_model.build_vocab(sentences, progress_per=10000)
    print(f'Time to build w2v_vocab for {size}: {round((time.time() - t) / 60, 2)} mins')

    w2v_model.train(
        sentences,
        total_examples=w2v_model.corpus_count,
        epochs=30,
        report_delay=1,
    )

    print(f'Time to train the model for {size}: {round((time.time() - t) / 60, 2)} mins')
    w2v_vocab = list(w2v_model.wv.index_to_key)

    print(f'Checking words form list of length {len(words)}')
    print(f'WORDS LIST: {words}')

    for word in words:
        print(f'Checking word:\n{word.upper()}:')
        try:
            print(f'Length of {size} model vobal: {len(w2v_vocab)}')
            # Keys are wrapped in lists so a bare string is never iterated
            # character by character.
            print(f'{size} - Positive most similar to {word}: {w2v_model.wv.most_similar(positive=[word], topn=5)}')
            print(f'{size} - Negative most similar to {word}: {w2v_model.wv.most_similar(negative=[word], topn=5)}')

        except KeyError as e:
            # Probe word is not in the vocabulary; report it and carry on.
            print(e)

    return w2v_vocab, w2v_model

def word2vec_embeddings(sentences, w2v_vocab, w2v_model, size=300):
    """Return the mean Word2Vec vector of the in-vocabulary tokens.

    Tokens of ``sentences`` that are not in ``w2v_vocab`` are ignored. If no
    token remains, a zero vector of length ``size`` is returned instead.
    """
    known_tokens = list(filter(lambda token: token in w2v_vocab, sentences))

    if not known_tokens:
        return np.zeros(size)

    return np.mean(w2v_model.wv[known_tokens], axis=0)



In [None]:
def build_train_fasttext(
    df, ngram_number, embedding_library, size = 300,
    words = None,
    t = None, cores = None,
):
    """Train a skip-gram FastText model on one tokenised column of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain the column
        ``'Job Description {embedding_library}_{ngram_number}grams_original_list'``
        (presumably one list of tokens per row -- confirm against the data).
    ngram_number, embedding_library
        Used only to build the column name above.
    size : int
        Dimensionality of the word vectors.
    words : list of str, optional
        Probe words whose nearest/farthest neighbours are printed after
        training. Defaults to a fixed list of twelve words.
    t : float, optional
        Start timestamp used for the elapsed-time messages. Defaults to the
        time of the call.
    cores : int, optional
        Number of CPU cores; ``cores - 1`` (at least 1) workers are used.
        Defaults to ``multiprocessing.cpu_count()``.

    Returns
    -------
    (list, FastText)
        The vocabulary (``wv.index_to_key`` as a list) and the trained model.
    """
    # Defaults are resolved per call: a `time.time()` default in the signature
    # is evaluated once at definition time and would make every timing wrong,
    # and a list default would be shared between calls.
    if words is None:
        words = ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
    if t is None:
        t = time.time()
    if cores is None:
        cores = multiprocessing.cpu_count()

    sentences = df[f'Job Description {embedding_library}_{ngram_number}grams_original_list'].values

    # The corpus is deliberately NOT passed to the constructor: doing so makes
    # gensim build the vocabulary and train immediately, and the explicit
    # build_vocab() call below would then throw that work away.
    ft_model = FastText(
        vector_size=size,
        min_count=0,
        window=2,
        sample=6e-5,
        alpha=0.03,
        min_alpha=0.0007,
        negative=20,
        workers=max(1, cores - 1),  # never ask for zero worker threads
        sg = 1,
    )

    ft_model.build_vocab(sentences, progress_per=10000)
    print(f'Time to build vocab for {size}: {round((time.time() - t) / 60, 2)} mins')

    ft_model.train(
        sentences,
        total_examples=ft_model.corpus_count,
        epochs=30,
        report_delay=1,
    )

    print(f'Time to train the model for {size}: {round((time.time() - t) / 60, 2)} mins')
    ft_vocab = list(ft_model.wv.index_to_key)

    print(f'Checking words form list of length {len(words)}')
    print(f'WORDS LIST: {words}')

    for word in words:
        print(f'Checking word:\n{word.upper()}:')
        try:
            print(f'Length of {size} model vobal: {len(ft_vocab)}')
            # Keys are wrapped in lists so a bare string is never iterated
            # character by character.
            print(f'{size} - Positive most similar to {word}: {ft_model.wv.most_similar(positive=[word], topn=5)}')
            print(f'{size} - Negative most similar to {word}: {ft_model.wv.most_similar(negative=[word], topn=5)}')

        except KeyError as e:
            # Report the lookup failure and carry on with the next probe word.
            print(e)

    return ft_vocab, ft_model

def fasttext_embeddings(sentences, ft_vocab, ft_model, size=300):
    """Return the mean FastText vector of the in-vocabulary tokens.

    Tokens of ``sentences`` that are not in ``ft_vocab`` are ignored. If no
    token remains, a zero vector of length ``size`` is returned instead.
    """
    known_tokens = list(filter(lambda token: token in ft_vocab, sentences))

    if not known_tokens:
        return np.zeros(size)

    return np.mean(ft_model.wv[known_tokens], axis=0)


In [None]:
def get_glove(glove_file = f'{llm_path}/gensim/glove/glove.840B.300d.txt'):
    """Load a GloVe text file into a ``{word: float32 vector}`` dictionary.

    Each line is split on whitespace; the first field is the word and the
    remaining fields are parsed as the vector. Blank lines are ignored, and
    lines whose remaining fields are not all numeric are skipped and counted.

    Returns
    -------
    dict
        Mapping from word to a 1-D ``float32`` numpy array.
    """
    embeddings_index = {}
    skipped_lines = 0

    with open(glove_file, 'r', encoding='utf8') as glove:

        for line in glove:
            values = line.split()
            if not values:
                # A blank line (e.g. at the end of the file) has no word field;
                # indexing values[0] would raise IndexError.
                continue
            word = values[0]

            try:
                embeddings_index[word] = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric field after the first token: skip the line, but
                # keep count so the loss is visible.
                skipped_lines += 1

    print(f'Found {len(embeddings_index)} word vectors.')
    if skipped_lines:
        print(f'Skipped {skipped_lines} lines that could not be parsed.')

    return embeddings_index


In [None]:
# Preprocessed unlabeled dataframe
print('Analyzing DF.')
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_all = pd.read_pickle(f'{data_dir}df_manual_for_trainning.pkl')
# n job ads = 21204

# df_name = 'df_manual_mean'
# df_all = pd.read_pickle(f'{df_dir}{df_name}_outliers.{file_save_format}')

# Remove the manual task-label columns. `errors='ignore'` drops whichever of
# them exist; the previous try/except dropped none when a single one was
# missing.
df_all = df_all.drop(
    columns=['Task_Mentioned', 'Task_Warmth', 'Task_Competence'],
    errors='ignore',
)

# df_all = df_all.dropna(subset=dv_cols)
print('DF Processed:')
df_all.info()
df_gender_age_info(df_all)
# Only the last expression of a cell is displayed, so the first table has to
# be printed explicitly.
print(df_all['English Requirement'].value_counts())
df_all['Dutch Requirement'].value_counts()

In [None]:
# Print the schema of the loaded frame again (the same call already runs at
# the end of the loading cell).
df_all.info()

In [None]:
# Load the embedding models trained earlier and saved to disk, one per
# (embedding library, n-gram size, model type) combination.
for embedding_library, ngram_number in itertools.product(embedding_libraries_list, ngrams_list):
    for embed_model_name, embed_func_list in embedding_models_dict.items():
        # Only the first three slots are the training/loader helpers; a
        # pretrained model may already sit in a fourth slot if this cell has
        # been run before.
        build_train_func, embed_func, model_loader = (list(embed_func_list) + [None] * 3)[:3]

        if model_loader is None:
            # Entries without a loader (e.g. the pretrained-only 'glove'
            # entry created below) have nothing to load from disk.
            continue

        model = model_loader.load(
            validate_path(
                f'{data_dir}embeddings models/{embedding_library}_{ngram_number}grams_{embed_model_name}_model.model'
            )
        )

        # NOTE(review): the attribute name omits `embedding_library`, so models
        # of different libraries overwrite each other on `mod` -- confirm intended.
        setattr(mod, f'model_{embed_model_name}_{ngram_number}grams', model)

# ft_model_gensim = FastText.load(
#     validate_path(
#         f'{args["embeddings_save_path"]}123grams_{embedding_library}_ft_model.model'
#     )
# )

# word2vec_model300 = gensim_api.load('word2vec-google-news-300')
# glove_model300 = gensim_api.load('glove-wiki-gigaword-300')
# fasttext_model300 = gensim_api.load('fasttext-wiki-news-subwords-300')

# word_embedding_models = {'Word2Vec': word2vec_model300, 'GLoVe': glove_model300, 'fastText': fasttext_model300}

# Attach the pretrained vectors as the fourth element of every entry.
# Rebuilding the list (instead of append) keeps the cell idempotent, and the
# 'glove' entry is padded so every entry has the same four-slot layout that
# the soft-cosine cell unpacks.
embedding_models_dict['w2v'] = list(embedding_models_dict['w2v'])[:3] + [gensim_api.load('word2vec-google-news-300')]
embedding_models_dict['glove'] = [None, None, None, gensim_api.load('glove-wiki-gigaword-300')]
embedding_models_dict['ft'] = list(embedding_models_dict['ft'])[:3] + [gensim_api.load('fasttext-wiki-news-subwords-300')]

In [None]:
# Show which embedding model names are registered.
embedding_models_dict.keys()

In [None]:
# Summarise the sample: gender/age information, then the language requirements.
df_gender_age_info(df_all)
print('='*20)
print('English Requirement:')
# Printed explicitly: a bare expression in the middle of a cell is not
# displayed, which left this header without any content.
print(df_all['English Requirement'].value_counts())
print('='*20)
print('Dutch Requirement:')
df_all['Dutch Requirement'].value_counts()

In [None]:
def make_sample(df_all, n, sampling_enabled = True, random_state=random_state):
    """Return a random sample of ``df_all``, or ``df_all`` itself.

    Parameters
    ----------
    df_all : DataFrame
        Frame to sample from.
    n : int
        Requested number of rows; capped at ``len(df_all)`` so that asking for
        more rows than exist does not raise.
    sampling_enabled : bool
        When truthy, draw ``n`` rows (index reset); otherwise return
        ``df_all`` unchanged.
    random_state
        Seed passed to ``DataFrame.sample``.
    """
    # Plain truthiness instead of `is True` / `is False`: any other value
    # (e.g. 1 or a numpy bool) previously left `df_all_sample` unassigned.
    if sampling_enabled:
        sample_size = n if n is None else min(n, len(df_all))
        df_all_sample = df_all.sample(n=sample_size, random_state=random_state).reset_index(drop=True)
    else:
        df_all_sample = df_all

    print(f'Sample size: {len(df_all_sample)}')

    return df_all_sample

In [None]:
def make_sample_dfs(
    df_all, model_sizes = model_sizes, n=300, sampling_enabled = True,
    dfs_dict = None
):
    """Sample ``df_all`` and split the sample by gender and by age category.

    Parameters
    ----------
    df_all : DataFrame
        Must contain the columns ``'Gender'`` and ``'Age'``.
    model_sizes
        Accepted for interface compatibility; not used in this function.
    n, sampling_enabled
        Forwarded to ``make_sample``.
    dfs_dict : dict, optional
        Container to fill. When omitted, a fresh one is created for every
        call; a dict literal in the signature would be shared and mutated
        across calls.

    Returns
    -------
    dict
        ``dfs_dict[iv]['categories'][category]['df']`` holds the rows of the
        sample belonging to that category, for ``iv`` in ``'Gender'`` and
        ``'Age'``.
    """
    if dfs_dict is None:
        dfs_dict = {
            # 'All Sample': {'categories': {'All Sample': defaultdict()}},
            'Gender': {'categories': {'Female': defaultdict(), 'Mixed Gender': defaultdict(), 'Male': defaultdict(), }},
            'Age': {'categories': {'Older': defaultdict(), 'Mixed Age': defaultdict(), 'Younger': defaultdict()}}
        }

    df_all_sample = make_sample(df_all, n=n, sampling_enabled=sampling_enabled)

    # `order_gender` / `order_age` are expected to list the category labels
    # present as keys in `dfs_dict` -- TODO confirm where they are defined.
    for gen in order_gender:
        df_gen = df_all_sample.loc[df_all_sample['Gender'] == gen]
        dfs_dict['Gender']['categories'][gen]['df'] = df_gen
        print(f'Length of {gen}: {len(df_gen)}')

    for age in order_age:
        df_age = df_all_sample.loc[df_all_sample['Age'] == age]
        dfs_dict['Age']['categories'][age]['df'] = df_age
        print(f'Length of {age}: {len(df_age)}')

    return dfs_dict

In [None]:
# Build the per-category sample frames (default sample size n=300).
dfs_dict = make_sample_dfs(df_all, sampling_enabled=True)

In [None]:
# Compute word-frequency tables for every category and every
# (embedding library, n-gram size) combination.
for iv, iv_cats in dfs_dict.items():
    for iv_cat, value in iv_cats['categories'].items():
        # Create the container once per category. Creating it inside the inner
        # loop wiped the results of every combination except the last one.
        value['frequencies'] = defaultdict()
        frequencies = value['frequencies']

        for embedding_library, ngram_number in itertools.product(embedding_libraries_list, ngrams_list):
            key_prefix = f'{embedding_library}_{ngram_number}grams_abs_word'
            column_prefix = f'Job Description {embedding_library}_{ngram_number}grams_abs_word'

            # Absolute frequency, percentage and cumulative percentage.
            for suffix in ('freq', 'perc', 'perc_cum'):
                frequencies[f'{key_prefix}_{suffix}'] = convert_frequency(
                    value,
                    f'{column_prefix}_{suffix}'
                )

            # One row per word, sorted by absolute frequency (descending).
            frequencies[f'{key_prefix}_freq_df'] = pd.DataFrame(
                data=[
                    frequencies[f'{key_prefix}_freq'],
                    frequencies[f'{key_prefix}_perc'],
                    frequencies[f'{key_prefix}_perc_cum']
                ]).T.sort_values(0, ascending=False).rename(columns={0: 'abs_word_freq', 1: 'abs_word_perc', 2: 'abs_word_perc_cum'})

            word_freq = frequencies[f'{key_prefix}_freq']
            word_freq_df = frequencies[f'{key_prefix}_freq_df']

            print('='*80)
            print(f'{iv_cat.upper()}:')
            print(f'{iv_cat} word frequency length: {len(word_freq_df)}')
            print(f'{iv_cat} word frequency sorted: {sorted(word_freq, key=word_freq.get, reverse=True)[:5]}')
            print('-'*20)
            print(f'{iv_cat} {embedding_library} {ngram_number}grams abs_word_freq_df:')
            print('-'*20)
            print(f'{word_freq_df.head()}')


In [None]:
# Debug output: category mapping of the `iv_cats` loop variable left over from
# an earlier loop over dfs_dict.
# NOTE(review): relies on leaked kernel state and prints whole frames.
print(iv_cats['categories'].items())

In [None]:
# Show the configured n-gram sizes (defined outside this notebook section).
ngrams_list

In [None]:
# Train Word2Vec models for every category and model size.
for iv, iv_cats in dfs_dict.items():
    for (iv_cat, value), model_size in itertools.product(iv_cats['categories'].items(), model_sizes):
        size_key = f'w2v_model_size_{model_size}'
        for embedding_library, ngram_number in itertools.product(embedding_libraries_list, ngrams_list):
            print(iv_cat.upper())
            print('+'*30)
            # NOTE(review): the container is re-created on every pass, so only
            # the model of the last (library, n-gram) combination is kept.
            w2v_results = value[size_key] = defaultdict()
            w2v_results['w2v_vocab'], w2v_results['w2v_model'] = build_train_word2vec(
                value['df'],
                ngram_number=ngram_number,
                embedding_library=embedding_library,
                size=model_size,
            )
            

In [None]:
# Train FastText models for every category and model size.
for iv, iv_cats in dfs_dict.items():
    for (iv_cat, value), model_size in itertools.product(iv_cats['categories'].items(), model_sizes):
        size_key = f'ft_model_size_{model_size}'
        for embedding_library, ngram_number in itertools.product(embedding_libraries_list, ngrams_list):
            print(iv_cat.upper())
            print('+'*30)
            # NOTE(review): the container is re-created on every pass, so only
            # the model of the last (library, n-gram) combination is kept.
            ft_results = value[size_key] = defaultdict()
            ft_results['ft_vocab'], ft_results['ft_model'] = build_train_fasttext(
                value['df'],
                ngram_number=ngram_number,
                embedding_library=embedding_library,
                size=model_size,
            )

In [None]:
# Walk down the nesting of dfs_dict one level at a time. Every level except
# the last is printed explicitly, because a notebook only displays the final
# expression of a cell.
print(dfs_dict.keys())  # independent variables (iv)
print(dfs_dict['Gender'].keys())  # entries stored per iv
print(dfs_dict['Gender']['categories'].keys())  # categories of the iv (iv_cat)
print(dfs_dict['Gender']['categories']['Female'].keys())  # entries stored per category
dfs_dict['Gender']['categories']['Female']['df'].keys()  # columns of the category frame


In [None]:
# Join all tokens of each category into one space-separated string.
# `sentences_list` is reset per iv, so after the loop it holds the strings of
# the last iv only.
for iv, iv_cats in dfs_dict.items():
    sentences_list = []
    for iv_cat, value in iv_cats['categories'].items():
        lst = value['df'][f'{n_gram}'].to_list()
        category_text = ' '.join(token for row in lst for token in row if token)
        sentences_list.append(category_text)
len(sentences_list)

In [None]:
# Build one gensim Dictionary per independent variable from the tokens of its
# categories.
for iv, iv_cats in dfs_dict.items():

    print(f'Soft Cosine Similarity for {iv}:')

    dfs_dict[f'{iv}']['word_embeddings'] = defaultdict()

    # Reset per iv. This was misspelled `sentence_list`, so `sentences_list`
    # silently kept growing with whatever an earlier cell had left in it.
    sentences_list = []

    for iv_cat, value in iv_cats['categories'].items():
        # One document per category, kept as a list of tokens: Dictionary
        # rejects plain strings ("doc2bow expects an array of unicode tokens").
        sentences_list.append([sentence for sentences in value['df'][f'{n_gram}'].to_list() for sentence in sentences if sentence])

    dictionary = corpora.Dictionary([sentences for sentences in sentences_list if sentences])
    dfs_dict[f'{iv}']['word_embeddings']['dictionary'] = dictionary
    print(f'{iv} Dictionary:\n{dictionary}')

In [None]:
# Join all tokens of each category into one space-separated string (this cell
# repeats an earlier one). `sentences_list` is reset per iv, so after the loop
# it holds the strings of the last iv only.
for iv, iv_cats in dfs_dict.items():
    sentences_list = []
    for iv_cat, value in iv_cats['categories'].items():
        lst = value['df'][f'{n_gram}'].to_list()
        category_text = ' '.join(token for row in lst for token in row if token)
        sentences_list.append(category_text)
len(sentences_list)

In [None]:
# For each independent variable, compare its categories with soft cosine
# similarity under every registered embedding model.
for iv, iv_cats in dfs_dict.items():

    print(f'Soft Cosine Similarity for {iv}:')

    dfs_dict[f'{iv}']['word_embeddings'] = defaultdict()

    category_names = list(iv_cats['categories'].keys())

    # One document per category: the flattened tokens of every row of that
    # category. Documents stay token lists (not joined strings) because gensim
    # would otherwise see single characters, and one BoW vector per category
    # is what the pairwise comparison below indexes by position.
    # (A misspelled `sentence_list = []` previously left `sentences_list`
    # un-reset, so it accumulated across ivs and earlier cells.)
    sentences_list = [
        [token for tokens in value['df'][f'{n_gram}'].to_list() for token in tokens if token]
        for value in iv_cats['categories'].values()
    ]

    dictionary = corpora.Dictionary(sentences_list)
    dfs_dict[f'{iv}']['word_embeddings']['dictionary'] = dictionary
    print(f'{iv} Dictionary:\n{dictionary}')

    bow_vectors = [dictionary.doc2bow(document) for document in sentences_list]
    dfs_dict[f'{iv}']['word_embeddings']['bow_vectors'] = bow_vectors
    print(f'{iv} TOP 5 BOW:\n{bow_vectors[0][:5]}')

    # NOTE: this is the fitted TfidfModel, stored under the original key name.
    tfidf_vectors = TfidfModel(corpus=bow_vectors, dictionary=dictionary)
    dfs_dict[f'{iv}']['word_embeddings']['tfidf_vectors'] = tfidf_vectors

    # Soft Cosine Similarities
    for embed_model_name, embed_func_list in embedding_models_dict.items():
        # The pretrained vectors are the last element of each entry (they are
        # appended in the model-loading cell). Taking the last element also
        # works for the one-element 'glove' entry, where a four-way unpack
        # raised "not enough values to unpack".
        model = embed_func_list[-1]

        similarity_matrix = SparseTermSimilarityMatrix(WordEmbeddingSimilarityIndex(model), dictionary)
        dfs_dict[f'{iv}']['word_embeddings'][f'{embed_model_name}_similarity_matrix'] = similarity_matrix

        softcos_index = SoftCosineSimilarity(bow_vectors, similarity_matrix, num_best=10)
        dfs_dict[f'{iv}']['word_embeddings'][f'{embed_model_name}_docsim_index'] = softcos_index

        # Normalised soft cosine similarity for every pair of categories
        # (with three categories: 0-1, 0-2, 1-2).
        pair_lines = [
            f'{category_names[first]} Dominated Sectors <-> {category_names[second]} Dominated Sectors: '
            f'{similarity_matrix.inner_product(bow_vectors[first], bow_vectors[second], normalized=(True, True)):.2f}'
            for first, second in itertools.combinations(range(len(category_names)), 2)
        ]
        # `embed_model_name` replaces the undefined name `model_name`.
        print(f'{embed_model_name} Soft Cosine Similarity between:\n' + '\n'.join(pair_lines))

        df_sims = create_soft_cossim_matrix(bow_vectors, category_names, similarity_matrix)
        dfs_dict[f'{iv}']['word_embeddings'][f'{embed_model_name}_df_sims'] = df_sims
        print(f'DF of cosine similarities:\n{df_sims.head()}')


In [None]:
# Dictionary over every non-empty element found inside the entries of
# `sentences_list` (as left behind by whichever cell ran last).
dictionary = corpora.Dictionary(
    [
        document
        for group in sentences_list
        for document in group
        if len(document) > 0
    ]
)

In [None]:
# Number of entries in the dictionary.
len(dictionary)

In [None]:
# Bag-of-words vector for every element inside the entries of `sentences_list`.
bow_vectors = []
bow_vectors.extend(
    dictionary.doc2bow(document) for group in sentences_list for document in group
)

In [None]:
# Number of bag-of-words vectors.
len(bow_vectors)

In [None]:
# Rows of the `n_gram` column for the first category of the current `iv`.
# NOTE(review): `iv` and `iv_cats` are loop variables left over from an
# earlier cell, so the result depends on which loop ran last.
x = dfs_dict[f'{iv}']['categories'][f'{list(iv_cats["categories"].keys())[0]}']['df'][f'{n_gram}'].to_list()

In [None]:
# Concatenate every token of `x` into one string, each token preceded by a
# space; falsy rows are skipped.
token_parts = []
for token_list in x:
    if not token_list:
        continue
    for token in token_list:
        token_parts.append(' ' + token)
sent = ''.join(token_parts)


In [None]:
# First ten characters of the concatenated string, as a list.
list(sent[:10])

In [None]:
# Every token of the first category of the current `iv`, each prefixed with a
# space. The outer `for` has to come first in a comprehension: the previous
# clause order iterated a `token_list` left over from another cell (NameError
# on a fresh kernel). Falsy rows are skipped.
# NOTE(review): depends on the leaked loop variables `iv` / `iv_cats`; the
# variable name suggests the 'Female' category of 'Gender' -- confirm.
female_all_sentences = [
    f' {token}'
    for token_list in dfs_dict[f'{iv}']['categories'][f'{list(iv_cats["categories"].keys())[0]}']['df'][f'{n_gram}'].to_list()
    if token_list
    for token in token_list
]

In [None]:
# Rows of the `n_gram` column for the first three categories of the current
# `iv`, followed by a Dictionary over all their non-empty rows.
category_names = list(iv_cats["categories"].keys())
sentences_list = [
    dfs_dict[f'{iv}']['categories'][f'{category_names[position]}']['df'][f'{n_gram}'].to_list()
    for position in range(3)
]

non_empty_rows = [row for rows in sentences_list for row in rows if len(row) > 0]
dictionary = corpora.Dictionary(non_empty_rows)

In [None]:
def pca_sample(sector, size, model, vocab):
    """Project the vectors of ``vocab`` onto 3 principal components and plot them.

    Parameters
    ----------
    sector : str
        Label used in messages and in the file name.
    size : int
        Model size, used in messages and in the file name.
    model
        Embedding model exposing ``model.wv[vocab]``.
    vocab : list
        Words to project; also used as the index of the coordinates frame.

    Returns
    -------
    tuple
        ``(pca_clf, pca_tmp, pca_components, pca_eigenvalues,
        pca_perc_explained_variance)``: the projected coordinates, the
        (sampled) frame that was plotted, the components, the explained
        variance ratios and their cumulative percentage.
    """
    # This function is called for Word2Vec and FastText models with the same
    # `sector`/`size`, so the title and file name are derived from the model
    # class; otherwise the second call overwrites the first figure. Anything
    # that is not a class named 'FastText' keeps the original 'w2v' naming.
    is_fasttext = type(model).__name__ == 'FastText'
    file_prefix = 'ft' if is_fasttext else 'w2v'
    model_label = 'FastText' if is_fasttext else 'Word2Vec'

    print('\n')
    print(f'PCA for {sector} model {size}')
    pca = PCA(n_components=3, random_state=random_state)
    X = model.wv[vocab]
    pca_clf = pca.fit_transform(X)
    pca_components = pca.components_
    pca_eigenvalues = pca.explained_variance_ratio_
    pca_perc_explained_variance=np.cumsum(np.round(pca_eigenvalues, decimals=3)*100)
    pca_tmp = pd.DataFrame(pca_clf, index=vocab, columns=['x', 'y', 'z'])
    print(pca_tmp.head(3))
    try:
        # Plot at most 150 words; seeded so the figure is reproducible.
        pca_tmp = pca_tmp.sample(150, random_state=random_state)
    except ValueError as e:
        # Fewer than 150 words available: plot all of them.
        print(f'For {sector} model {size}, error: {e}')

    fig = plt.figure(figsize=(15, 15))
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(pca_tmp['x'], pca_tmp['y'], pca_tmp['z'], alpha = 0.5)

    # Label every plotted point with its word.
    for word, row in pca_tmp.iterrows():
        x, y, z = row
        ax.text(x, y, z, s=word, size=8, zorder=1, color='k')

    plt.title(f'{model_label} map - PCA')
    plt.tight_layout()
    # Save through the figure object, before showing it.
    # NOTE(review): dpi=3000 on a 15x15 inch figure is extremely large for
    # raster formats -- confirm this is intended.
    fig.savefig(f'{plot_save_path}{file_prefix}_{sector}_{size}_map_PCA.{str(image_save_format)}', format=image_save_format, dpi=3000, bbox_inches='tight')
    plt.show()


    return pca_clf, pca_tmp, pca_components, pca_eigenvalues, pca_perc_explained_variance

In [None]:
# Run the PCA projection for the Word2Vec and FastText model of every
# category and model size, storing the five outputs next to each model.
pca_output_names = (
    'pca_clf',
    'pca_tmp',
    'pca_components',
    'pca_eigenvalues',
    'pca_perc_explained_variance',
)

for iv, iv_cats in dfs_dict.items():
    for (iv_cat, value), model_size in itertools.product(iv_cats['categories'].items(), model_sizes):
        # Word2Vec first, then FastText.
        for model_prefix in ('w2v', 'ft'):
            model_results = value[f'{model_prefix}_model_size_{model_size}']
            pca_outputs = pca_sample(
                iv_cat,
                model_size,
                model_results[f'{model_prefix}_model'],
                model_results[f'{model_prefix}_vocab'],
            )
            for output_name, output in zip(pca_output_names, pca_outputs):
                model_results[f'{model_prefix}_{output_name}'] = output

In [None]:
def tsne_sample(sector, size, model, vocab):
    """Embed the vectors of ``vocab`` with 3-component t-SNE and plot them.

    Two figures are produced and saved: a 3-D scatter of a sample of up to
    150 words, and a 2-D scatter of the first two components of every word.

    Parameters
    ----------
    sector : str
        Label used in messages and in the file names.
    size : int
        Model size, used in messages and in the file names.
    model
        Embedding model exposing ``model.wv[vocab]``.
    vocab : list
        Words to embed; also used as point labels.

    Returns
    -------
    tuple
        ``(tsne_clf, tsne_tmp)``: the full coordinates array and the
        (sampled) frame that was plotted in 3-D.
    """
    # Title and file prefix follow the model class so that a FastText model
    # does not overwrite the Word2Vec figures; anything that is not a class
    # named 'FastText' keeps the original 'w2v' naming.
    is_fasttext = type(model).__name__ == 'FastText'
    file_prefix = 'ft' if is_fasttext else 'w2v'
    model_label = 'FastText' if is_fasttext else 'Word2Vec'

    print(f'TSNE for {sector} model {size}')
    # scikit-learn requires perplexity < number of samples.
    perplexity = min(40, max(1, len(vocab) - 1))
    tsne = TSNE(perplexity=perplexity, n_components=3, random_state=random_state, init='pca')
    X = model.wv[vocab]
    tsne_clf = tsne.fit_transform(X)
    tsne_tmp = pd.DataFrame(tsne_clf, index=vocab, columns=['x', 'y', 'z'])
    # Flag the first word (drawn in red below). A single positional assignment
    # replaces the chained `tsne_tmp['input'].iloc[0:1] = 1`, which may write
    # to a temporary copy.
    tsne_tmp['input'] = 0
    tsne_tmp.iloc[0:1, tsne_tmp.columns.get_loc('input')] = 1
    print(tsne_tmp.head(3))
    try:
        # Plot at most 150 words; seeded so the figure is reproducible.
        tsne_tmp = tsne_tmp.sample(150, random_state=random_state)
    except ValueError as e:
        # Fewer than 150 words available: plot all of them.
        print(f'For {sector} model {size}, error: {e}')

    # Plotting TSNE
    # 3D Plot
    fig = plt.figure(figsize=(15, 15))
    ax = fig.add_subplot(111, projection='3d')
    regular_points = tsne_tmp[tsne_tmp["input"]==0]
    flagged_points = tsne_tmp[tsne_tmp["input"]==1]
    ax.scatter(regular_points['x'], regular_points['y'], regular_points['z'],
            c='black', alpha = 0.5)
    ax.scatter(flagged_points['x'], flagged_points['y'], flagged_points['z'],
            c='red', alpha = 0.5)
    ax.set(xlabel=None, ylabel=None, zlabel=None, xticklabels=[],
        yticklabels=[], zticklabels=[])

    for word, row in tsne_tmp[["x","y","z"]].iterrows():
        x, y, z = row
        ax.text(x, y, z, s=word, size=8, zorder=1, color='k')

    plt.title(f'{model_label} map - TSNE')
    plt.tight_layout()
    # Save through the figure object, before showing it.
    fig.savefig(
        f'{plot_save_path}{file_prefix}_{sector}_{size}_map_TSNE.{image_save_format}', format=image_save_format, dpi=3000, bbox_inches='tight'
    )
    plt.show()

    # 2D plot of the first two t-SNE components of every word. The rows of
    # `tsne_clf` are in `vocab` order, so `vocab` supplies the labels (the
    # name `labels` used here before was never defined).
    fig_2d = plt.figure(figsize=(16, 16))
    for (x_coord, y_coord), label in zip(tsne_clf[:, :2], vocab):
        plt.scatter(x_coord, y_coord)
        plt.annotate(
            label,
            xy=(x_coord, y_coord),
            xytext=(5, 2),
            textcoords="offset points",
            ha="right",
            va="bottom",
        )
    plt.title(f'{model_label} plot - TSNE')
    plt.tight_layout()
    # Saving after plt.show() writes an empty figure, so save first.
    fig_2d.savefig(
        f'{plot_save_path}{file_prefix}_{sector}_{size}_plot_TSNE.{image_save_format}', format=image_save_format, dpi=3000, bbox_inches='tight'
    )
    plt.show()

    return tsne_clf, tsne_tmp

In [None]:
# Run the t-SNE projection for the Word2Vec model of every category and model
# size, storing both outputs next to the model.
for iv, iv_cats in dfs_dict.items():
    for (iv_cat, value), model_size in itertools.product(iv_cats['categories'].items(), model_sizes):
        w2v_results = value[f'w2v_model_size_{model_size}']
        w2v_results['tsne_clf'], w2v_results['tsne_tmp'] = tsne_sample(
            iv_cat,
            model_size,
            w2v_results['w2v_model'],
            w2v_results['w2v_vocab'],
        )

In [None]:
def tsne_plot(sector, size, model, words):
    """Plot each query word with its 10 most similar words in a 2-D t-SNE map.

    For every word in ``words`` the vectors of the word (red) and its nearest
    neighbours (green) are reduced with PCA, embedded in 2-D with t-SNE and
    drawn as a labelled scatter plot. Words missing from the model are
    reported and skipped.

    Parameters
    ----------
    sector : str
        Label used in messages and plot titles.
    size : int
        Model size, used in messages and plot titles.
    model
        Embedding model exposing ``model.wv``.
    words : iterable of str
        Query words.
    """
    for word in words:
        try:
            print(f'For {sector} model {size}, analyzing word: {word.title()}')

            # gets list of most similar words
            sim_words = model.wv.most_similar(word, topn=10)

            # The query word comes first (red), its neighbours follow (green).
            word_labels = [word] + [similar_word for similar_word, _ in sim_words]
            color_list = ['red'] + ['green'] * len(sim_words)

            # Vectors are collected per word. A single array shared by all
            # words kept growing while the labels were reset, so every word
            # after the first failed with a length mismatch.
            arrays = model.wv[word_labels]

            try:
                #---------------------- Apply PCA and tsne to reduce dimention --------------

                # PCA to at most 10 components (bounded by the number of
                # points and the vector size)
                n_points, n_dims = arrays.shape
                model_pca = PCA(n_components=min(10, n_points, n_dims)).fit_transform(arrays)

                # Finds 2d coordinates t-SNE; perplexity must stay below the
                # number of points (11 here), so 15 is capped.
                np.set_printoptions(suppress=True)
                Y = TSNE(
                    n_components=2,
                    random_state=random_state,
                    perplexity=min(15, max(1, n_points - 1)),
                ).fit_transform(model_pca)

                # Sets everything up to plot
                df_plot = pd.DataFrame({'x': list(Y[:, 0]), 'y': list(Y[:, 1]), 'words_name': word_labels, 'words_color': color_list})

                #------------------------- tsne plot Python -----------------------------------

                # plot dots with color and position
                plot_dot = sns.regplot(data=df_plot,
                                x="x",
                                y="y",
                                fit_reg=False,
                                marker="o",
                                scatter_kws={'s': 40,
                                            'facecolors': df_plot['words_color']
                                            }
                                )

                # Adds annotations with color one by one with a loop
                for point in df_plot.itertuples(index=False):
                    plot_dot.text(point.x,
                            point.y,
                            '  ' + point.words_name.title(),
                            horizontalalignment='left',
                            verticalalignment='bottom', size='medium',
                            color=point.words_color,
                            weight='normal'
                            ).set_size(15)

                plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
                plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

                plt.title(f'{sector} model {size} t-SNE visualization for word {word.title()}')
                plt.tight_layout()
                plt.show()
            except ValueError as e:
                # Reduction or plotting failed for this word; report and go on.
                print(f'{sector} model {size} error for {word}: {e}')
        except KeyError as e:
            # The query word is not in the model's vocabulary.
            print(f'{sector} model {size} error for {word}: {e}')

In [None]:
# t-SNE neighbourhood plots for the ten most frequent words of every category.
# The frequency tables are stored under keys prefixed with
# '<library>_<n>grams_'; the bare key 'abs_word_freq_df' is never created.
# The last (library, n-gram) combination is used because that is the one the
# stored Word2Vec model was trained on.
# NOTE(review): assumes both lists are indexable sequences -- confirm.
freq_df_key = f'{embedding_libraries_list[-1]}_{ngrams_list[-1]}grams_abs_word_freq_df'

for iv, iv_cats in dfs_dict.items():
    for (iv_cat, value), model_size in itertools.product(iv_cats['categories'].items(), model_sizes):
        tsne_plot(iv_cat, model_size, value[f'w2v_model_size_{model_size}']['w2v_model'], value['frequencies'][freq_df_key].index.values[:10])

In [None]:
from wordcloud import WordCloud

# Word cloud of the 50 most frequent words of every category.
# The frequency dicts are stored under keys prefixed with
# '<library>_<n>grams_'; the bare key 'abs_word_freq' is never created, so the
# last (library, n-gram) combination is used.
# NOTE(review): assumes both lists are indexable sequences -- confirm.
freq_key = f'{embedding_libraries_list[-1]}_{ngrams_list[-1]}grams_abs_word_freq'

for iv, iv_cats in dfs_dict.items():
    for iv_cat, value in iv_cats['categories'].items():
        print(f'{iv_cat} model wordcloud:')
        Cloud = WordCloud(colormap='viridis', background_color="white", max_words=50, random_state=random_state).generate_from_frequencies(value['frequencies'][freq_key])

        plt.imshow(Cloud)
        plt.axis("off")
        # Save before showing: after plt.show() the current figure is empty.
        plt.savefig(f'{plot_save_path}w2v_{iv_cat}_wordcloud.{image_save_format}', format=image_save_format, dpi=3000, bbox_inches='tight')
        plt.show()

In [None]:
# Optionally dump dfs_dict to disk, one "<key>, <value>" line per top-level key.
# NOTE(review): the file has a .json name but the content is the plain string
# form of each value, not JSON.
if args['save_enabled'] is True:
    save_path = validate_path(f'{args["embeddings_save_path"]}dfs_dict.json')
    with open(save_path, 'w') as f:
        for key, contents in dfs_dict.items():
            f.write(f"{key}, {contents}\n")


TODO: the cells below still need to be fixed.

In [None]:
# Vectorize the documents with TF-IDF: lowercased unigrams, English stop words
# removed, tokens produced by `tokenizer.tokenize`.
tfidf_options = dict(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=tokenizer.tokenize,
)
tfidf = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the document-term matrix in one step.
train_data = tfidf.fit_transform(documents_list)

In [None]:
# Number of topics (SVD components) to extract
num_components=10

# Truncated SVD for latent semantic analysis
lsa = TruncatedSVD(
    n_components=num_components,
    n_iter=100,
    random_state=42,
)

# Fit the model on the TF-IDF matrix (the transformed output is not needed here)
lsa.fit(train_data)

# Singular values and components of the fitted model
# NOTE(review): `components_` has one row per component, so its transpose has
# one column per component -- confirm the name `V_transpose` is intended.
Sigma = lsa.singular_values_
V_transpose = lsa.components_.T

In [None]:
# Print the topics with their terms
# `get_feature_names()` was removed from scikit-learn; use
# `get_feature_names_out()` when available and fall back on older versions.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = tfidf.get_feature_names_out().tolist()
else:
    terms = tfidf.get_feature_names()

for index, component in enumerate(lsa.components_):
    # Five terms with the largest weight in this component.
    top_terms_key = sorted(zip(terms, component), key=lambda term_weight: term_weight[1], reverse=True)[:5]
    top_terms_list = [term for term, _ in top_terms_key]
    print("Topic "+str(index)+": ",top_terms_list)