In [None]:
import re
import multiprocessing
from collections import defaultdict

import pandas as pd
import numpy as np 

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale as sk_scale
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

from scipy import stats

import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()

from IPython.display import display

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

# Prep nltk library: fetch the 'stopwords', 'wordnet' and 'omw-1.4' resources.
# load_data() below relies on the stop-word list and the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

### Setup bare-minimum config

In [None]:
# Regex patterns applied in order to each lower-cased tweet; every match is
# replaced with a single space.
#
# These are raw strings, so a single backslash is already a regex escape. The
# previous doubled form (e.g. r"\\/" or r"\\$") only matched when the text
# contained a literal backslash, so URLs, cashtags and mentions were never
# removed and their fragments ("https", "co", user names) leaked into the
# vocabulary.
tweets_patterns = [
            r"https?://[a-zA-Z0-9@:%._/+~#=?&;-]*",  # URLs
            r"\$[a-zA-Z0-9]*",                       # cashtags, e.g. $btc
            r"@[a-zA-Z0-9_]*",                       # @mentions (handles may contain "_")
            r'[^a-zA-Z"]'                            # anything that is not a letter or a double quote
        ]

# Same idea for the raw hashtag column: drop cashtags, then every character
# that is not a letter or a double quote.
hashtag_patterns = [
            r"\$[a-zA-Z0-9]*",
            r'[^a-zA-Z"]'
        ]

# Raw CSV column name -> column name used throughout the notebook.
# Only the keys listed here are kept when the data is loaded.
column_mappings = {
            "date": "date",
            "user_name": "username",
            "retweets": "retweets",
            "text": "tweet",
            "hashtags": "hashtags"
        }

# A tweet is kept in the filtered set when its cleaned text contains at least
# one of these terms (they are joined with "|" and used as a regex).
filter_words = [
            "ukraine",
            "russia",
            "zelensky"
        ]

# Integer sentiment value -> human readable label.
sentiment_map = {
            -1: "negative",
            0: "neutral",
            1: "positive"
        }

## Load in some data and build word embeddings

In [None]:
def load_data(import_version=12, data_dir="../../data"):
    """Load the tweet CSV, clean it, and return the full and keyword-filtered frames.

    Parameters
    ----------
    import_version : int, default 12
        Version suffix of the file ``slava_ukraini_tweets_v{version}.csv``.
    data_dir : str, default "../../data"
        Directory that contains the CSV file.

    Returns
    -------
    tweets_df : pandas.DataFrame
        Every de-duplicated tweet with the extra columns ``clean_tweet_words``
        (list of tokens), ``clean_tweet`` (tokens joined by spaces), ``day``
        and ``month``.
    filtered_tweets_df : pandas.DataFrame
        The rows of ``tweets_df`` whose ``clean_tweet`` matches at least one
        entry of the module-level ``filter_words``.
    """
    import_dest = "{data_dir}/slava_ukraini_tweets_v{version}.csv".format(
        data_dir=data_dir, version=import_version)
    raw_tweets_df = pd.read_csv(import_dest)

    # Keep only the mapped columns and rename them
    tweets_df = raw_tweets_df[list(column_mappings.keys())].rename(columns=column_mappings)

    # Rows without text cannot be cleaned (NaN has no .lower()), so drop them
    # before removing duplicate tweets.
    tweets_df = tweets_df.dropna(subset=['tweet']) \
                         .drop_duplicates(subset='tweet', keep='first') \
                         .copy()

    # Initialize Lemmatizer; a set makes the per-token stop-word test O(1)
    lemma = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))

    def cleanTweet(tweet):
        """Return the list of lemmatized, stop-word-free tokens of one tweet."""
        tweet = tweet.lower()
        for pattern in tweets_patterns:
            tweet = re.sub(pattern, ' ', tweet)
        # Drop single-character leftovers
        tweet = ' '.join([w for w in tweet.split() if len(w) > 1])

        # First pass: default (noun) lemmatization
        trimmed_lemma_words = [lemma.lemmatize(x) for x in nltk.wordpunct_tokenize(tweet)
                                    if x not in stop_words]
        clean_tweet = ' '.join(trimmed_lemma_words)

        # Second pass: verb lemmatization, filtering stop words again because
        # the first pass may have produced new ones
        return [lemma.lemmatize(x, nltk.corpus.reader.wordnet.VERB)
                    for x in nltk.wordpunct_tokenize(clean_tweet) if x not in stop_words]

    def cleanHashtags(hashtags):
        """Lower-case the hashtag string and strip everything matched by hashtag_patterns."""
        if hashtags:
            hashtags = hashtags.lower()
            for pattern in hashtag_patterns:
                hashtags = re.sub(pattern, ' ', hashtags)
            hashtags = hashtags.strip()
        return hashtags

    # Clean tweets
    tweets_df['clean_tweet_words'] = tweets_df['tweet'].apply(cleanTweet)
    tweets_df['clean_tweet'] = tweets_df['clean_tweet_words'].apply(' '.join)

    # Clean hashtags. Missing values are replaced by an empty string first;
    # a bare astype(str) would turn NaN into the literal text "nan".
    tweets_df["hashtags"] = tweets_df["hashtags"].fillna("").astype(str).apply(cleanHashtags)

    # Convert date to datetime and extract day/month
    tweets_df['date'] = pd.to_datetime(tweets_df['date'])
    tweets_df['day'] = tweets_df['date'].dt.day
    tweets_df['month'] = tweets_df['date'].dt.month

    # Remove all tweets which do not have the provided target words
    keywords_str = '|'.join(filter_words)
    filtered_tweets_df = tweets_df[tweets_df["clean_tweet"].str.contains(keywords_str)].copy()

    return tweets_df, filtered_tweets_df

In [None]:
def build_word_vectors(full_tweets_df):
    """Train a Word2Vec model on the cleaned tweets and return its word vectors.

    Parameters
    ----------
    full_tweets_df : pandas.DataFrame
        Must contain a ``clean_tweet_words`` column holding one list of tokens
        per tweet.

    Returns
    -------
    The ``wv`` attribute of the trained Word2Vec model.
    """
    # Restructure the `clean_tweet_words` column into a list of lists of words
    row_sentences = full_tweets_df["clean_tweet_words"].tolist()

    # Detect common phrases (bigrams) from a list of sentences
    phrases = Phrases(row_sentences)
    bigram = Phraser(phrases)
    sentences = bigram[row_sentences]

    # Leave one core free, but never request fewer than one worker:
    # cpu_count() - 1 is 0 on a single-core machine.
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    word_vec_model = Word2Vec(workers=num_cores, min_count=1)

    # Establish dataset for the vector model
    word_vec_model.build_vocab(sentences)

    # Train the model
    word_vec_model.train(sentences, epochs=30, total_examples=word_vec_model.corpus_count)

    return word_vec_model.wv

In [None]:
# Load and clean the tweets. load_data() returns the full frame and the subset
# whose cleaned text matches one of `filter_words`.
CLEAN_TWEETS, FILTERED_TWEETS = load_data()

# The embeddings are trained on the full cleaned set, not the filtered subset.
WORD_VECS = build_word_vectors(CLEAN_TWEETS)

## Incorporate same sentiment assignment system (for easy integration)

In [None]:
def setClusterSentiment(vectors, model, sentiment_mapping, version=0, **kwargs):
    """Interactively assign one sentiment value to every cluster.

    Sample terms for each cluster are shown through collectSamples(), then the
    user is prompted cluster by cluster. Typing "r" shows the next batch of
    samples, "q" aborts through sys.exit(1). The last cluster is not prompted
    for: it receives whichever sentiment value is still unused.

    Parameters
    ----------
    vectors, model, version
        Forwarded to collectSamples().
    sentiment_mapping : dict
        Sentiment value -> label, e.g. ``{-1: "negative", ...}``.
    **kwargs
        ``display_terms`` (default 20) is only used in the printed messages.

    Returns
    -------
    list
        Sentiment value per cluster, indexed by cluster number.
    """
    n_terms = kwargs.get('display_terms', 20)
    print(f'** Top {n_terms} Similar Word Vectors By Cluster **\n')

    collectSamples(key_vectors=vectors, model=model, version=version)

    legend = ', '.join(f'{k} = {v}' for k, v in sentiment_mapping.items())
    print(f"\nLabel each cluster: {legend} (\"r\" for new samples, \"q\" to exit)")

    allowed_values = [int(k) for k in sentiment_mapping.keys()]
    clusters_to_label = len(sentiment_mapping) - 1
    assigned = []
    batch_number = 0

    while len(assigned) < clusters_to_label:
        answer = input(f'Cluster {len(assigned)} value:')

        if answer == 'q':
            import sys
            sys.exit(1)

        if answer == 'r':
            # Show the next batch and repeat the prompt for the same cluster
            print(f'\n\nGenerating next {n_terms} samples...\n')
            batch_number += 1
            collectSamples(key_vectors=vectors, model=model, batch=batch_number, version=version)
            print('Current state:', assigned)
            print('Setting cluster:', len(assigned))
            continue

        try:
            value = int(answer)
        except ValueError:
            print(f'Must input a number in range {allowed_values}. Press q to exit')
            continue

        if value in assigned:
            print('Already used this sentiment value.')
        elif value not in allowed_values:
            print(f'Value not in provided sentiment mapping: {allowed_values}')
        else:
            assigned.append(value)
            print(f'Set cluster {len(assigned)-1} to {value} ({sentiment_mapping[value]})')

    # The remaining sentiment value goes to the last cluster
    leftover = (set(sentiment_mapping.keys()) - set(assigned)).pop()
    assigned.append(leftover)
    print((f'Set cluster {len(assigned)-1} to {assigned[-1]} ' +
            f'({sentiment_mapping[assigned[-1]]})'))

    return assigned



def collectSamples(key_vectors, model, batch=0, version=0):
    """Display the terms closest to each cluster centre, one batch at a time.

    Two tables are shown: terms that appear in exactly one cluster's sample
    ("unique") and terms that appear in several clusters ("duplicate"),
    the latter with their similarity to each cluster.

    Parameters
    ----------
    key_vectors
        version 0: an object exposing ``similar_by_vector(vector, topn, restrict_vocab)``.
        version 1: a DataFrame with a ``words`` column and a ``pca_values``
        column holding one coordinate vector per word.
    model
        Fitted clustering model exposing ``cluster_centers_``.
    batch : int, default 0
        Zero-based batch number; batch ``b`` shows ranks ``25*b`` to ``25*(b+1)-1``.
    version : {0, 1}, default 0
        Selects how similarities are computed (see ``key_vectors``).

    Raises
    ------
    ValueError
        If ``version`` is neither 0 nor 1.
    """
    if version not in (0, 1):
        raise ValueError(f'Unsupported version {version!r}: expected 0 or 1')

    batch_size = 25
    num_clusters = model.cluster_centers_.shape[0]
    # Ask for every rank up to the end of the requested batch, then slice below
    topn = batch_size * (batch + 1)

    if version == 0:
        word_vec_list = [key_vectors.similar_by_vector(model.cluster_centers_[x],
                                                    topn=topn,
                                                    restrict_vocab=None)
                                                        for x in range(num_clusters)]
    else:
        pca_values = np.stack(key_vectors['pca_values'].to_numpy().ravel())
        # Infer the vector width rather than assuming 3 components
        pca_values = pca_values.reshape(len(key_vectors), -1)
        words = key_vectors['words'].to_numpy()

        word_vec_list = []
        for x in range(num_clusters):
            similarities = cosine_similarity(pca_values, model.cluster_centers_[x].reshape(1, -1)).flatten()

            # Most similar first, honouring `batch`. The previous argpartition
            # always returned the same unsorted top 25, so any batch > 0 was
            # empty after slicing.
            top_indices = np.argsort(similarities)[::-1][:topn]

            word_vec_list.append(list(zip(words[top_indices], similarities[top_indices])))

    # Shape (rows, num_clusters, 2); numpy stores both term and similarity as strings
    cluster_values = np.array(list(zip(*[x[(batch_size * batch):] for x in word_vec_list])))
    if cluster_values.size == 0:
        print('No further samples available for this batch')
        return

    cluster_cols = [f'Cluster {x}' for x in range(num_clusters)]

    # Terms seen once are unique to a cluster; the rest span multiple clusters
    term_freq, counts = np.unique(cluster_values[..., 0].ravel(), return_counts=True)
    unique_terms = set(term_freq[counts == 1])

    # Separate unique from duplicate terms
    unique_cluster_vals = [[] for _ in range(num_clusters)]
    shared_cluster_vals = defaultdict(lambda : [0] * num_clusters)

    for ix, iy in np.ndindex(cluster_values.shape[:2]):
        term, vec = cluster_values[ix, iy]
        if term in unique_terms:
            unique_cluster_vals[iy].append((term, float(vec)))
        else:
            shared_cluster_vals[term][iy] = float(vec)

    print('Unique Terms from Clusters')
    max_num_unique = max(len(c) for c in unique_cluster_vals)

    # Sort by similarity, drop it, and pad the shorter columns with ''
    unique_cluster_terms = np.array([[val[0] for val in sorted(cluster, key=lambda x: x[1])] +
                                        ['']*(max_num_unique-len(cluster))
                                            for cluster in unique_cluster_vals])

    unique_terms_df = pd.DataFrame(unique_cluster_terms.T, columns=cluster_cols)
    display(unique_terms_df)

    print('\nDuplicate Terms from Clusters')
    if shared_cluster_vals:
        # One row per shared term, one similarity column per cluster
        # (0 where the term was not sampled for that cluster)
        shared_vals_df = pd.DataFrame.from_dict(shared_cluster_vals, orient='index',
                                                    columns=cluster_cols).reset_index()

        display(shared_vals_df)
    else:
        print('\tNo duplicates between clusters')

# V1 - Original method

In [None]:
def v1_SentimentGenerator(word_vectors, tweets_df=None):
    """Cluster the word embeddings with KMeans (original, row-by-row method).

    Parameters
    ----------
    word_vectors
        Trained word vectors exposing ``vectors``, ``index_to_key`` and
        lookup by word.
    tweets_df : optional
        Unused by the active code; only the commented-out scoring below reads it.

    Returns
    -------
    words_df : pandas.DataFrame
        Columns ``words``, ``vectors`` and ``predicted_cluster``.
    cluster_model : KMeans
        The fitted model.
    """
    kmeans_settings = dict(n_clusters=3, max_iter=1000, n_init=50, random_state=42)
    cluster_model = KMeans(**kmeans_settings).fit(X=word_vectors.vectors.astype('double'))

    # cluster_sentiment_defs = setClusterSentiment(word_vectors, cluster_model, sentiment_map, version=0)

    def predict_single(vector):
        # predict() wants a 2-D input and returns a length-1 array
        return cluster_model.predict([np.array(vector)])[0]

    # One row per vocabulary word, with its embedding and predicted cluster
    words_df = pd.DataFrame(word_vectors.index_to_key, columns=['words'])
    words_df['vectors'] = words_df.words.apply(lambda word: word_vectors[str(word)])
    words_df['predicted_cluster'] = words_df.vectors.apply(predict_single)

    # words_df['cluster_value'] = [cluster_sentiment_defs[i] for i in words_df.cluster]

    # # Calculate proximity of words in each vector
    # calc_vector_nearness = lambda x: 1 / (cluster_model.transform([x.vectors]).min())
    # words_df['closeness_score'] = words_df.apply(calc_vector_nearness, axis=1)
    # words_df['sentiment_coeff'] = words_df.closeness_score * words_df.cluster_value

    # # Map sentiment encodings
    # words_df["sentiment"] = words_df["cluster_value"].map(sentiment_map)

    # words_cluster_dict = dict(zip(words_df.words, words_df.cluster_value))    

    # def getSentiment(row):
    #     words_list = row['clean_tweet_words']
    #     total = sum(int(words_cluster_dict.get(word, 0)) for word in words_list)
    #     avg = total / len(words_list)
    #     return -1 if (avg < -0.15) else 1 if (avg > 0.15) else 0

    # # Add sentiment column (integer values)
    # tweets_df["sentiment_val"] = tweets_df.apply(getSentiment, axis=1)
    # # Map integer sentiment to word value
    # tweets_df["sentiment"] = tweets_df["sentiment_val"].map(sentiment_map)

    return words_df, cluster_model

# Fit the V1 model on the trained embeddings and preview the per-word clusters.
og_cluster_df, og_model = v1_SentimentGenerator(WORD_VECS)
og_cluster_df.head()

# V2 - No more row-by-row `apply()`s

In [None]:

########################## Helper Functions ##########################

def explodeAndPad(series_group, cluster_dims):
    """Turn one cluster's Series of vectors into a zero-padded 2-D array.

    Used for transforming the predicted_cluster groups from the words dataframe.

    Parameters
    ----------
    series_group : pandas.Series
        One vector per element (all vectors of a single cluster).
    cluster_dims : sequence of int
        Sample count of every cluster. Its maximum sets the number of output
        rows and its length sets the number of output columns.

    Returns
    -------
    numpy.ndarray
        Shape ``(max(cluster_dims), len(cluster_dims))``; the group's vectors
        fill the leading rows, the rest stays zero.
    """
    group_rows = pd.DataFrame(series_group.tolist(), series_group.index).to_numpy()
    n_rows, n_cols = max(cluster_dims), len(cluster_dims)
    padded = np.zeros((n_rows, n_cols))
    padded[:len(group_rows)] = group_rows
    return padded


def generateClusterQueries(cluster_matrix):
    """Re-stack a sequence of 2-D cluster slabs into a single 3-D array.

    Each slab gets a leading unit axis, the slabs are collected into one
    array, and that unit axis is squeezed away again, so the result has the
    same shape and values as a 3-D ``cluster_matrix``.
    """
    stacked = np.array([cluster_matrix[idx][np.newaxis, :, :]
                            for idx in range(len(cluster_matrix))])
    return np.squeeze(stacked, axis=1)


# Utilizes the euclidean distance algorithm to find the distance between two matrices
def calcDistance(mat_A, mat_B):
    """Apply euclidean3d between ``mat_A`` and ``mat_B`` via np.apply_over_axes.

    Only two input combinations are handled: both arguments 3-D, or both 2-D.
    For any other combination no branch runs and the function returns None.

    np.apply_over_axes calls the helper once per listed axis (0, then 1) and
    feeds each result into the next call, so euclidean3d runs twice and the
    second call compares ``mat_A`` against the output of the first.
    NOTE(review): confirm that this chaining is intended and not a single
    distance computation.
    """

    def applyEuclidean3d(subset, axis):
        # `axis` is required by np.apply_over_axes but is not used here.
        distance = euclidean3d(mat_A, subset).T[np.newaxis, :, :]
        # NOTE(review): the repeat count 3 is hard-coded; presumably the number
        # of clusters - confirm before using a different cluster count.
        return np.repeat(distance, 3, 0)
    
    if mat_A.ndim == 3 and mat_B.ndim == 3:
        return np.apply_over_axes(applyEuclidean3d, mat_B, (0,1))

    elif mat_A.ndim == 2 and mat_B.ndim == 2:
        # mat_A gets a middle unit axis so it is 3-D when passed to euclidean3d
        return np.apply_over_axes(
                    ( lambda subset, _: euclidean3d(mat_A[:,None,:], subset).T ), 
                        mat_B, (0, 1))
  
        
def euclidean3d(mat_A, mat_B):
    """Euclidean distance along the last axis of two [i x j x k] matrices.

    The operands are subtracted (with numpy broadcasting) and the squared
    differences are summed over ``k`` with numpy.einsum.

    Returns an array of shape ``(i, j)``.
    """
    delta = mat_A - mat_B
    squared_norms = np.einsum('ijk,ijk->ij', delta, delta)
    return np.sqrt(squared_norms)


def scale3d(mat_A, scaler=None, restore=False):
    """Apply a scaler to a matrix of any rank by flattening its leading axes.

    Parameters
    ----------
    mat_A : numpy.ndarray
        Input matrix; the last axis is treated as the feature axis.
    scaler : object with ``fit_transform``, optional
        Defaults to a new ``StandardScaler()`` when None.
    restore : bool, default False
        When True the scaled data is reshaped back to ``mat_A.shape``;
        otherwise the flattened ``(-1, n_features)`` array is returned.
    """
    # Test against None explicitly: a caller-supplied scaler must be used even
    # if the object happens to evaluate as falsy.
    if scaler is None:
        scaler = StandardScaler()
    original_shape = mat_A.shape
    flattened_mat = mat_A.reshape(-1, original_shape[-1])
    scaled_mat = scaler.fit_transform(flattened_mat)
    return scaled_mat.reshape(original_shape) if restore else scaled_mat


# Various numpy.einsum implementations of matrix multiplication

def matrixDot2d(mat_A, mat_B):
    """Plain 2-D matrix product: (i, j) x (j, k) -> (i, k)."""
    return np.einsum('ij,jk->ik', mat_A, mat_B)


def matrixDot3d(mat_A, mat_B):
    """Per-slab product contracting the middle axis: (i, j, k), (i, j, l) -> (i, k, l)."""
    return np.einsum('ijk,ijl->ikl',mat_A, mat_B)


def matrixMult2d(mat_A, mat_B):
    """Plain 2-D matrix product: (i, j) x (j, k) -> (i, k). Same contraction as matrixDot2d."""
    return np.einsum('ij, jk -> ik', mat_A, mat_B)


def matrixMult3d(mat_A, mat_B):
    """Batched matrix product: (n, m, k) x (n, k, j) -> (n, m, j)."""
    return np.einsum('nmk,nkj->nmj', mat_A, mat_B)


def matrixMult3d_2d(mat_A, mat_B):
    """Scale each (i, j, k) entry by the matching (i, k) entry; output keeps shape (i, j, k)."""
    return np.einsum('ijk,ik->ijk', mat_A, mat_B)

In [None]:
def v2_SentimentGenerator(word_vectors, tweets_df=None):
    """Cluster PCA-reduced word embeddings with KMeans, without row-by-row applies.

    Parameters
    ----------
    word_vectors
        Trained word vectors exposing ``vectors`` and ``index_to_key``.
    tweets_df : optional
        Currently unused.

    Returns
    -------
    words_df : pandas.DataFrame
        One row per word with the columns ``words``, ``vectors``
        (standardized embedding), ``pca_values`` (PCA coordinates),
        ``predicted_cluster`` (KMeans label) and ``cluster_space``
        (distance to every cluster centre, as returned by ``KMeans.transform``).
    kmeans_model : KMeans
        The fitted model.
    """
    n_clusters = 3
    # Kept separate from n_clusters although both are 3. The debugging cells
    # further down assume the two are equal.
    n_components = 3

    kmeans_params = {
        "n_clusters": n_clusters,
        "max_iter": 400,
        # Fixed seed: without it the cluster ids change between runs, which
        # invalidates any sentiment labels assigned to them.
        "random_state": 42
        # "n_init": 50,
    }

    # Standardize features (zero mean, unit variance). The earlier version ran
    # sklearn's scale() and then StandardScaler, which repeats the same step.
    X_std = StandardScaler().fit_transform(word_vectors.vectors.astype('double'))

    # Using PCA to reduce the embeddings to a few components
    word_vectors_pca = PCA(n_components=n_components, random_state=42)
    pca_matrix = word_vectors_pca.fit_transform(X_std)

    # Initialize dataframe that will hold all calculated output during clustering
    words_df = pd.DataFrame({
                            'words': word_vectors.index_to_key,
                            'vectors': list(X_std),
                            'pca_values': list(pca_matrix)
                        })

    # Define clustering model (KMeans algo)
    kmeans_model = KMeans(**kmeans_params).fit(pca_matrix)

    # Generate cluster predictions for each sample (word) in one call
    words_df['predicted_cluster'] = kmeans_model.predict(pca_matrix)

    # Translate each sample into "cluster-space" (distance to every centre),
    # again for all samples at once instead of one transform() call per row
    words_df['cluster_space'] = list(kmeans_model.transform(pca_matrix))

    # NOTE: the similarity/confidence scoring that used to be sketched here as
    # commented-out code is prototyped in the "Begin Debugging" cells below.

    return words_df, kmeans_model


In [None]:
# Run the V2 pipeline; word_predictions holds the per-word PCA values,
# predicted cluster and cluster-space distances.
word_predictions, cluster_model = v2_SentimentGenerator(WORD_VECS)

In [None]:
# Number of words assigned to each cluster
word_predictions.predicted_cluster.value_counts()

### Begin Debugging

In [None]:
            ##### Define independent constants #####

# Cluster data specs/sizes
# cluster_sample_sizes[c] = number of words whose predicted cluster is c
cluster_sample_sizes = np.bincount(word_predictions.predicted_cluster)
n_clusters, n_features = cluster_model.cluster_centers_.shape
n_samples = sum(cluster_sample_sizes)
smallest_cluster, largest_cluster = min(cluster_sample_sizes), max(cluster_sample_sizes)
# Target shape of the padded per-cluster tensor (every cluster padded to the largest one)
cluster_space_dims = (n_clusters, largest_cluster, n_features)


# Setup a matrix for comparisons to the coordinates of each cluster's center
# (a unit middle axis is inserted: (n_clusters, 1, n_features))
cluster_space_centers = cluster_model.cluster_centers_[:, np.newaxis, :]


# Boolean mask used for filtering and reshaping:
# entry (x, y) is True when y is smaller than the sample count of cluster x
masking_func = lambda x, y: y < cluster_sample_sizes[x]
cluster_space_mask = np.ma.fromfunction(masking_func, (n_clusters, largest_cluster), dtype=int)


# Utility function to undo the transformation necessary for scaling 3d matrices
restoreMat3d = lambda mat: mat.reshape(*cluster_space_dims)

In [None]:
### Build a transformation matrix with the cluster-space data 

# Group the data by the predicted cluster value; explodeAndPad turns each
# group into a 2-D array zero-padded to the largest cluster size
cluster_groups = word_predictions.groupby(by='predicted_cluster').cluster_space \
                                    .apply(lambda x: explodeAndPad(x, cluster_sample_sizes))


# Convert the grouped data into a usable numpy array (one padded slab per cluster)
cluster_space_matrix = np.stack(cluster_groups.to_numpy().ravel())


# Structure query data. Contains every sample separated by its predicted cluster value
cluster_queries = generateClusterQueries(cluster_space_matrix)


# Compute the euclidean distance between all samples and the entire space
sample_distances = calcDistance(cluster_queries, cluster_space_matrix)


# Shape report for the intermediate results above
description = f'''
cluster_groups:     {cluster_groups.shape}
cluster_space_mat   {cluster_space_matrix.shape}
cluster_queries:    {cluster_queries.shape}
sample_distances:   {sample_distances.shape}
'''

print(description)

In [None]:

                ########## Basic Stats ##########

### Averages ###

# NOTE(review): cluster_space_matrix is zero-padded by explodeAndPad up to the
# largest cluster, so the axis-1 statistics below include the padding rows of
# the smaller clusters. cluster_space_mask looks intended for excluding them -
# confirm whether it should be applied here.

# Average the distances acquired above to produce approximate cluster centers
avg_observed_distances = np.mean(sample_distances, axis=1)

# Using the modeled cluster space, find the average "distances" per cluster (all will be [3,3])
avg_model_distances = np.mean(cluster_space_matrix, axis=1)


### Standard Deviation ###

# Determine the std for each cluster
observed_distances_std = np.std(sample_distances, axis=1)

# Compute the std for each cluster using the cluster-space
model_distances_std = np.std(cluster_space_matrix, axis=1)


### Variance ###

# Determine the var of each cluster from the observed sample distances
observed_distances_var = np.var(sample_distances, axis=1)

# Determine the var of each cluster from the model's cluster-space
model_distances_var = np.var(cluster_space_matrix, axis=1)


description = f'''
Averages:
    observed/sample intra-population distances:     {avg_observed_distances.shape}
    model/calculated intra-population distances:    {avg_model_distances.shape}

Standard Devs:
    observed/sample intra-population distances:     {observed_distances_std.shape}
    model/calculated intra-population distances:    {model_distances_std.shape}

Variance:
    observed/sample intra-population distances:     {observed_distances_var.shape}
    model/calculated intra-population distances:    {model_distances_var.shape}
'''

print(description)

In [None]:

                    ######## Differencing ########


# Get the distance of each sample to each cluster's center (a very strong indication of similarity)
model_center_diffs = calcDistance(cluster_queries, cluster_space_centers)

# Get the distance of every sample from the average of their combined distances
# NOTE(review): this uses avg_model_distances even though the result is named
# "observed" - confirm which average is intended.
observed_center_diffs = calcDistance(cluster_queries, np.expand_dims(avg_model_distances, axis=1))

# Calculate the difference in average distances between the observed/calculated version and the one provided 
# by the clustering model (both inputs are 2-D, so calcDistance takes its 2-D branch)
avg_distance_diffs = calcDistance(avg_model_distances, avg_observed_distances)



description = f'''
model_center_diffs:     {model_center_diffs.shape}
observed_center_diffs:  {observed_center_diffs.shape}              
avg_distance_diffs      {avg_distance_diffs.shape}
'''

print(description)

In [None]:

# Set up matrices for the following calculations

# Invert the sample distances to each cluster (Moore-Penrose pseudo-inverse per slab)
center_similarity = np.linalg.pinv(model_center_diffs)


# Utilize KMeans' inertia property
# Share of all samples that fell into each cluster
samples_distribution = [(x/n_samples) for x in cluster_sample_sizes]

# The model's inertia split across clusters in proportion to their size
cluster_distance_metric = [x*cluster_model.inertia_ for x in samples_distribution]


# Build an inertia value by tracing KMeans' implementation:
# inertia = sum over samples of the squared distance to the closest centre.
samples_df = np.stack(word_predictions.pca_values.to_numpy().ravel())

# `cluster_space` already holds each sample's distance to every centre, so
# the row minimum is the distance to the closest one. The previous code took
# the minimum of the PCA coordinates in `samples_df`, which are positions,
# not distances, so the result was unrelated to inertia.
sample_center_distances = np.stack(word_predictions.cluster_space.to_numpy().ravel())
shortest_squared_distances = np.square(sample_center_distances.min(axis=1))
observed_inertia = np.sum(shortest_squared_distances)

observed_distance_metric = [x*observed_inertia for x in samples_distribution]


description = f'''
center_similarity:          {center_similarity.shape}
model_intertia:             {cluster_model.inertia_}
cluster_distance_metric:    {cluster_distance_metric}
observed_inertia:           {observed_inertia}
observed_distance_metric:   {observed_distance_metric}
'''

print(description)

In [None]:

### Define distance/observable

# Calculate the model's weights per-feature & per-cluster using the distance of each sample and each cluster's location

inv_observed_dists_std = np.expand_dims(np.linalg.pinv(observed_distances_std), axis=1)
observed_distance_weights = matrixMult3d(sample_distances, inv_observed_dists_std)


# Determine the distance between the "sample-space" and the "cluster-space"
distances_diff = calcDistance(sample_distances, cluster_space_matrix)

inv_observed_dists_var = np.expand_dims(np.linalg.pinv(observed_distances_var), axis=1)
# NOTE(review): sample_distance_error_1 is assigned twice below; the first
# value is only an intermediate for sample_distance_error_2.
sample_distance_error_1 = matrixMult3d(observed_center_diffs, inv_observed_dists_var)
sample_distance_error_2 = matrixDot3d(sample_distance_error_1, observed_distance_weights)
sample_distance_error_1 = matrixMult3d(np.moveaxis(center_similarity, 2, 1), sample_distance_error_2)
sample_distance_error = calcDistance(distances_diff, sample_distance_error_1)


# # Build an error score for the model's transformation
# NOTE(review): this recomputes inv_observed_dists_var with the same
# expression as above, and the value is not used again in this cell.
inv_observed_dists_var = np.expand_dims(np.linalg.pinv(observed_distances_var), axis=1)
weighted_observed_dists_err = matrixMult3d(observed_distance_weights, np.expand_dims(avg_distance_diffs, 1))
# NOTE(review): reshape(3, 3, -1) hard-codes 3; presumably n_clusters/n_features - confirm.
weighted_observed_dists_err += matrixMult3d(sample_distances, sample_distance_error.reshape(3, 3, -1))


description = f'''
observed_distance_weights:      {observed_distance_weights.shape}
distances_diff:                 {distances_diff.shape}
sample_distance_error:          {sample_distance_error.shape}
weighted_observed_dists_err:    {weighted_observed_dists_err.shape}
'''

print(description)

In [None]:

### Prediction weights ###

# Build weights based on the spread of the clusters and how "wide" of an area they cover
inv_model_center_vars = np.expand_dims(np.linalg.pinv(model_distances_var), axis=1)

tmp_global_weight = matrixMult3d(model_center_diffs, inv_model_center_vars)
global_prediction_weight = matrixDot3d(tmp_global_weight, weighted_observed_dists_err)


inv_center_similarity = np.linalg.pinv(center_similarity)
prediction_weights_1 = matrixMult3d(center_similarity, inv_model_center_vars)
prediction_weights_2 = matrixMult3d(inv_center_similarity, np.expand_dims(avg_distance_diffs, 1))
# NOTE(review): reshape(3, 3, -1) hard-codes 3; presumably n_clusters/n_features - confirm.
prediction_weights = matrixDot3d(prediction_weights_1, prediction_weights_2.reshape(3, 3, -1))

### Prediction errors ###

# Get the dot product of the model's cluster's center and the each samples distance from the clusters
t1 = matrixDot3d(cluster_space_centers, model_center_diffs)
# FIXME(review): matrixDot3d takes two arguments; this call passes none and
# raises TypeError, so the cell cannot run as written.
t2 = matrixDot3d()

### Weighted errors ###

# Finally, combine the overall predictions' error and weight to build a confidence score
# FIXME(review): `prediction_error` is not assigned in any cell visible above
# (t1/t2 look like its unfinished inputs).
reciprocal_prediction_err = np.reciprocal(prediction_error)


### Confidence ###

prediction_confidence = matrixDot3d(reciprocal_prediction_err.T, prediction_error)


description = f'''
global_prediction_weight:   {global_prediction_weight.shape}
prediction_weights:         {prediction_weights.shape}
prediction_error:           {prediction_error.shape}
reciprocal_prediction_err:  {reciprocal_prediction_err.shape}
prediction_confidence:      {prediction_confidence.shape}
'''

print(description)

In [None]:
# Store the relative similarities between each cluster (global)

inv_avg_distance_err = np.linalg.pinv(avg_distance_diffs)



# FIXME(review): `weighted_distances_err` is not defined in any cell visible
# above; the closest existing name is `weighted_observed_dists_err` - confirm
# which one is meant.
global_cluster_similarity = matrixMult2d(inv_avg_distance_err, weighted_distances_err)



# Maintain a similarity score with each cluster for every sample (narrow)

inv_distance_stds = np.linalg.pinv(observed_distances_std)

sample_similarity = matrixMult3d_2d(sample_distances, inv_distance_stds)
sample_similarity += matrixMult2d(inv_distance_stds, distances_diff)[...,np.newaxis]


# Shapes of the results, shown as the cell output
distances_diff.shape, global_cluster_similarity.shape, sample_similarity.shape

In [None]:
# Collect the three score arrays computed in the debugging cells above
scores = {
    'confidence_score': prediction_confidence,
    'global_similarity': global_cluster_similarity,
    'sample_similarity': sample_similarity
}

In [None]:
# Print the name and shape of every score array
for k, v in scores.items():
    print(k)
    print(v.shape)
    print('\n------\n')

### End Debugging

## -------------------------------------------------------------------

In [None]:
# FIXME(review): `nd_EuclideanDistance` is not defined in any cell visible
# above (the helpers defined there are calcDistance / euclidean3d), so this
# cell fails on a fresh kernel.
x = cluster_queries[0]

nd_EuclideanDistance(cluster_space_matrix, np.squeeze(x, axis=0))

In [None]:
# Same check as the previous cell for the second cluster.
# FIXME(review): `nd_EuclideanDistance` is not defined in any cell visible above.
x = cluster_queries[1]

nd_EuclideanDistance(cluster_space_matrix, np.squeeze(x, axis=0))

## Similarity Scoring Function

In [None]:
# FIXME(review): `opt1_cluster_df` is not created in any cell visible above.
opt1_cluster_df[['cluster_similarity', 'confidence_scores']].describe()

In [None]:
# First ten confidence scores of the first column.
# FIXME(review): `opt1_cluster_df` is not created in any cell visible above.
np.stack(opt1_cluster_df['confidence_scores'].to_numpy().ravel()).T[0][:10]

In [None]:
# Density plot of the unscaled confidence scores.
# FIXME(review): `opt1_cluster_df` is not created in any cell visible above.
raw_confidence = np.stack(opt1_cluster_df['confidence_scores'].to_numpy().ravel())
sns.kdeplot(data=raw_confidence)

# Score adjustment

In [None]:
def print_stats(arr):
    """Print scipy's ``stats.describe`` summary for every column of ``arr``."""
    for column in arr.T:
        summary = stats.describe(column)
        print(summary)

In [None]:
# Min-max scale the confidence scores into [0, 1], then plot and summarise them.
# FIXME(review): `out` is not defined in any cell visible above.
mm_scaler = MinMaxScaler(feature_range=(0, 1))
mm_scaled = mm_scaler.fit_transform(np.stack(out.confidence_score.to_numpy().ravel()))
sns.kdeplot(data=(mm_scaled))
print_stats(mm_scaled)

In [None]:
# Same comparison with StandardScaler.
# FIXME(review): `out` is not defined in any cell visible above.
std_scaler = StandardScaler()
std_scaled = std_scaler.fit_transform(np.stack(out.confidence_score.to_numpy().ravel()))
sns.kdeplot(data=std_scaled)
print_stats(std_scaled)

In [None]:
# Same comparison with RobustScaler.
# FIXME(review): `out` is not defined in any cell visible above.
rob_scaler = RobustScaler()
rob_scaled = rob_scaler.fit_transform(np.stack(out.confidence_score.to_numpy().ravel()))
sns.kdeplot(data=rob_scaled)
print_stats(rob_scaled)

In [None]:
# Same comparison with MaxAbsScaler.
# FIXME(review): `out` is not defined in any cell visible above.
abs_scaler = MaxAbsScaler()
abs_scaled = abs_scaler.fit_transform(np.stack(out.confidence_score.to_numpy().ravel()))
sns.kdeplot(data=abs_scaled)
print_stats(abs_scaled)

In [None]:
# Two-step scaling: MaxAbsScaler followed by MinMaxScaler into [-1, 1].
# NOTE(review): the first step is named 'robust' but holds a MaxAbsScaler -
# confirm whether RobustScaler was intended or the step name is stale.
# FIXME(review): `out` is not defined in any cell visible above.
pipe = Pipeline([('robust', MaxAbsScaler()), ('minmax', MinMaxScaler(feature_range=(-1,1)))])
pipe_scaled = pipe.fit_transform(np.stack(out.confidence_score.to_numpy().ravel()))
sns.kdeplot(data=pipe_scaled)
print_stats(pipe_scaled)

In [None]:
# Re-plot the min-max scaled scores (repeats the plot and summary of the MinMaxScaler cell)
sns.kdeplot(data=(mm_scaled))
print_stats(mm_scaled)

# Testing

In [None]:
# cluster_lookup_df = words_clusters_df[['cluster_scores', 'cluster_space']]
# grouping_features = ['cluster_scores', 'cluster_space']
# Columns used to score a tweet's words against the clusters
scoring_dimensions = ['cluster_space', 'cluster_similarity', 'confidence_score']
# Word-indexed lookup table of cluster assignments and scores.
# FIXME(review): `out` is not defined in any cell visible above.
words_clusters_df = out[['words', 'predicted_cluster']+scoring_dimensions]
words_clusters_df = words_clusters_df.set_index('words')

def clusterGrouping(row, lookup_df=None):
    """Assign a tweet to the cluster with the smallest average weighted distance.

    Every word of the tweet is looked up in a word-indexed table. Its per-cluster
    values are combined as ``cluster_similarity * cluster_space * confidence_score``,
    the word votes for the cluster with the smallest product, and the winning
    products are averaged per cluster. The cluster with the lowest average wins.

    Parameters
    ----------
    row : object exposing ``clean_tweet_words``
        One tweet (e.g. a DataFrame row); the attribute holds its list of words.
    lookup_df : pandas.DataFrame, optional
        Word-indexed table with the columns ``cluster_space``,
        ``cluster_similarity`` and ``confidence_score``. Defaults to the
        notebook-level ``words_clusters_df``.

    Returns
    -------
    tuple
        ``(closest_cluster, avg_weight)``: the index of the winning cluster and
        its average weighted distance.

    Raises
    ------
    ValueError
        If the tweet contains no words.
    KeyError
        If a word is missing from the lookup index (raised by ``.loc``).
    """
    if lookup_df is None:
        lookup_df = words_clusters_df

    words = row.clean_tweet_words
    if len(words) == 0:
        raise ValueError("clusterGrouping needs at least one word in clean_tweet_words")

    words_search = lookup_df.loc[words]

    # Stack each column on its own. Ravel-ing both columns together interleaves
    # them word by word, so a reshape(2, -1, 3) would not split them apart again.
    # NOTE(review): assumes every cell of these two columns is a per-cluster array.
    distances = np.stack(words_search['cluster_space'].to_numpy())
    similarities = np.stack(words_search['cluster_similarity'].to_numpy())
    # One confidence value per word, broadcast across that word's clusters.
    confidences = words_search['confidence_score'].to_numpy().reshape(-1, 1)

    weighted_distances = similarities * distances * confidences

    # Each word votes for the cluster with its smallest weighted distance.
    closest_clusters = np.argmin(weighted_distances, axis=1)

    n_clusters = weighted_distances.shape[1]
    agg_weights = [[] for _ in range(n_clusters)]
    for cluster, weights in zip(closest_clusters, weighted_distances):
        agg_weights[cluster].append(weights[cluster])

    # Clusters without votes stay NaN so that list position == cluster id ...
    avg_weights = [np.mean(closest_weights) if closest_weights else np.nan for closest_weights in agg_weights]
    # ... and nanargmin skips them (plain argmin would return the first NaN slot).
    closest_cluster = np.nanargmin(avg_weights)

    return closest_cluster, avg_weights[closest_cluster]

In [None]:
# Cleaned word list of the second filtered tweet, used as a manual test input.
sample_words = FILTERED_TWEETS.iloc[1].clean_tweet_words
sample_words

In [None]:
# Look up the scoring columns for each sample word in the word-indexed table.
words_search = words_clusters_df.loc[sample_words][scoring_dimensions]

In [None]:
# Step-by-step walk through the per-tweet cluster scoring for one sample tweet.
words = FILTERED_TWEETS.iloc[1].clean_tweet_words
words_search = words_clusters_df.loc[words][scoring_dimensions]

# Stack each column on its own: 'cluster_scores' is not among scoring_dimensions
# (the column is 'cluster_similarity'), and ravel-ing two columns together
# interleaves them word by word instead of separating them.
# NOTE(review): assumes every cell of these two columns is a per-cluster array.
distances = np.stack(words_search['cluster_space'].to_numpy())
similarities = np.stack(words_search['cluster_similarity'].to_numpy())
confidences = words_search['confidence_score'].to_numpy().reshape(-1, 1)

weighted_distances = similarities * distances * confidences

# NOTE(review): this cell picks the LARGEST weighted value, whereas
# clusterGrouping uses argmin -- confirm which direction is intended.
closest_clusters = np.argmax(weighted_distances, axis=1)

agg_weights = [[] for _ in range(weighted_distances.shape[1])]
for cluster, weights in zip(closest_clusters, weighted_distances):
    agg_weights[cluster].append(weights[cluster])

# Keep one slot per cluster (NaN when no word voted for it) so that the position
# returned below is a real cluster id; dropping empty clusters would shift it.
avg_weights = [np.mean(closest_weights) if closest_weights else np.nan for closest_weights in agg_weights]
closest_cluster = np.nanargmax(avg_weights)

# Visualization

In [None]:
# def kmeans_clustering(Y_sklearn, fitted):
#     """
#     This function will predict clusters on training set and plot the visuals of clusters as well.
#     """

#     plt.scatter(Y_sklearn[:, 0], Y_sklearn[:, 1],c=prediction ,s=50, cmap='viridis') # Plotting scatter plot 
#     centers2 = fitted.cluster_centers_ # It will give best possible coordinates of cluster center after fitting k-means
#     plt.scatter(centers2[:, 0], centers2[:, 1],c='black', s=300, alpha=0.6);
#     # As this can be seen from the figure, there is an outlier as well.
    
# kmeans_clustering(Y_sklearn, fitted)

In [None]:
def get_top_features_cluster(ngrams, X_std, prediction, n_feats):
    """Rank the highest-scoring features inside every predicted cluster.

    Parameters
    ----------
    ngrams : sequence
        Feature names; position ``i`` names column ``i`` of ``X_std``.
    X_std : numpy.ndarray
        Score matrix with one row per sample and one column per feature.
    prediction : numpy.ndarray
        Cluster label of every row of ``X_std``.
    n_feats : int
        Number of top features to keep per cluster.

    Returns
    -------
    list of pandas.DataFrame
        One frame per unique label (in sorted label order) with the columns
        ``features`` and ``score``, ordered from highest to lowest mean score.
    """
    per_cluster_frames = []
    for cluster_label in np.unique(prediction):
        # Rows of X_std that were assigned to this cluster.
        member_rows = np.where(prediction == cluster_label)
        # Mean score of every feature across the cluster's rows.
        mean_scores = np.mean(X_std[member_rows], axis=0)
        # Ascending argsort, reversed, then cut to the n_feats best features.
        top_indices = np.flip(np.argsort(mean_scores))[:n_feats]
        ranked = [(ngrams[idx], mean_scores[idx]) for idx in top_indices]
        per_cluster_frames.append(pd.DataFrame(ranked, columns=['features', 'score']))
    return per_cluster_frames

# Top 100 features per predicted cluster; wv, X_std and prediction come from earlier cells.
dfs = get_top_features_cluster(wv.index_to_key, X_std, prediction, 100)

In [None]:
plt.figure(figsize=(8,6))
# Top 25 rows of the 1st DataFrame. Select the frame first, then its rows:
# dfs[:25] would slice the list of frames and leave every row in the plot.
sns.barplot(x = 'score' , y = 'features', orient = 'h' , data = dfs[0].head(25));

In [None]:
plt.figure(figsize=(8,6))
# Top 25 rows of the 2nd DataFrame. Select the frame first, then its rows:
# dfs[:25] would slice the list of frames and leave every row in the plot.
sns.barplot(x = 'score' , y = 'features', orient = 'h' , data = dfs[1].head(25));

In [None]:
plt.figure(figsize=(8,6))
# Top 25 rows of the 3rd DataFrame. Select the frame first, then its rows:
# dfs[:25] would slice the list of frames and leave every row in the plot.
sns.barplot(x = 'score' , y = 'features', orient = 'h' , data = dfs[2].head(25));

In [None]:
def plot_features(dfs):
    """
    Draw one horizontal bar chart per cluster, side by side in a single figure.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        One frame per cluster with the columns ``features`` (bar labels) and
        ``score`` (bar lengths). An empty list draws nothing.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    if not dfs:
        return  # nothing to plot; avoids creating an empty figure

    fig = plt.figure(figsize=(14,12))
    for i, df in enumerate(dfs):
        # Bar positions are derived per frame so clusters may differ in length.
        positions = np.arange(len(df))
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.set_title("Cluster: "+ str(i), fontsize = 14)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(positions, df.score, align='center', color='#40826d')
        # Pin one tick to every bar before labelling; otherwise the labels are
        # attached to matplotlib's automatic ticks and do not line up with the bars.
        ax.set_yticks(positions)
        ax.set_yticklabels(df.features)
    plt.show()

# Side-by-side bar charts of the per-cluster feature frames collected in dfs.
plot_features(dfs)