In [1]:
import re
import multiprocessing

import pandas as pd
import numpy as np 
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, normalize
import matplotlib.pyplot as plt 
from numpy.linalg import norm
from sklearn.cluster import KMeans, SpectralClustering
import seaborn as sns
import sklearn
sns.set()
from scipy.stats import norm
from sklearn.metrics.pairwise import cosine_similarity

from collections import defaultdict
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.pipeline import Pipeline

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

# Prep nltk library: fetch the stop-word list and the WordNet data that the
# lemmatizer in `load_data` relies on (already-present packages are skipped).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
from scipy.spatial.distance import cdist, euclidean

from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Regexes applied in order to a lower-cased tweet; every match is replaced by a space.
# They are raw strings, so each regex escape needs exactly ONE backslash. The previous
# doubled form (`\\/`, `\\$`, `\\@`) asked for a literal backslash and therefore never
# matched URLs, cashtags or mentions.
tweets_patterns = [
            r"https?://[a-zA-Z0-9@:%._/+~#=?&;-]*",  # URLs
            r"\$[a-zA-Z0-9]*",                       # cashtags such as $TSLA
            r"@[a-zA-Z0-9]*",                        # user mentions
            r'[^a-zA-Z"]'                            # anything that is not a letter or a double quote
        ]

# Same idea for the hashtag column: drop cashtags, then every non-letter character.
hashtag_patterns = [
            r"\$[a-zA-Z0-9]*",
            r'[^a-zA-Z"]'
        ]

# Raw CSV column name -> column name used throughout the notebook.
column_mappings = {
            "date": "date",
            "user_name": "username",
            "retweets": "retweets",
            "text": "tweet",
            "hashtags": "hashtags"
        }

# A tweet is kept in the filtered frame when its cleaned text contains any of these.
filter_words = [
            "ukraine",
            "russia",
            "zelensky"
        ]

# Integer sentiment code -> human-readable label.
sentiment_map = {
            -1: "negative",
            0: "neutral",
            1: "positive"
        }

In [5]:
def load_data(import_version=12, data_dir="../../data"):
    """Read the tweet CSV export, clean it, and return the full and keyword-filtered frames.

    Parameters
    ----------
    import_version : int, default 12
        Version suffix of the ``slava_ukraini_tweets_v{version}.csv`` file to read.
    data_dir : str, default "../../data"
        Directory that holds the CSV export.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``tweets_df``: de-duplicated tweets with the added columns ``clean_tweet_words``
        (list of lemmatized tokens), ``clean_tweet`` (those tokens joined by spaces),
        ``day`` and ``month``.
        ``filtered_tweets_df``: an independent copy limited to rows whose ``clean_tweet``
        contains at least one entry of ``filter_words``.
    """
    import_dest = f"{data_dir}/slava_ukraini_tweets_v{import_version}.csv"
    raw_tweets_df = pd.read_csv(import_dest)

    # Keep only the mapped columns under their notebook names. Rows without tweet text
    # are dropped because the cleaning step calls `.lower()` on every tweet.
    tweets_df = (
        raw_tweets_df[list(column_mappings.keys())]
        .rename(columns=column_mappings)
        .dropna(subset=['tweet'])
        .drop_duplicates(subset='tweet', keep='first')
        .copy()
    )

    lemma = WordNetLemmatizer()
    # A set makes the per-token stop-word test a hash lookup instead of a list scan.
    stop_words = set(stopwords.words("english"))

    def cleanTweet(tweet):
        """Return the lemmatized, stop-word-free tokens of one tweet."""
        tweet = tweet.lower()
        for pattern in tweets_patterns:
            tweet = re.sub(pattern, ' ', tweet)
        # Discard single-character leftovers.
        tweet = ' '.join(w for w in tweet.split() if len(w) > 1)

        # First pass: default (noun) lemmatization.
        noun_lemmas = [lemma.lemmatize(tok) for tok in nltk.wordpunct_tokenize(tweet)
                       if tok not in stop_words]
        clean_tweet = ' '.join(noun_lemmas)

        # Second pass: verb lemmatization; a lemma may itself be a stop word, so filter again.
        return [lemma.lemmatize(tok, nltk.corpus.reader.wordnet.VERB)
                for tok in nltk.wordpunct_tokenize(clean_tweet) if tok not in stop_words]

    def cleanHashtags(hashtags):
        """Lower-case a hashtag string and blank out everything the hashtag patterns match."""
        if hashtags:
            hashtags = hashtags.lower()
            for pattern in hashtag_patterns:
                hashtags = re.sub(pattern, ' ', hashtags)
            hashtags = hashtags.strip()
        return hashtags

    # Clean tweets
    tweets_df['clean_tweet_words'] = tweets_df['tweet'].apply(cleanTweet)
    tweets_df['clean_tweet'] = tweets_df['clean_tweet_words'].apply(' '.join)

    # Clean hashtags. Missing values are blanked first; `astype(str)` alone would turn
    # NaN into the literal text "nan".
    tweets_df["hashtags"] = tweets_df["hashtags"].fillna('').astype(str).apply(cleanHashtags)

    # Convert date to datetime and extract day/month
    tweets_df['date'] = pd.to_datetime(tweets_df['date'])
    tweets_df['day'] = tweets_df['date'].dt.day
    tweets_df['month'] = tweets_df['date'].dt.month

    # Keep only tweets that mention at least one target word.
    keywords_str = '|'.join(re.escape(word) for word in filter_words)
    keyword_mask = tweets_df["clean_tweet"].str.contains(keywords_str, regex=True)
    filtered_tweets_df = tweets_df[keyword_mask].copy()

    return tweets_df, filtered_tweets_df

In [6]:
def build_word_vectors(full_tweets_df, epochs=30):
    """Train a Word2Vec model on the cleaned tweets and return its keyed vectors.

    Parameters
    ----------
    full_tweets_df : pandas.DataFrame
        Must contain a ``clean_tweet_words`` column holding one list of tokens per tweet.
    epochs : int, default 30
        Number of training passes over the corpus.

    Returns
    -------
    The ``wv`` attribute of the trained ``Word2Vec`` model.
    """
    # One token list per tweet.
    row_sentences = list(full_tweets_df["clean_tweet_words"])

    # Detect common phrases (bigrams) and rewrite the sentences with them merged.
    phrases = Phrases(row_sentences)
    bigram = Phraser(phrases)
    sentences = bigram[row_sentences]

    # Leave one core free, but never ask for zero workers (single-core machines).
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    word_vec_model = Word2Vec(workers=num_cores, min_count=1)

    # Establish the vocabulary, then train on the same corpus.
    word_vec_model.build_vocab(sentences)
    word_vec_model.train(sentences, epochs=epochs, total_examples=word_vec_model.corpus_count)

    return word_vec_model.wv

In [7]:
# Load and clean the tweets once, then train the word embeddings on the full
# (unfiltered) cleaned corpus.
CLEAN_TWEETS, FILTERED_TWEETS = load_data()
    
WORD_VECS = build_word_vectors(CLEAN_TWEETS)


In [None]:
def setClusterSentiment(vectors, model, sentiment_mapping, version=0, **kwargs):
    """Interactively assign one sentiment value to each cluster.

    Sample terms are printed through `collectSamples`, then the user is prompted until all
    clusters but the last have a distinct value from `sentiment_mapping`; the last cluster
    receives a remaining unused value. Entering "r" prints a further batch of samples and
    entering "q" calls `sys.exit(1)`.

    Parameters
    ----------
    vectors, model, version
        Passed straight to `collectSamples`.
    sentiment_mapping : dict
        Maps integer sentiment values to their labels.
    **kwargs
        `display_terms` (default 20) is the number quoted in the printed messages.

    Returns
    -------
    list of int
        Sentiment value per cluster, in cluster order.
    """
    display_terms = kwargs.get('display_terms', 20)
    print(f'** Top {display_terms} Similar Word Vectors By Cluster **\n')

    collectSamples(key_vectors=vectors, model=model, version=version)

    # Explain the accepted answers.
    legend = ', '.join(f'{k} = {v}' for k, v in sentiment_mapping.items())
    print(f"\nLabel each cluster: {legend} (\"r\" for new samples, \"q\" to exit)")

    allowed_values = [int(key) for key in sentiment_mapping.keys()]
    labels_to_ask = len(sentiment_mapping) - 1
    assigned = []
    batch_number = 0

    while len(assigned) < labels_to_ask:
        answer = input(f'Cluster {len(assigned)} value:')

        if answer == 'q':
            import sys
            sys.exit(1)

        if answer == 'r':
            print(f'\n\nGenerating next {display_terms} samples...\n')
            batch_number += 1
            collectSamples(key_vectors=vectors, model=model, batch=batch_number, version=version)
            print('Current state:', assigned)
            print('Setting cluster:', len(assigned))
            continue

        # Anything else has to parse as an integer.
        try:
            value = int(answer)
        except ValueError:
            print(f'Must input a number in range {allowed_values}. Press q to exit')
            continue

        if value in assigned:
            print('Already used this sentiment value.')
        elif value not in allowed_values:
            print(f'Value not in provided sentiment mapping: {allowed_values}')
        else:
            assigned.append(value)
            print(f'Set cluster {len(assigned)-1} to {value} ({sentiment_mapping[value]})')

    # The final cluster takes a value that has not been handed out yet.
    leftover = (set(sentiment_mapping.keys()) - set(assigned)).pop()
    assigned.append(leftover)
    print(f'Set cluster {len(assigned)-1} to {leftover} ({sentiment_mapping[leftover]})')

    return assigned



def collectSamples(key_vectors, model, batch=0, version=0):
    """Display the terms closest to each cluster centre, split into unique and shared terms.

    Parameters
    ----------
    key_vectors
        ``version=0``: an object exposing ``similar_by_vector(vector, topn=..., restrict_vocab=...)``.
        ``version=1``: a DataFrame with a ``words`` column and a ``pca_values`` column that
        holds one vector per row.
    model
        Fitted clustering model exposing ``cluster_centers_``.
    batch : int, default 0
        Zero-based batch index; batch ``b`` shows ranks ``25*b`` to ``25*(b+1) - 1``.
    version : {0, 1}, default 0
        Selects how the nearest terms are looked up (see ``key_vectors``).

    Raises
    ------
    ValueError
        If ``version`` is neither 0 nor 1.
    """
    batch_size = 25
    num_clusters = model.cluster_centers_.shape[0]
    top_n = batch_size * (batch + 1)

    if version == 0:
        word_vec_list = [key_vectors.similar_by_vector(center, topn=top_n, restrict_vocab=None)
                         for center in model.cluster_centers_]
    elif version == 1:
        pca_values = np.stack(key_vectors['pca_values'].to_numpy())
        words = key_vectors['words'].to_numpy()

        word_vec_list = []
        for center in model.cluster_centers_:
            similarities = cosine_similarity(pca_values, center.reshape(1, -1)).flatten()
            # Rank best-first and keep enough rows for the requested batch. A fixed
            # top-25 would leave nothing to slice once `batch` is greater than 0.
            ranked = np.argsort(similarities)[::-1][:top_n]
            word_vec_list.append(list(zip(words[ranked], similarities[ranked])))
    else:
        raise ValueError(f'Unsupported version {version!r}; expected 0 or 1')

    # Keep only this batch's slice. Resulting shape: (rank, cluster, [term, score]), as strings.
    cluster_values = np.array(list(zip(*[x[(batch_size * batch):] for x in word_vec_list])))
    if cluster_values.size == 0:
        print('No further samples available.')
        return
    cluster_cols = [f'Cluster {x}' for x in range(num_clusters)]

    # A term is "unique" when it shows up in exactly one cluster's sample.
    terms, counts = np.unique(cluster_values[:, :, 0], return_counts=True)
    unique_terms = set(terms[counts == 1].tolist())

    # Separate unique from duplicate terms
    unique_cluster_vals = [[] for _ in range(num_clusters)]
    shared_cluster_vals = defaultdict(lambda: [0] * num_clusters)

    for ix, iy in np.ndindex(cluster_values.shape[:2]):
        term, vec = cluster_values[ix, iy]
        if term in unique_terms:
            unique_cluster_vals[iy].append((term, float(vec)))
        else:
            shared_cluster_vals[term][iy] = float(vec)

    print('Unique Terms from Clusters')
    max_num_unique = max(len(c) for c in unique_cluster_vals)

    # Order each column by score (ascending), drop the score, pad columns to equal length.
    unique_terms_df = pd.DataFrame({
        col: [term for term, _ in sorted(cluster, key=lambda pair: pair[1])]
             + [''] * (max_num_unique - len(cluster))
        for col, cluster in zip(cluster_cols, unique_cluster_vals)
    }, columns=cluster_cols)
    display(unique_terms_df)

    print('\nDuplicate Terms from Clusters')
    if shared_cluster_vals:
        # One row per shared term, one score column per cluster (0 where the term is absent).
        shared_vals_df = pd.DataFrame.from_dict(shared_cluster_vals, orient='index',
                                                columns=cluster_cols).reset_index()
        display(shared_vals_df)
    else:
        print('\tNo duplicates between clusters')

## Original Method

In [None]:
def OG_cluster(word_vectors, tweets_df=None):
    """Cluster the raw word embeddings into three KMeans groups.

    Parameters
    ----------
    word_vectors
        Keyed vectors exposing ``vectors`` (embedding matrix) and ``index_to_key``
        (the words, in the same row order).
    tweets_df : optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple of (pandas.DataFrame, KMeans)
        ``words_df`` with columns ``words``, ``vectors`` and ``predicted_cluster``,
        and the fitted model.

    Notes
    -----
    Mapping clusters to sentiment values and scoring tweets is not implemented here.
    """
    cluster_model = KMeans(n_clusters=3, max_iter=1000, n_init=50, random_state=42)

    embedding_matrix = word_vectors.vectors.astype('double')
    cluster_model = cluster_model.fit(X=embedding_matrix)

    # One row per vocabulary word; `list(...)` stores each embedding as a single cell.
    words_df = pd.DataFrame({
        'words': word_vectors.index_to_key,
        'vectors': list(word_vectors.vectors),
    })
    # A single vectorised predict replaces one predict() call per word.
    words_df['predicted_cluster'] = cluster_model.predict(embedding_matrix)

    return words_df, cluster_model

og_cluster_df, og_model = OG_cluster(WORD_VECS)
og_cluster_df.head()

In [None]:
# Number of vocabulary words assigned to each KMeans cluster.
og_cluster_df['predicted_cluster'].value_counts()

In [None]:
# Preview of the word/cluster table (same call as at the end of the OG_cluster cell).
og_cluster_df.head()

## Option \#1

In [None]:
df = pd.DataFrame(WORD_VECS.vectors).astype('double')

# Standardize every embedding dimension to zero mean and unit variance.
# `preprocessing.scale` and `StandardScaler` perform the same standardization, so
# chaining them only repeated the work; one pass is enough.
X_std = StandardScaler().fit_transform(df)

# Kept as an alias because the previous version of this cell exposed both names.
X_scaled = X_std

In [None]:
# Fit a 3-component PCA on the standardized embeddings. `fit` returns the estimator
# itself, so both names refer to the one fitted model; previously `base_pca` was never
# fitted and a second PCA was built alongside it.
base_pca = PCA(n_components=3)
word_vectors_pca = base_pca.fit(X_std)

In [None]:
# Project the standardized embeddings onto the fitted principal components.
pca_vals = word_vectors_pca.transform(X_std)

In [None]:
# Insert a singleton middle axis, giving shape (len(X_std), 1, -1), and show it.
z = X_std.reshape(len(X_std), 1, -1)
z.shape

In [None]:
# NOTE(review): if `z` is the reshaped array from the previous cell, z[0] has a leading
# axis of length 1, so `[:20]` returns that single row rather than 20 values.
z[0][:20]

In [None]:
# First entry of `z`.
z[0]

In [None]:
# Number of features per standardized vector. `std_df` only exists in commented-out
# code, so the width is read from `X_std` directly.
X_std.shape[1]

In [None]:
def euclidean_dist(word_vectors, query_vector):
    """Return the pairwise Euclidean distances between two sets of row vectors.

    Both arguments are 2-D array-likes with the same number of columns. The result has one
    row per row of `word_vectors` and one column per row of `query_vector`.
    """
    distances = cdist(word_vectors, query_vector, metric='euclidean')
    return distances

In [None]:

ACCEPTED_DISTANCE = 0.1


def Opt1_cluster(word_vectors, tweets_df=None, n_clusters=3, n_components=3, random_state=42):
    """Standardize the embeddings, reduce them with PCA and cluster the result with KMeans.

    Parameters
    ----------
    word_vectors
        Keyed vectors exposing ``vectors`` (embedding matrix) and ``index_to_key``
        (the words, in the same row order).
    tweets_df : optional
        Currently unused; kept so existing callers keep working.
    n_clusters : int, default 3
        Number of KMeans clusters.
    n_components : int, default 3
        Number of principal components kept before clustering.
    random_state : int, default 42
        Seed for KMeans so repeated runs give the same clusters (the same value
        ``OG_cluster`` uses).

    Returns
    -------
    tuple of (pandas.DataFrame, KMeans)
        ``words_df`` with one row per word and the columns ``words``, ``vectors``
        (standardized embedding), ``pca_values`` (PCA projection), ``predicted_cluster``
        and ``cluster_space`` (distance to every cluster centre), plus the fitted model.

    Notes
    -----
    Per-word similarity/confidence scoring is not computed here; later cells in the
    notebook prototype it step by step.
    """
    # A single standardization pass: zero mean and unit variance per embedding dimension.
    X_std = StandardScaler().fit_transform(word_vectors.vectors.astype('double'))

    pca_values = PCA(n_components=n_components).fit(X_std).transform(X_std)

    # DataFrame columns must be 1-dimensional, so each row vector is stored as one cell.
    words_df = pd.DataFrame({
        'words': word_vectors.index_to_key,
        'vectors': list(X_std),
        'pca_values': list(pca_values),
    })

    kmeans = KMeans(n_clusters=n_clusters, max_iter=400, random_state=random_state)
    fit_kmeans = kmeans.fit(pca_values)

    # Vectorised calls over the whole matrix instead of one call per word.
    words_df['predicted_cluster'] = fit_kmeans.predict(pca_values)
    words_df['cluster_space'] = list(fit_kmeans.transform(pca_values))

    return words_df, fit_kmeans


opt1_cluster_df, opt1_model = Opt1_cluster(WORD_VECS)
opt1_cluster_df.head()

In [None]:
# Collect each cluster's `pca_values` entries as one array per group, then reverse the
# group order. NOTE(review): with sort=False the groups appear in first-seen order, so
# after np.flip a position in `x` need not equal the cluster label — confirm before
# indexing by cluster id.
x = np.flip(opt1_cluster_df.groupby(['predicted_cluster'], sort=False) \
                                .pca_values.apply(lambda x: np.array(x)).to_numpy())

In [None]:
# First 20 entries of the first group in `x`.
x[0][:20]

In [None]:
# For every cluster (in label order), collect its members' distance to that cluster's own
# centre as a column matrix. This is built straight from `opt1_cluster_df` because
# `grouped_cluster_space` is only defined in a later cell, and the cluster label (not the
# group position) picks the distance column.
cluster_space_matricies = [
    np.stack(members.to_numpy())[:, cluster].reshape(-1, 1)
    for cluster, members in opt1_cluster_df.groupby('predicted_cluster', sort=True).cluster_space
]

In [None]:
# Use the first word's row as the sample for the step-by-step cells below.
row = opt1_cluster_df.iloc[0]

In [None]:
# One 2-D array per cluster, ordered by cluster label, holding the `cluster_space`
# vectors of that cluster's members. Sorting by label (instead of first-seen order
# followed by a flip) keeps list position i equal to cluster i.
grouped_cluster_space = [
    np.stack(members.to_numpy())
    for _, members in opt1_cluster_df.groupby('predicted_cluster', sort=True).cluster_space
]
# For cluster i, keep each member's distance to centre i as a column matrix.
cluster_space_matricies = [members[:, i].reshape(-1, 1) for i, members in enumerate(grouped_cluster_space)]


In [None]:
# First 20 `cluster_space` vectors of the first group.
grouped_cluster_space[0][:20]

In [None]:
# Element 0 of each `cluster_space` vector in the first group (first 20 shown).
np.array(list(map(lambda r: r[0], grouped_cluster_space[0])))[:20]

In [None]:
# Element 0 of each `cluster_space` vector in the second group (first 20 shown).
np.array(list(map(lambda r: r[0], grouped_cluster_space[1])))[:20]

In [None]:
# Turn the sample row's cluster_space vector into one 1x1 matrix per entry so that each
# entry can be paired with the matching column matrix in `cluster_space_matricies`.
query_vec = row.cluster_space.reshape(-1, 1, 1)

# zip() returns a one-shot iterator; it is consumed by the comprehension below.
zipped_cluster_query = zip(cluster_space_matricies, query_vec)

# For each pair: distance between every value in the group's column and the row's value.
cluster_distances = [euclidean_dist(cluster_space, query) for (cluster_space, query) 
                        in zipped_cluster_query] 

In [None]:
# The sample row's `cluster_space` vector.
row.cluster_space 

In [None]:
# Mean of the non-zero distances per group; exact zeros are excluded by `d != 0`.
calc_dist_to_clusters = [np.mean(d[d != 0]) for d in cluster_distances] # between this sample and all other clusters
# Distance between each model centre and the row's cluster_space vector.
# NOTE(review): `cluster_centers_` live in the space the model was fitted on, while
# `cluster_space` holds distances to the centres — confirm this comparison is intended.
predict_dist_to_clusters = euclidean_dist(opt1_model.cluster_centers_, row.cluster_space.reshape(1, -1)).flatten()


In [None]:
# Coordinates of the fitted cluster centres.
opt1_model.cluster_centers_

In [None]:
# Distances computed against the model centres in the cell above.
predict_dist_to_clusters

In [None]:
# Mean non-zero distance per group computed in the cell above.
calc_dist_to_clusters

In [None]:
# Scratch check of element-wise arithmetic between a plain list and an ndarray.
# NOTE(review): this rebinds `z`, which an earlier cell used for the reshaped X_std.
a = [1, 2, 2]
b = np.array([2, 3, 4])

z = abs(a - b)
z += 2
z


In [None]:
# Number of vocabulary words assigned to each cluster.
opt1_cluster_df['predicted_cluster'].value_counts()

In [None]:
# Rescale the confidence scores to the range [0, 1].
# NOTE(review): no code shown in this notebook adds a `confidence_score` column to
# `opt1_cluster_df` — confirm where it comes from.
mm_scaler = MinMaxScaler(feature_range=(0, 1))
mm_scaled = mm_scaler.fit_transform(opt1_cluster_df.confidence_score.to_numpy().reshape(-1, 1))
mm_scaled

In [None]:
# Standardize the confidence scores with StandardScaler.
# NOTE(review): depends on a `confidence_score` column that the code shown does not create.
std_scaler = StandardScaler()
std_scaled = std_scaler.fit_transform(opt1_cluster_df.confidence_score.to_numpy().reshape(-1, 1))
std_scaled

In [None]:
# Scale the confidence scores with RobustScaler.
# NOTE(review): depends on a `confidence_score` column that the code shown does not create.
rob_scaler = RobustScaler()
rob_scaled = rob_scaler.fit_transform(opt1_cluster_df.confidence_score.to_numpy().reshape(-1, 1))
rob_scaled

In [None]:
# Scale the confidence scores with MaxAbsScaler.
# NOTE(review): depends on a `confidence_score` column that the code shown does not create.
abs_scaler = MaxAbsScaler()
abs_scaled = abs_scaler.fit_transform(opt1_cluster_df.confidence_score.to_numpy().reshape(-1, 1))
abs_scaled

In [None]:
# Chain the two scalers: RobustScaler first, then MinMaxScaler into [0, 1].
# NOTE(review): depends on a `confidence_score` column that the code shown does not create.
pipe = Pipeline([('robust', RobustScaler()), ('minmax', MinMaxScaler(feature_range=(0,1)))])
pipe_scaled = pipe.fit_transform(opt1_cluster_df.confidence_score.to_numpy().reshape(-1, 1))
pipe_scaled

In [None]:
# Density of the unscaled confidence scores.
sns.kdeplot(opt1_cluster_df.confidence_score)

In [None]:
# Density after StandardScaler (first and only column of the scaled array).
sns.kdeplot(std_scaled.T[0])

In [None]:
# Density after MinMaxScaler.
sns.kdeplot(mm_scaled.T[0])

In [None]:
# Density after RobustScaler.
sns.kdeplot(rob_scaled.T[0])

In [None]:
# Density after MaxAbsScaler.
sns.kdeplot(abs_scaled.T[0])

In [None]:
# Density after the RobustScaler -> MinMaxScaler pipeline.
sns.kdeplot(pipe_scaled.T[0])

In [None]:
# Density of the unscaled confidence scores again, for comparison with the plots above.
sns.kdeplot(opt1_cluster_df.confidence_score)

In [None]:
# cluster_lookup_df = words_clusters_df[['cluster_scores', 'cluster_space']]
# grouping_features = ['cluster_scores', 'cluster_space']
# Per-word lookup table indexed by word, holding the columns used to score tweets.
# NOTE(review): `cluster_scores` and `confidence_score` are not created by the
# Opt1_cluster cell as written — confirm these columns exist before running this cell.
scoring_dimensions = ['cluster_space', 'cluster_scores', 'confidence_score']
words_clusters_df = opt1_cluster_df[['words', 'predicted_cluster']+scoring_dimensions]
words_clusters_df = words_clusters_df.set_index('words')

def clusterGrouping(row, lookup=None):
    """Pick the cluster with the smallest average weighted distance for one tweet.

    Parameters
    ----------
    row
        Object with a ``clean_tweet_words`` attribute (list of tokens), e.g. a DataFrame row.
    lookup : pandas.DataFrame, optional
        Table indexed by word with the columns ``cluster_space`` and ``cluster_scores``
        (one vector per word, one entry per cluster) and ``confidence_score`` (one number
        per word). Defaults to the notebook-level ``words_clusters_df``.

    Returns
    -------
    tuple
        ``(cluster, weight)``: the chosen cluster index and its average weight, or
        ``(None, nan)`` when no word of the tweet is present in ``lookup`` or no weight
        can be computed.
    """
    if lookup is None:
        lookup = words_clusters_df

    # Skip tokens without an entry in the lookup table instead of failing with a KeyError.
    known_words = [word for word in row.clean_tweet_words if word in lookup.index]
    if not known_words:
        return None, np.nan

    words_search = lookup.loc[known_words]

    # Stack each column on its own. Ravelling both columns together and then reshaping to
    # (2, -1, n) interleaves distance and score rows instead of separating them.
    distances = np.stack(words_search['cluster_space'].to_numpy()).astype(float)
    similarities = np.stack(words_search['cluster_scores'].to_numpy()).astype(float)
    confidences = words_search['confidence_score'].to_numpy(dtype=float).reshape(-1, 1)

    weighted_distances = similarities * distances * confidences

    # Each word votes for the cluster where its weighted distance is smallest.
    closest_clusters = np.argmin(weighted_distances, axis=1)

    n_clusters = weighted_distances.shape[1]
    agg_weights = [[] for _ in range(n_clusters)]
    for cluster, weights in zip(closest_clusters, weighted_distances):
        agg_weights[cluster].append(weights[cluster])

    # Clusters without votes stay NaN so that list position keeps meaning "cluster id".
    avg_weights = np.array([np.mean(votes) if votes else np.nan for votes in agg_weights])
    if np.all(np.isnan(avg_weights)):
        return None, np.nan

    # nanargmin ignores the NaN placeholders; plain argmin would select them.
    closest_cluster = int(np.nanargmin(avg_weights))

    return closest_cluster, avg_weights[closest_cluster]

In [None]:
# Step-by-step version of `clusterGrouping` for one sample tweet.
words = FILTERED_TWEETS.iloc[1].clean_tweet_words
words_search = words_clusters_df.loc[words][scoring_dimensions]

# Stack each column on its own; ravelling both together and reshaping to (2, -1, 3)
# interleaves distance and score rows.
distances = np.stack(words_search['cluster_space'].to_numpy())
similarities = np.stack(words_search['cluster_scores'].to_numpy())
confidences = words_search['confidence_score'].to_numpy().reshape(-1, 1)

weighted_distances = similarities * distances * confidences

# NOTE(review): this cell takes the LARGEST weight while `clusterGrouping` takes the
# smallest — confirm which direction is intended.
closest_clusters = np.argmax(weighted_distances, axis=1)

agg_weights = [[] for _ in range(weighted_distances.shape[1])]
for cluster, weights in zip(closest_clusters, weighted_distances):
    agg_weights[cluster].append(weights[cluster])

# Keep a NaN placeholder for clusters without votes so that position == cluster id.
avg_weights = [np.mean(closest_weights) if closest_weights else np.nan for closest_weights in agg_weights]
closest_cluster = np.nanargmax(avg_weights)

In [None]:
# Average weights computed in the previous cell.
avg_weights

In [None]:
# Try the grouping function on the first filtered tweet.
clusterGrouping(FILTERED_TWEETS.iloc[0])

In [None]:
# Apply the grouping function row by row to the first ten filtered tweets.
FILTERED_TWEETS[:10].apply(clusterGrouping, axis=1)

In [None]:
# def kmeans_clustering(Y_sklearn, fitted):
#     """
#     This function will predict clusters on training set and plot the visuals of clusters as well.
#     """

#     plt.scatter(Y_sklearn[:, 0], Y_sklearn[:, 1],c=prediction ,s=50, cmap='viridis') # Plotting scatter plot 
#     centers2 = fitted.cluster_centers_ # It will give best possible coordinates of cluster center after fitting k-means
#     plt.scatter(centers2[:, 0], centers2[:, 1],c='black', s=300, alpha=0.6);
#     # As this can be seen from the figure, there is an outlier as well.
    
# kmeans_clustering(Y_sklearn, fitted)

In [None]:
def get_top_features_cluster(ngrams, X_std, prediction, n_feats):
    """Rank the columns of `X_std` by their mean value within each predicted label.

    Parameters
    ----------
    ngrams : sequence
        Name for each column of `X_std`, looked up by column index.
    X_std : numpy.ndarray
        2-D feature matrix, one row per sample.
    prediction : numpy.ndarray
        Label assigned to each row of `X_std`.
    n_feats : int
        Number of top-scoring columns to keep per label.

    Returns
    -------
    list of pandas.DataFrame
        One frame per unique label (in sorted label order) with the columns `features`
        and `score`, ordered from highest to lowest mean.
    """
    def _rank_label(label):
        # Rows that received this label.
        member_rows = np.where(prediction==label)
        # Average every column over those rows.
        column_means = np.mean(X_std[member_rows], axis = 0)
        # Column indices from highest to lowest mean, cut to n_feats.
        top_columns = np.argsort(column_means)[::-1][:n_feats]
        ranked = [(ngrams[col], column_means[col]) for col in top_columns]
        return pd.DataFrame(ranked, columns = ['features', 'score'])

    return [_rank_label(label) for label in np.unique(prediction)]

# Top 100 columns per cluster.
# NOTE(review): `wv` and `prediction` are not defined in any cell shown in this notebook,
# so this line fails on a fresh kernel — confirm which objects were intended.
dfs = get_top_features_cluster(wv.index_to_key, X_std, prediction, 100)

In [None]:
# Top 25 rows of the 1st DataFrame. `dfs[:25][0]` sliced the list of frames, not the rows.
plt.figure(figsize=(8,6))
sns.barplot(x = 'score' , y = 'features', orient = 'h' , data = dfs[0].head(25));

In [None]:
# Top 25 rows of the 2nd DataFrame. `dfs[:25][1]` sliced the list of frames, not the rows.
plt.figure(figsize=(8,6))
sns.barplot(x = 'score' , y = 'features', orient = 'h' , data = dfs[1].head(25));

In [None]:
# Top 25 rows of the 3rd DataFrame. `dfs[:25][2]` sliced the list of frames, not the rows.
plt.figure(figsize=(8,6))
sns.barplot(x = 'score' , y = 'features', orient = 'h' , data = dfs[2].head(25));

In [None]:
def plot_features(dfs):
    """Draw one horizontal bar chart per cluster, side by side.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        One frame per cluster with the columns ``features`` (bar labels) and ``score``
        (bar lengths).
    """
    fig = plt.figure(figsize=(14,12))
    for i, df in enumerate(dfs):
        # Positions come from this frame, so frames of different lengths still plot.
        positions = np.arange(len(df))
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.set_title("Cluster: "+ str(i), fontsize = 14)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(positions, df.score, align='center', color='#40826d')
        # Fix the tick positions first; labels set without them do not line up with the bars.
        ax.set_yticks(positions)
        ax.set_yticklabels(df.features)
    plt.show()

plot_features(dfs)