# Text Clustering Model

Please be aware that this notebook imports functions from scripts in the `api` directory. You may have to change the import paths to reflect your local directory structure. Also make sure that the path to the data CSV file (`./SampleData.csv` by default) is correct.

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import pdb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from api.text_operations import (
    clean_data,
    encode_text
)
from api.clustering import create_clusters

from scipy.spatial.distance import euclidean
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, LatentDirichletAllocation

In [None]:
# Cache files for the clustering cell: `fname` holds the exported cluster
# dataframe and `tname` the cleaned text dataframe. If both exist, the
# clustering cell loads them instead of recomputing.
fname, tname = 'text_clusters.csv', 'text_samples.csv'

## Clustering Code

In [None]:
# Build the cluster dataframe (`cluster_df`) and the cleaned text dataframe
# (`clean_df`). The full pipeline only runs when one of the cached CSV files
# is missing; otherwise both frames are loaded from disk.
if not os.path.exists(fname) or not os.path.exists(tname):
    #--------------------------------------------------------------------------#
    # Load data
    #--------------------------------------------------------------------------#
    data = pd.read_csv('./SampleData.csv')
    print(data.shape)

    # Clean NaNs
    txt_cols = [
        'OrganizationName',
        '/IRS990/Desc',
        '/IRS990/ActivityOrMissionDesc',
        '/IRS990/MissionDesc'
    ]
    data.dropna(subset=txt_cols, inplace=True)
    print(data.shape)


    #--------------------------------------------------------------------------#
    # Clean text data
    #--------------------------------------------------------------------------#
    # dropna() leaves gaps in the row index. Everything below treats row
    # labels as positions into X / X_dimr (centroid lookup, axis=1 concat),
    # so force a clean 0..n-1 index here.
    # NOTE(review): clean_data's own index handling is not visible from this
    # notebook; the reset is a no-op if it already returns a fresh index.
    clean_df = clean_data(data).reset_index(drop=True)


    #--------------------------------------------------------------------------#
    # Get text encodings
    #--------------------------------------------------------------------------#
    print()
    encoding_dict = encode_text(clean_df)
    print()


    #--------------------------------------------------------------------------#
    # Assemble text encodings
    #--------------------------------------------------------------------------#
    # One embedding per row, in row order, looked up by the row's EIN
    X = np.array([encoding_dict[ein] for ein in clean_df['EIN']])
    print(X.shape)


    #--------------------------------------------------------------------------#
    # Dimensionality reduction (clustering)
    #--------------------------------------------------------------------------#
    pca_dimr = PCA(n_components=200)
    X_dimr = pca_dimr.fit_transform(X)
    expvar_1 = sum(pca_dimr.explained_variance_ratio_)
    print('\nExplained variance (clustering): %.4f'%expvar_1)


    #--------------------------------------------------------------------------#
    # Get cluster results
    #--------------------------------------------------------------------------#
    Z, gap_metrics, bc, clusters = create_clusters(X_dimr, C=500)
    clean_df['cluster'] = clusters


    #--------------------------------------------------------------------------#
    # Get cluster centroids
    #--------------------------------------------------------------------------#
    # NOTE(review): assumes bc[0] is the number of clusters and that cluster
    # labels run from 1 to bc[0] (hence `label-1`) - confirm against
    # api.clustering.create_clusters.
    centroids = np.zeros((bc[0], X_dimr.shape[1]))
    grouped_dfs = clean_df.groupby(by='cluster')
    for label, indexes in grouped_dfs.groups.items():
        # Row labels equal row positions thanks to the index reset above
        centroids[label-1] = X_dimr[indexes].mean(axis=0)


    #--------------------------------------------------------------------------#
    # Get cluster distances for each sample
    #--------------------------------------------------------------------------#
    # One vectorised Euclidean norm per centroid instead of a Python-level
    # loop over every (sample, centroid) pair
    dist_dict = {
        'cluster_%d'%(i+1): np.linalg.norm(X_dimr - centroid, axis=1)
        for i, centroid in enumerate(centroids)
    }


    #--------------------------------------------------------------------------#
    # Integrate distances with cluster dataframe
    #--------------------------------------------------------------------------#
    dist_df = pd.DataFrame.from_dict(dist_dict, orient='columns')
    # Order columns numerically (cluster_2 before cluster_10)
    dist_df = dist_df[sorted(dist_df.columns, key=lambda x: int(x.split('_')[-1]))]

    # Both frames share a 0..n-1 index, so the column-wise concat lines up
    # row for row
    cluster_df = pd.concat([clean_df, dist_df], axis=1)
    print(cluster_df.shape)


    #--------------------------------------------------------------------------#
    # Drop unnecessary columns and export
    #--------------------------------------------------------------------------#
    drop_cols = ['Name', 'Text']
    cluster_df.drop(labels=drop_cols, axis=1, inplace=True)
    cluster_df.to_csv(fname, index=False)
    clean_df.to_csv(tname, index=False)
else:
    # Both cache files exist: reuse the previously exported results
    cluster_df = pd.read_csv(fname)
    clean_df = pd.read_csv(tname)
#------------------------------------------------------------------------------#

In [None]:
# Final cluster dataframe (freshly computed or loaded from `fname` by the
# clustering cell above); display the first rows as a sanity check
cluster_df.head()

In [None]:
# Cleaned text dataframe (freshly computed or loaded from `tname` by the
# clustering cell above); display the first rows as a sanity check
clean_df.head()

## LDA Topic Modeling

In [None]:
#------------------------------------------------------------------------------#
def return_topics(model, feature_names, no_top_words):
    """
    Map each topic index of a fitted topic model to its highest-weighted words

    `model.components_` is iterated row by row (one row per topic). For every
    row the `no_top_words` entries with the largest weights are looked up in
    `feature_names` and returned in descending weight order.

    Returns a dict of the form {topic index: [word, word, ...]}
    """
    return {
        idx: [feature_names[j] for j in weights.argsort()[::-1][:no_top_words]]
        for idx, weights in enumerate(model.components_)
    }


def get_lda_topics(clean_df):
    """
    Function for converting a cleaned text dataframe (with a cluster ID column)
    into a dataframe specifying the LDA topic words corresponding to each
    cluster ID

    For every cluster ID present in `clean_df['cluster']`, the rows' `Text`
    values are turned into term counts and a single-topic LDA model is fitted
    on them; the top 10 words of that topic (via `return_topics`) become the
    column `topic_<cluster ID>` of the result.

    Parameters:
        clean_df: dataframe with a `cluster` column and a `Text` column

    Returns:
        Dataframe with one `topic_<cluster ID>` column per cluster, ordered by
        cluster ID. Columns of clusters that yield fewer than 10 words are
        padded with NaN.

    Raises:
        AssertionError: if `clean_df` has no `cluster` column
    """
    assert 'cluster' in clean_df.columns, "clean_df needs a 'cluster' column"

    tf_params = {
        'max_df': 0.95,
        'min_df': 2,
        'max_features': 1000,
        'stop_words': 'english'
    }
    lda_params = {
        # `n_topics` was renamed to `n_components` in scikit-learn 0.19 and
        # the old name has since been removed
        'n_components': 1,
        'max_iter': 5,
        'learning_method': 'online',
        'learning_offset': 50,
        'random_state': 0
    }

    topic_dict = {}

    # groupby visits only the cluster IDs that actually occur (in ascending
    # order), so an unused ID inside the min..max range no longer produces an
    # empty subset that the vectorizer would reject
    for cluster_id, subset in clean_df.groupby('cluster'):
        # Fit count vectorizer
        tf_vectorizer = CountVectorizer(**tf_params)
        tf = tf_vectorizer.fit_transform(subset['Text'])

        # get_feature_names() was removed in scikit-learn 1.2; fall back to
        # it only on versions that predate get_feature_names_out()
        if hasattr(tf_vectorizer, 'get_feature_names_out'):
            tf_feature_names = tf_vectorizer.get_feature_names_out()
        else:
            tf_feature_names = tf_vectorizer.get_feature_names()

        # Run LDA
        lda = LatentDirichletAllocation(**lda_params).fit(tf)

        # Get top words for topic (the model has exactly one topic: index 0)
        wd = return_topics(lda, tf_feature_names, 10)
        topic_dict[cluster_id] = wd[0]

    # Convert topic dict to dataframe. Wrapping each word list in a Series
    # lets clusters with a vocabulary smaller than 10 words coexist with full
    # ones (short columns are NaN-padded instead of raising).
    topic_df = pd.DataFrame(
        {cluster_id: pd.Series(words) for cluster_id, words in topic_dict.items()}
    )
    topic_df.columns = ['topic_%s'%c for c in topic_df.columns]

    return topic_df
#------------------------------------------------------------------------------#

In [None]:
# Get topic dataframe: one 'topic_<cluster id>' column per cluster, holding
# the top LDA words of that cluster
td = get_lda_topics(clean_df)

In [None]:
# Export topic dataframe to 'lda_topics.csv' (relative to the working
# directory); index=False keeps the row index out of the file
td.to_csv('lda_topics.csv', index=False)

In [None]:
# Visual validation: print the text of every sample assigned to cluster 1,
# padded with blank lines so the samples are easy to tell apart
cluster_one_texts = clean_df.loc[clean_df['cluster']==1, 'Text']
for text in cluster_one_texts:
    print('\n\n' + text + '\n\n')