# Text Clustering Model

Please be aware that this notebook inherits functions located in scripts contained in the `api` directory. You may have to change paths to reflect your local directory structure. Also be wary of passing in the correct path to the data CSV file.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import pdb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from api.text_operations import (
    clean_data,
    encode_text
)
from api.clustering import create_clusters

from pymongo import MongoClient

import import_ipynb
from api.mongodb_access import *

from scipy.spatial.distance import euclidean
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, LatentDirichletAllocation

W0607 01:38:35.145275 140253367768896 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14
[nltk_data] Downloading package stopwords to /home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


importing Jupyter notebook from /home/notebooks/api/mongodb_access.ipynb
importing Jupyter notebook from /home/notebooks/api/classy990downloads.ipynb
importing Jupyter notebook from /home/notebooks/api/filter_text.ipynb
importing Jupyter notebook from /home/notebooks/api/ParseWeb.ipynb


[nltk_data] Downloading package wordnet to /home/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
fname = 'text_clusters.csv'
tname = 'text_samples.csv'

In [3]:
year = 2017
uri = "mongodb://mongo/tweets"
client = MongoClient(uri)

# database
db = client['irs990']

# collection
dbYear = db['%s' % year]

## Clustering Code

In [8]:
if not os.path.exists(fname) or not os.path.exists(tname):
    #--------------------------------------------------------------------------#
    # Load data
    #--------------------------------------------------------------------------#
    #data = pd.read_csv('./SampleData.csv')
    #print(data.shape)
    
    # Clean NaNs
    txt_cols = [
        'EIN',
        'OrganizationName',
        '/IRS990/Desc',
        '/IRS990/ActivityOrMissionDesc',
        '/IRS990/MissionDesc'
    ]
    
    
    txt_dict = {}
    for name in txt_cols:
        txt_dict[name] = 1
        
    data = get_df( dbYear, txt_dict )
    
    data.dropna(subset=txt_cols, inplace=True)
    #print(data.shape)
    #print( data.head() )
    
    
    #--------------------------------------------------------------------------#
    # Clean text data
    #--------------------------------------------------------------------------#
    clean_df = clean_data(data)

    
    #--------------------------------------------------------------------------#
    # Get text encodings
    #--------------------------------------------------------------------------#
    print()
    encoding_dict = encode_text(clean_df)
    print()
    
    
    #--------------------------------------------------------------------------#
    # Assemble text encodings
    #--------------------------------------------------------------------------#
    X = np.array([encoding_dict[r['EIN']] for _, r in clean_df.iterrows()])
    print(X.shape)
    
    
    #--------------------------------------------------------------------------#
    # Dimensionality reduction (clustering)
    #--------------------------------------------------------------------------#
    pca_dimr = PCA(n_components= min(clean_df.shape))
    X_dimr = pca_dimr.fit_transform(X)
    expvar_1 = sum(pca_dimr.explained_variance_ratio_)
    print('\nExplained variance (clustering): %.4f'%expvar_1)
    
    
    #--------------------------------------------------------------------------#
    # Get cluster results
    #--------------------------------------------------------------------------#
    Z, gap_metrics, bc, clusters = create_clusters(X_dimr, C=500)
    clean_df['cluster'] = clusters
    
    
    #--------------------------------------------------------------------------#
    # Get cluster centroids
    #--------------------------------------------------------------------------#
    centroids = np.zeros((bc[0], X_dimr.shape[1]))
    grouped_dfs = clean_df.groupby(by='cluster')
    for label, indexes in grouped_dfs.groups.items():
        centroids[label-1] = X_dimr[indexes].mean(axis=0)
    
    
    #--------------------------------------------------------------------------#
    # Get cluster distances for each sample
    #--------------------------------------------------------------------------#
    dist_dict = {'cluster_%d'%(i+1):[] for i in range(centroids.shape[0])}

    for embed in X_dimr:
        for i, centroid in enumerate(centroids):
            dist_dict['cluster_%d'%(i+1)].append(euclidean(embed, centroid))
    
    
    #--------------------------------------------------------------------------#
    # Integrate distances with cluster dataframe
    #--------------------------------------------------------------------------#
    dist_df = pd.DataFrame.from_dict(dist_dict, orient='columns')
    dist_df = dist_df[sorted(dist_df.columns, key=lambda x: int(x.split('_')[-1]))]

    cluster_df = pd.concat([clean_df, dist_df], axis=1)
    print(cluster_df.shape)
    
    
    #--------------------------------------------------------------------------#
    # Drop unnecessary columns and export
    #--------------------------------------------------------------------------#
    drop_cols = ['Name', 'Text']
    cluster_df.drop(labels=drop_cols, axis=1, inplace=True)
    cluster_df.to_csv(fname, index=False)
    clean_df.to_csv(tname, index=False)
else:
    cluster_df = pd.read_csv(fname)
    clean_df = pd.read_csv(tname)
#------------------------------------------------------------------------------#


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0607 01:44:00.472745 140253367768896 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Graph was finalized.


I0607 01:44:00.748446 140253367768896 monitored_session.py:222] Graph was finalized.


INFO:tensorflow:Running local_init_op.


I0607 01:44:01.895481 140253367768896 session_manager.py:491] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I0607 01:44:07.895951 140253367768896 session_manager.py:493] Done running local_init_op.



(76, 512)

Explained variance (clustering): 0.2232
Scanning: 10
Scanning: 20
Scanning: 30
Scanning: 40
Scanning: 50
Scanning: 60
Scanning: 70
Scanning: 80
Scanning: 90
Scanning: 100
Scanning: 110
Scanning: 120
Scanning: 130
Scanning: 140
Scanning: 150
Scanning: 160
Scanning: 170
Scanning: 180
Scanning: 190
Scanning: 200
Scanning: 210
Scanning: 220
Scanning: 230
Scanning: 240
Scanning: 250
Scanning: 260
Scanning: 270
Scanning: 280
Scanning: 290
Scanning: 300
Scanning: 310
Scanning: 320
Scanning: 330
Scanning: 340
Scanning: 350
Scanning: 360
Scanning: 370
Scanning: 380
Scanning: 390
Scanning: 400
Scanning: 410
Scanning: 420
Scanning: 430
Scanning: 440
Scanning: 450
Scanning: 460
Scanning: 470
Scanning: 480
Scanning: 490
Scanning: 500

Cluster sizes: [  9  11  13  14  17  18  19  20  21  23  24  26  27  29  30  31  32  34
  35  36  37  38  39  40  41  42  43  45  46  48  49  50  52  53  54  55
  56  57  58  59  60  61  63  64  65  66  67  68  69  70  71  72  73  74
  75  76  77  78  79  

In [9]:
# Final cluster dataframe
cluster_df.head()

Unnamed: 0,EIN,cluster,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9
0,200509226,1,0.149693,0.416019,0.315549,0.465982,0.873888,0.679275,0.743242,0.553313,0.684305
1,954820311,8,0.465807,0.484072,0.232574,0.446269,0.604791,0.511461,0.593884,0.178793,0.19123
2,436050205,3,0.350073,0.477738,0.092696,0.382763,0.720569,0.595714,0.692833,0.31385,0.378068
3,630797410,5,0.670639,0.481698,0.621218,0.471375,0.089582,0.207935,0.689299,0.612481,0.464187
4,42662873,9,0.5989,0.525387,0.40177,0.483329,0.44093,0.421563,0.594788,0.280723,0.02811


In [2]:
# Cleaned text dataframe
clean_df.head()

NameError: name 'clean_df' is not defined

In [11]:
# new collection
dbResults = db['%s_text_clusters' % year]

# save results
put_df( dbResults, cluster_df )

Save values for 650880021.0 

True

## LDA Topic Modeling

In [12]:
#------------------------------------------------------------------------------#
def return_topics(model, feature_names, no_top_words):
    """
    Function for returning the top words of a model's topics
    """
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        w = [feature_names[i] for i in topic.argsort()[:-no_top_words-1:-1]]
        topic_dict[topic_idx] = w
    
    return topic_dict


def get_lda_topics(clean_df):
    """
    Function for converting a cleaned text dataframe (with a cluster ID column) 
    into a dataframe specifying the LDA topic words corresponding to each 
    cluster ID
    """
    assert 'cluster' in clean_df.columns
    
    tf_params = {
        'max_df': 0.95,
        'min_df': 2,
        'max_features': 1000,
        'stop_words': 'english'
    }
    lda_params = {
        'n_topics': 1, 
        'max_iter': 5, 
        'learning_method': 'online', 
        'learning_offset': 50,
        'random_state': 0
    }
    
    topic_dict = {}
    
    min_id = np.min(clean_df['cluster'])
    max_id = np.max(clean_df['cluster'])
    
    for i in np.arange(min_id, max_id+1):
        subset = clean_df[clean_df['cluster']==i]
        
        # Fit count vectorizer
        tf_vectorizer = CountVectorizer(**tf_params)
        tf = tf_vectorizer.fit_transform(subset['Text'])
        tf_feature_names = tf_vectorizer.get_feature_names()
        
        # Run LDA
        lda = LatentDirichletAllocation(**lda_params).fit(tf)
        
        # Get top words for topic
        wd = return_topics(lda, tf_feature_names, 10)
        topic_dict[i] = wd[0]
    
    # Convert topic dict to dataframe
    topic_df = pd.DataFrame.from_dict(topic_dict, orient='columns')
    topic_df.columns = ['topic_%s'%c for c in topic_df.columns]
    
    return topic_df
#------------------------------------------------------------------------------#

In [13]:
# Get topic dataframe
td = get_lda_topics(clean_df)

TypeError: __init__() got an unexpected keyword argument 'n_topics'

In [None]:
# Export topic dataframe
td.to_csv('lda_topics.csv', index=False)

In [None]:
# Visual validation
for i, row in clean_df[clean_df['cluster']==1].iterrows():
    print('\n\n' + row['Text'] + '\n\n')