## Testing code: Clustering ONET description 

Phai Phongthiengtham: 11/06/2018
 
***

In [22]:
#!pip install -U pyldavis
#!pip install -U spacy
#!pip install -U scikit-learn
#!pip install -U https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
#!pip install -U nltk
#!pip install -U gensim
#import nltk
#nltk.download('all')

In [23]:
# General-purpose imports used across the notebook.
import requests, re, os, json, sys, csv, time, datetime, types
# NOTE(review): `curl` is not a standard-library module — confirm which package
# provides it and whether anything in the notebook still uses it.
import operator, curl
import numpy as np
import pandas as pd
# Raise pandas' display limits so wide/long frames are shown with less truncation.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)

# sklearn
import sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
# NOTE(review): `sklearn.externals.joblib` only exists in older scikit-learn
# releases; on current versions this line fails and `import joblib` is the
# replacement — confirm against the installed version.
from sklearn.externals import joblib

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Module-level helpers: a WordNet lemmatizer instance and the NLTK English
# stop-word list (kept as a list so extra words can be appended later).
lemmatizer = WordNetLemmatizer()
stop_words = list(stopwords.words('english'))

# spaCy
# NOTE(review): `main_preprocess` below calls `nlp`, which is only defined by
# the commented-out lines here; re-enable them before running the
# text-preprocessing cells.
#import spacy
#import en_core_web_sm
#nlp = en_core_web_sm.load(disable=['parser', 'tagger','ner'] )

# plotting tools
import matplotlib.pyplot as plt
%matplotlib inline

import pyLDAvis
import pyLDAvis.gensim
# NOTE(review): newer pyLDAvis releases appear to have renamed
# `pyLDAvis.sklearn` (to `pyLDAvis.lda_model`) — confirm for the installed version.
import pyLDAvis.sklearn

In [24]:
# Load the already-preprocessed O*NET table (tab-separated, header on the
# first row) and preview the first rows as the cell output.
df = pd.read_csv(
    "ONET_preprocessed.txt",
    sep='\t',
    header=0,
)
df.head()

Unnamed: 0,O*NET-SOC Code,Title,Description,CleanText
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,determine and formulate policy and provide ove...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",communicate and coordinate with management sha...
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ...",plan direct or coordinate the operation of pub...
3,11-1031.00,Legislators,"Develop, introduce or enact laws and statutes ...",develop introduce or enact law and statute at ...
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...",plan direct or coordinate advertise policy and...


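### Text preprocessing

The two cells below were not executed in this run (`In [None]`). They rebuild the `CleanText` column from the raw `Occupation Data.txt` file and require the spaCy pipeline `nlp`, which is commented out in the imports cell.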

In [None]:
# string replace
def cleanup(text):
    """Reduce raw English text to single-space-separated alphabetic words.

    Steps, in order:
      1. Expand a few contractions ("n't" -> " not ", "'ve" -> " have ", ...)
         and replace "'s", "-", "/", "(" and ")" with spaces.
      2. Remove every character that is neither an ASCII letter nor
         whitespace (digits and remaining punctuation are dropped).
      3. Collapse any run of whitespace into a single space and strip the ends.

    Tabs and newlines are treated as word separators; previously they were
    deleted, which glued the neighbouring words together.

    Parameters
    ----------
    text : str
        Text to clean. The empty string is returned unchanged.

    Returns
    -------
    str
        The cleaned text; case is preserved.
    """
    if text == '': # allows for possibility of being empty
        return ''

    # Applied in this exact order ("'s" first), matching the original sequence.
    replacements = (
        ("'s", " "),
        ("n't", " not "),
        ("'ve", " have "),
        ("'re", " are "),
        ("'m", "  am "),
        ("'ll", "  will "),
        ("-", " "),
        ("/", " "),
        ("(", " "),
        (")", " "),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # Keep letters and whitespace only; whitespace must survive this step so
    # that tab/newline-separated words are not merged.
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # split() with no argument splits on any whitespace run and drops empties.
    return ' '.join(text.split())

# pre-process text
def main_preprocess(text):
    """Clean, tokenize and lemmatize one document into a lower-case string.

    Missing values (``None`` or a float NaN, which pandas uses for empty
    cells) yield an empty string instead of being stringified into the
    tokens "None" / "nan".

    Relies on two names defined elsewhere in the notebook: ``cleanup`` and a
    module-level callable ``nlp`` whose result iterates over tokens exposing
    ``.lemma_`` (presumably the spaCy pipeline from the imports cell —
    confirm it is loaded before calling this on non-empty text).

    Parameters
    ----------
    text : object
        The raw document; anything that is not missing is converted with
        ``str()``.

    Returns
    -------
    str
        Space-joined, lower-cased lemmas, or ``''`` for empty/missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text) # make sure the input is actually string
    # Replace every non-ASCII character with a space.
    text = ''.join([i if ord(i) < 128 else ' ' for i in text])
    if text == '': # allows for possibility of being empty
        return ''

    tokens = [w.lemma_.lower() for w in nlp(cleanup(text))] # cleanup and tokenize
    return ' '.join([w for w in tokens if not w==''])

In [None]:
# Re-create the CleanText column from the raw O*NET descriptions
# (tab-separated file, header on the first row), then print a preview.
df = pd.read_csv(
    "Occupation Data.txt",
    sep='\t',
    header=0,
)
df['CleanText'] = df['Description'].apply(main_preprocess)
print(df.head())

### CountVectorizer
* `extra_remove_words`: additional words to drop, on top of the NLTK English stop words.

In [25]:
# Extra domain-specific words to drop in addition to the NLTK stop words.
extra_remove_words = ['use','may','include','includes']
# Merge via a set union and store a sorted list:
#  * the cell can be re-run safely (the previous `set(stop_words + ...)` failed
#    on a second run because a set cannot be added to a list);
#  * the ordering is deterministic;
#  * recent scikit-learn releases validate `stop_words` as a list (or 'english'
#    / None), so a set is rejected there.
stop_words = sorted(set(stop_words).union(extra_remove_words))

# Vocabulary pruning: ignore terms found in fewer than 1% or more than 99% of
# documents; only unigrams are used (ngram_range = (1, 1)).
min_df = 0.01
max_df = 0.99
max_ngram = 1

vectorizer = CountVectorizer(stop_words = stop_words,
                             min_df = min_df, 
                             max_df = max_df,
                             ngram_range = (1,max_ngram))

# Learn the vocabulary and build the document-term matrix in a single pass
# (equivalent to fit() followed by transform() on the same data).
vector = vectorizer.fit_transform(df['CleanText'])

### LDA
* "n_components" is the number of clusters.

In [26]:
# LDA hyper-parameters gathered in one mapping; random_state is fixed so the
# fit is reproducible.
lda_settings = {
    'n_components': 3,
    'learning_method': 'online',
    'random_state': 0,
    'batch_size': 128,
    'evaluate_every': -1,
}
lda = LatentDirichletAllocation(**lda_settings)

# fit() returns the estimator, so its repr is shown as the cell output.
lda.fit(vector)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=3, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

### Display clusters

In [27]:
def display_cluster(lda, vectorizer, n_words=30):
    """Tabulate the highest-weighted vocabulary terms of each cluster.

    Parameters
    ----------
    lda : object
        Fitted model exposing ``components_``, an iterable of per-cluster
        weight vectors aligned with the vectorizer's vocabulary.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` (current
        scikit-learn) or ``get_feature_names()`` (older releases).
    n_words : int, default 30
        Number of top terms to keep per cluster.

    Returns
    -------
    pandas.DataFrame
        One row per cluster (index ``cluster_<i>``), columns ``word_0`` ..
        ``word_<k>`` ordered by decreasing weight, plus an integer
        ``cluster`` column. With no components, an empty frame is returned.
    """
    # get_feature_names() was removed from newer scikit-learn; prefer the new
    # accessor and fall back to the old one for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    keywords = np.asarray(feature_names)

    # Negating the weights makes argsort return indices in descending order.
    topic_keywords = [
        keywords.take((-topic_weights).argsort()[:n_words])
        for topic_weights in lda.components_
    ]

    # Build the frame once, after all clusters are collected (it used to be
    # rebuilt on every loop iteration and was undefined for zero clusters).
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['word_' + str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords['cluster'] = range(0,df_topic_keywords.shape[0])
    df_topic_keywords.index = ['cluster_' + str(i) for i in range(df_topic_keywords.shape[0])]
    return df_topic_keywords

In [28]:
# Top 30 keywords per cluster for the fitted model; the bare name on the last
# line renders the table as the cell output.
df_cluster = display_cluster(
    lda,
    vectorizer,
    n_words=30,
)
df_cluster

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,word_10,word_11,word_12,word_13,word_14,word_15,word_16,word_17,word_18,word_19,word_20,word_21,word_22,word_23,word_24,word_25,word_26,word_27,word_28,word_29,cluster
cluster_0,system,design,control,test,separately,list,datum,conduct,engineer,assist,computer,apply,develop,air,information,patient,research,communication,process,analyze,aircraft,care,land,worker,study,animal,relate,monitor,fire,industrial,0
cluster_1,equipment,operate,material,machine,repair,perform,maintain,product,record,prepare,vehicle,set,metal,tend,install,duty,work,tool,duties,clean,hand,power,form,part,process,order,require,variety,cut,assemble,1
cluster_2,activity,coordinate,teach,plan,direct,service,operation,individual,provide,supervise,engage,research,program,worker,train,teacher,organization,primarily,public,financial,combination,manage,course,social,management,transportation,establishment,facility,business,group,2
