In [None]:
from inspect import getsourcefile
import os.path as path, sys
# Directory the running code is attributed to. getsourcefile() can return None
# when no source file can be resolved, so fall back to the working directory.
# NOTE(review): depending on the IPython/ipykernel version, cell code may be
# attributed to a temporary file rather than the notebook's folder - confirm
# the resolved root is the intended one in your environment.
_source_file = getsourcefile(lambda: 0)
if _source_file:
    current_dir = path.dirname(path.abspath(_source_file))
else:
    current_dir = path.abspath(path.curdir)

# Go three levels up with os.path instead of splitting on '/', so the result is
# also correct where the path separator is not '/'.
project_root = path.abspath(path.join(current_dir, path.pardir, path.pardir, path.pardir))

# Re-running the cell must not keep appending the same entry.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus.reader import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag, word_tokenize, sent_tokenize
import pyLDAvis
import pyLDAvis.sklearn  
from langdetect import detect , detect_langs
import tqdm
from stacey.path import get_output_file
import stacey 
%matplotlib inline

In [None]:
# Register tqdm's pandas integration; the `progress_apply` call used further
# down to stem the comments depends on this having run first.
tqdm.tqdm.pandas()

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with ``J``, ``V``, ``N`` and ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively; any
    other tag falls back to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unknown tags are treated as nouns.
    return wordnet.NOUN

def tag_stopwords(stopwords):
    """Build ``"<stem>_<wordnet pos>"`` stop-word tokens from sample phrases.

    Parameters
    ----------
    stopwords : iterable of (str, int)
        Pairs of a sample text and the index of the token inside that text
        (after ``word_tokenize``) that should become a stop word. Tagging the
        whole sample gives the POS tagger some context for the chosen token.

    Returns
    -------
    list of str
        One entry per selected token whose WordNet POS is neither adjective
        nor adverb: the lemmatized, stemmed, lower-cased token followed by
        ``_`` and its WordNet POS.
    """
    excluded_pos = [wordnet.ADJ, wordnet.ADV]
    tagged = []
    for sample, position in stopwords:
        for idx, (token, treebank_tag) in enumerate(pos_tag(word_tokenize(sample))):
            if idx != position:
                continue
            wordnet_pos = get_wordnet_pos(treebank_tag)
            if wordnet_pos in excluded_pos:
                continue
            root = stemmer.stem(lemma.lemmatize(token, wordnet_pos)).lower()
            tagged.append(root + "_" + wordnet_pos)
    return tagged


def get_us_names(names_file='us-names.txt'):
    """Load the first-name list used as additional stop words, lower-cased.

    Parameters
    ----------
    names_file : str, optional
        Header-less text/CSV file; the first column of every row is taken as a
        name. Defaults to ``'us-names.txt'`` relative to the working directory.

    Returns
    -------
    list of str
        The names in file order, lower-cased.
    """
    # dtype=str + keep_default_na=False: without them pandas converts entries
    # such as "NA", "null" or "None" to float NaN (and numeric-looking entries
    # to numbers), which then fail on `.lower()`.
    df_names = pd.read_csv(names_file, header=None, dtype=str, keep_default_na=False)
    return df_names[0].str.lower().tolist()

def get_stopwords(extra=[],exception_words=[]):
    """Assemble the sorted stop-word list used before POS tagging.

    The list is NLTK's English stop words plus the names returned by
    ``get_us_names()`` plus ``extra``, minus everything in ``exception_words``.

    Parameters
    ----------
    extra : list of str, optional
        Additional words to treat as stop words. Not modified.
    exception_words : list of str, optional
        Words that must NOT be treated as stop words. Words that are not in
        the list are ignored.

    Returns
    -------
    list of str
        Sorted stop words (duplicates across the sources are kept).
    """
    excluded = set(exception_words)

    stopwords = list(nltk.corpus.stopwords.words('english'))
    stopwords.extend(get_us_names())
    stopwords.extend(extra)

    # Filter instead of list.remove(): remove() drops only the first match, so
    # a word present in several sources (e.g. both an English stop word and a
    # first name) would otherwise survive as a stop word.
    stopwords = [word for word in stopwords if word not in excluded]
    stopwords.sort()
    return stopwords

def remove_stopwords(words,stopwords):
    """Drop every entry of ``words`` whose lower-cased form is in ``stopwords``.

    Returns ``words`` itself, untouched, when either argument is empty;
    otherwise a new NumPy array holding the surviving entries in their
    original order and casing.
    """
    if len(stopwords) == 0 or len(words) == 0:
        return words
    # Comparison is case-insensitive on the `words` side only.
    is_stopword = np.isin(np.char.lower(words), stopwords)
    return np.asarray(words)[~is_stopword]

    
def lemmatize(tagged_words,lowercase, min_len):
    """Lazily lemmatize POS-tagged tokens, skipping adjectives and adverbs.

    Parameters
    ----------
    tagged_words : iterable of (str, str)
        ``(token, treebank_tag)`` pairs.
    lowercase : bool
        Lower-case each lemma before yielding it.
    min_len : int
        Minimum length of the original token; shorter tokens are skipped.

    Yields
    ------
    (str, str)
        ``(lemma, wordnet_pos)`` for every purely alphabetic token that is long
        enough and whose WordNet POS is neither ``wordnet.ADJ`` nor
        ``wordnet.ADV``.
    """
    skipped_pos = (wordnet.ADJ, wordnet.ADV)
    for token, treebank_tag in tagged_words:
        if not token.isalpha() or len(token) < min_len:
            continue

        pos = get_wordnet_pos(treebank_tag)
        if pos in skipped_pos:
            continue

        base = lemma.lemmatize(token, pos)
        yield (base.lower() if lowercase else base, pos)
    
def tag_and_stem(corpus, lowercase=True, min_len=3, stopwords=[], tagged_stopwords=[]):
    """Turn raw text into a space-separated string of ``stem_pos`` tokens.

    Pipeline: sentence/word tokenization -> removal of plain ``stopwords`` ->
    POS tagging -> lemmatization (adjectives/adverbs dropped, see
    ``lemmatize``) -> stemming -> removal of ``tagged_stopwords``.

    Parameters
    ----------
    corpus : str
        Text to process. Missing values (``None``, NaN or any other non-string)
        and blank strings yield ``''``.
    lowercase : bool, optional
        Lower-case lemmas before stemming.
    min_len : int, optional
        Minimum token length kept by ``lemmatize``.
    stopwords : list of str, optional
        Words removed (case-insensitively) before tagging. Not modified.
    tagged_stopwords : list of str, optional
        ``stem_pos`` tokens removed after stemming. Not modified.

    Returns
    -------
    str
        The surviving ``stem_pos`` tokens joined by single spaces.
    """
    # pandas hands missing text over as float NaN; the tokenizer would raise on
    # it, so treat anything that is not a non-blank string as an empty document.
    if not isinstance(corpus, str) or not corpus.strip():
        return ''

    tokens = [word for sent in sent_tokenize(corpus) for word in word_tokenize(sent)]
    tokens = remove_stopwords(tokens, stopwords)
    tagged = lemmatize(pos_tag(tokens), lowercase, min_len)
    stemmed = [stemmer.stem(word) + '_' + pos for word, pos in tagged if word.isalpha()]
    stemmed = remove_stopwords(stemmed, tagged_stopwords)
    return ' '.join(stemmed)



In [None]:
# Load the input data set from the location resolved by `get_output_file`.
# Later cells read its 'Assignee Name' and 'clean_comments' columns.
df = pd.read_csv(get_output_file('clean_aurea_smb_mails_essentials.csv'))

In [None]:
# Collect the lower-cased name parts of every assignee; they are added to the
# stop words further down.
custom_names = []
_seen_names = set()
# dropna(): a missing assignee is a float NaN, which has no .split().
unique_names = df['Assignee Name'].dropna().unique()
for full_name in unique_names:
    for name in str(full_name).split():
        # Keep alphabetic parts longer than two characters, once each
        # (first-seen order), so shared first/last names are not repeated.
        if name.isalpha() and len(name)>2 and name.lower() not in _seen_names:
            _seen_names.add(name.lower())
            custom_names.append(name.lower())
    

In [None]:
# Lemmatizer / stemmer instances; the helper functions defined above look them
# up as module-level globals named `lemma` and `stemmer`.
lemma = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

# Extra plain stop words: a few fixed tokens plus the assignee name parts.
extra = ['xls', 'xlsx', 'google'] + custom_names
# NOTE(review): this calls stacey's get_stopwords, not the function of the same
# name defined in this notebook - confirm that is intended.
stopwords = stacey.stopwords.get_stopwords(extra)

# (sample text, token index) pairs: the token at that index of the tokenized
# sample is turned into a "stem_pos" stop word by tag_stopwords().
word_samples = [
    ('please', 0),
    ('thanks', 0),
    ('email', 0),
    ('timestamp', 0),
    ('today', 0),
    ('team', 0),
    ('one', 0),
    ('Please do it', 0),
    ('could', 0),
    ('hello', 0),
    ('group', 0),
    ('let me know', 0),
    ('let me know', 2),
    ('address', 0),
    ('concern', 0),
    ('question', 0),
    ('aircall', 0),
]

# The trailing `+ []` is a no-op (presumably a slot for hand-written tagged
# stop words).
tagged_stopwords = tag_stopwords(word_samples) + []
print(tagged_stopwords, extra)

In [None]:
# Run every cleaned comment through tag_and_stem (with a tqdm progress bar),
# removing both the plain and the POS-tagged stop words. The resulting
# 'stemmed_comments' column is what the CountVectorizer below is fitted on.
df['stemmed_comments'] = df['clean_comments'].progress_apply(lambda x: tag_and_stem(x,stopwords=stopwords, tagged_stopwords=tagged_stopwords))

In [None]:
# Bag-of-words over unigrams and bigrams, keeping only terms whose document
# frequency lies between min_df (2%) and max_df (15%) of the documents.
vectorizer = CountVectorizer(min_df=0.02, max_df=0.15, analyzer="word", ngram_range=(1, 2))
cvz = vectorizer.fit_transform(df['stemmed_comments'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever the installed version has.
_get_vocab = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
# Plain Python strings in a list, matching what get_feature_names() returned.
vocab = [str(term) for term in _get_vocab()]
print("Total Vocab: {}".format(len(vocab)))

In [None]:
# Print the complete vocabulary kept by the vectorizer for manual inspection.
print('terms: {}'.format(vocab))

In [None]:
# Fit one LDA model per candidate topic count and record its perplexity on the
# same matrix it was fitted on; the curve is plotted in the next cell.
k_range = range(1, 20)
perplexity = []

for n in tqdm.tqdm(k_range):
    lda = LDA(
        n_components=n,
        learning_method='online',
        learning_offset=60.,
        max_iter=50,
        random_state=44,
    )
    lda.fit(cvz)
    perplexity.append(lda.perplexity(cvz))

In [None]:

# Perplexity as a function of the number of topics tried above.
fig, ax1 = plt.subplots()

# Axis cosmetics: blue y label and y ticks to match the blue curve.
ax1.set_xlabel('Cluster')
ax1.set_ylabel('Perplexity', color='b')
ax1.tick_params('y', colors='b')

ax1.plot(k_range, perplexity, color='b')

fig.tight_layout()
plt.show()


In [None]:
# Number of top-level topics, fixed by hand. The trailing comment keeps the
# automatic alternative: the topic count with the lowest perplexity from the
# search above (argsort()[0] is the index of the minimum; +1 because k_range
# starts at 1).
n_components = 7# np.array(perplexity).argsort()[0]+1
print("Number of clusters: {}".format(n_components))

In [None]:
import pyLDAvis
import pyLDAvis.sklearn  

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Final top-level model with the chosen number of topics; `transformed` holds
# the per-document topic weights used later to assign clusters.
lda_model = LDA(
    n_components=n_components,
    learning_method="online",
    learning_offset=60.,
    max_iter=50,
    random_state=44,
)
transformed = lda_model.fit_transform(cvz)

# R=15 terms per topic, t-SNE layout, topics sorted by pyLDAvis.
panel = pyLDAvis.sklearn.prepare(
    lda_model,
    cvz,
    vectorizer,
    R=15,
    mds='tsne',
    sort_topics=True,
)
panel

In [None]:
# Per 'Category' of the pyLDAvis topic table: total 'Freq' per term, then the
# ten terms with the highest total.
topic_info = panel.topic_info
top10_terms = (
    topic_info
    .groupby(['Category', 'Term'])['Freq']
    .sum()
    .reset_index()
    .sort_values(['Category', 'Freq'], ascending=[True, False])
    .groupby('Category')
    .head(10)
)

In [None]:
# Collapse the table into a list of term lists, one entry per 'Category' value
# in the groupby's sorted label order.
# NOTE(review): the position in this list follows the sorted category labels,
# not the LDA component index; pyLDAvis's topic table presumably also carries a
# 'Default' category and re-numbered topics - confirm before indexing this list
# by component id.
top10terms = list(top10_terms.groupby('Category')['Term'].apply(list).values)

In [None]:
# ---------------------------------------------------------------------------
# 1) Assign every document to its dominant top-level topic.
# ---------------------------------------------------------------------------
# argsort ascending, reverse, first column: index of the largest topic weight
# in each row of the document-topic matrix.
dominant_topic = transformed.argsort()[:, ::-1][:, 0]
prob = transformed.max(axis=1)

# Look the terms up by pyLDAvis category label instead of by list position:
# the grouped term table also contains a 'Default' category and, with
# sort_topics=True, pyLDAvis re-numbers the topics, so position i is not the
# terms of LDA component i.
# NOTE(review): relies on `panel.topic_order` listing, for displayed topic
# 1, 2, ... in turn, the 1-based id of the original LDA component - confirm
# against the installed pyLDAvis version.
terms_by_category = top10_terms.groupby('Category')['Term'].apply(list)
component_terms = {
    original_id - 1: terms_by_category['Topic{}'.format(rank)]
    for rank, original_id in enumerate(panel.topic_order, 1)
}
terms = [component_terms[cluster] for cluster in dominant_topic]

df['cluster'] = dominant_topic.astype(str)
df['cluster_terms'] = ['\n'.join(cluster_terms) for cluster_terms in terms]
df['cluster_probability'] = prob

dfg = df.groupby('cluster')['cluster'].count()

# ---------------------------------------------------------------------------
# 2) Inside each top-level cluster, search 1..9 sub-topics by perplexity and,
#    when more than one is selected, label the documents with their sub-topic.
# ---------------------------------------------------------------------------
sub_cluster_frames = []

# .items() replaces Series.iteritems(), which was removed in pandas 2.0.
for x, y in tqdm.tqdm(dfg.items(), total=len(dfg)):
    # .copy(): write the new columns to an independent frame, not to a slice
    # of `df`.
    temp_df = df[df['cluster'] == x].copy()

    try:
        _vectorizer = CountVectorizer(min_df=0.02, max_df=0.15, ngram_range=(1, 2), analyzer="word")
        _cvz = _vectorizer.fit_transform(temp_df.stemmed_comments)
        # get_feature_names() was removed in scikit-learn 1.2.
        _get_vocab = getattr(_vectorizer, 'get_feature_names_out', None) or _vectorizer.get_feature_names
        _vocab = np.array(_get_vocab())

        _perplexity = []
        for n in range(1, 10):
            _lda = LDA(random_state=44, n_components=n, learning_method='online', learning_decay=0.6, learning_offset=10.)
            _lda.fit(_cvz)
            _perplexity.append(_lda.perplexity(_cvz))

        # Index of the lowest perplexity, +1 because the search starts at 1.
        _n_components = np.array(_perplexity).argsort()[0] + 1
        print("{} subclusters for cluster {}".format(_n_components, x))

        if _n_components > 1:
            _lda_model = LDA(random_state=44, n_components=_n_components, learning_decay=0.6, learning_offset=10., learning_method="online")
            _transformed = _lda_model.fit_transform(_cvz)

            # Ten highest-weighted vocabulary terms of every sub-topic.
            _top_term_ids = _lda_model.components_.argsort()[:, ::-1][:, :10]
            _cluster_words = [_vocab[term_ids] for term_ids in _top_term_ids]

            _labels = _transformed.argsort()[:, ::-1][:, 0]
            _terms = ['\n'.join(_cluster_words[label]) for label in _labels]
            _prob = _transformed.max(axis=1)

            # Assign only after everything was computed, so a failure above
            # never leaves a cluster with only some of the three columns.
            temp_df['sub_cluster'] = _labels.astype(str)
            temp_df['sub_cluster_terms'] = _terms
            temp_df['sub_cluster_probability'] = _prob
    except Exception as err:
        # Best effort, as before: keep the cluster without sub-clusters, but
        # report what actually went wrong instead of swallowing it (and do not
        # intercept KeyboardInterrupt/SystemExit).
        print("Not enough vocabulary for clustering. ({}: {})".format(type(err).__name__, err))

    sub_cluster_frames.append(temp_df)

# Concatenate once instead of growing the frame inside the loop.
df_sub_categories = pd.concat(sub_cluster_frames) if sub_cluster_frames else pd.DataFrame()

# Later cells read these columns unconditionally; make sure they exist even
# when no cluster was split into sub-clusters.
for _column in ('sub_cluster', 'sub_cluster_terms', 'sub_cluster_probability'):
    if _column not in df_sub_categories.columns:
        df_sub_categories[_column] = np.nan

In [None]:
# Rows without a sub-cluster have missing values in the sub_cluster* columns;
# blank them so the string concatenation below works for every row.
df_sub_categories = df_sub_categories.fillna('')

# Combined sort/label key "<cluster>.<sub_cluster>0", built with vectorised
# string concatenation instead of a row-wise apply.
df_sub_categories['cluster_sub_cluster'] = (
    df_sub_categories['cluster'] + '.' + df_sub_categories['sub_cluster'] + '0'
).astype(str)

In [None]:
# Display the rows assigned to top-level cluster '0' (cluster ids are stored
# as strings).
df_sub_categories[df_sub_categories['cluster']=='0']

In [None]:
import datetime
# Second-resolution timestamp, "YYYY-MM-DD HH:MM:SS", used in the file name.
# NOTE(review): it contains a space and colons, which are not valid in Windows
# file names.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Order by combined cluster key, then most confident assignments first.
sort_keys = ['cluster_sub_cluster', 'cluster_probability', 'sub_cluster_probability']
df_save = df_sub_categories.sort_values(sort_keys, ascending=[True, False, False])

# Only these columns are written to the CSV.
export_columns = [
    'id',
    'title',
    'ticket_type',
    'merged_from',
    'merged_into',
    'description',
    'comments_new',
    'clean_comments',
    'cluster',
    'sub_cluster',
    'cluster_terms',
    'sub_cluster_terms',
]
output_file = get_output_file('clusters_mailessentials_' + timestamp + '.csv')
df_save.to_csv(output_file, index=False, columns=export_columns)