In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# N_DOCUMENTS = 517402
N_DOCUMENTS = 100000

In [2]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [3]:
# Load Dataset
X = []
for i in range(1, N_DOCUMENTS):
    f = open("emails/processed/" + str(i + 1) + ".txt", "r")
    clean_string = ""
    for line in f:
        if line[:4] == "From":
            line = line.split(" ")
            clean_string += line[1].split("@")[0] + "\n"
        elif line[:2] == "To":
            line = line.split(" ")
            clean_string += line[1].split("@")[0] + "\n"
        elif line[:7] == "Subject":
            line = line.split(" ")
            clean_string += line[1]
        elif line[:10] == "X-FileName":
            break
    for line in f:
        clean_string += line.strip("\n") + "\n"
    f.close()
    X.append(clean_string)

In [4]:
lemmatizer = WordNetLemmatizer()
for i in range(len(X)):
    sent_text = nltk.sent_tokenize(X[i])
    tokenized_text = []
    stop_words = set(stopwords.words('english')) 
    stop_words.add("ect")
    stop_words.add("hou")
    stop_words.add("com")
    stop_words.add("www")
    stop_words.add("http")
    
    # Remove the numbers
    for sentence in sent_text:
        sentence = re.sub("[^a-zA-Z]", " ", sentence)
        tokenized_text += list(nltk.word_tokenize(sentence))
        
    # Remove stopwords
    X[i] = [x.lower() for x in tokenized_text if x.lower() not in stop_words and len(x) > 2]
    
    # Pos Tag    
    X[i] = nltk.pos_tag(X[i])
    
    # Lemmatize
    X[i] = list(map(lambda x: lemmatizer.lemmatize(x[0], pos = get_wordnet_pos(x[1])), X[i]))
    
    f = open("clean/"+str(i+1)+".txt", "w")
    f.write(" ".join(X[i]))
    f.close()

In [5]:
processed_emails = X

In [6]:
import numpy as np
import gensim
from gensim import corpora, models
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array

In [7]:
dictionary = gensim.corpora.Dictionary(processed_emails)
dictionary.filter_extremes(no_above=0.5)
bag_of_words = [dictionary.doc2bow(email) for email in processed_emails]

In [8]:
from gensim import corpora, models
tfidf_object = models.TfidfModel(bag_of_words)
tfidf_vectors = tfidf_object[bag_of_words]

In [9]:
from gensim.models.coherencemodel import CoherenceModel
for top in range(23, 29):
    lda_model = gensim.models.LdaMulticore(tfidf_vectors, num_topics=top, id2word=dictionary)
    print(lda_model.show_topics(formatted=False))
    coherencemodel = CoherenceModel(model=lda_model, texts = processed_emails, dictionary=dictionary, coherence='c_v')
    print(coherencemodel.get_coherence())

0.4511338325785245


Process ForkPoolWorker-18:
Process ForkPoolWorker-17:
Process ForkPoolWorker-20:
Process ForkPoolWorker-21:
Process ForkPoolWorker-19:
Process ForkPoolWorker-15:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", li

KeyboardInterrupt: 

In [None]:
from gensim.models import LsiModel
for top in range(23, 29):
    lsa_model = LsiModel(tfidf_vectors,num_topics=top, id2word=dictionary) 
    print(lsa_model.show_topics(formatted=False))
    coherencemodel = CoherenceModel(model=lsa_model, texts = processed_emails, dictionary=dictionary, coherence='c_v')
    print(coherencemodel.get_coherence())