# PORTLAND TOPIC MODELING

In [2]:
import numpy as np
import pandas as pd
import re
import sqlite3
import matplotlib.pyplot as plt
# NLP with the LDA model for topic modeling
from gensim.corpora import Dictionary
from gensim.models import LdaModel, LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from bs4 import BeautifulSoup
import sqlite3

# Processed Database

In [3]:
# Connect to the database (or create it if it doesn't exist)
connection = sqlite3.connect("../data/emails_processed.db")

# Create a cursor object to execute SQL commands
cursor = connection.cursor()

# Load the dataframe from the SQLite database
emails_df = pd.read_sql_query("SELECT * FROM emails_processed", connection)

# Close the connection
connection.close()

# Show email data
emails_df.head()

Unnamed: 0,text,message_id,date,from,to,subject,cc,bcc,mime-version,content-type,...,x-bcc,folder,origin,filename,priority,processed_text,tokens,stripped_date,datetime,dominant_topic
0,---------------------- Forwarded by Rika Imai/...,<88180.1075863689140.JavaMail.evans@thyme>,"Tue, 8 May 2001 08:37:00 -0700 (PDT)",rika.imai@enron.com,"john.forney@enron.com, mike.carson@enron.com, ...",4 Month Rolling Forecast,,,1.0,text/plain; charset=ANSI_X3.4-1968,...,,\Rob_Benson_Jun2001\Notes Folders\Notes inbox,Benson-R,rbenson.nsf,normal,forwarded by rika imainaenron on pm dan sal...,"[""['forward',"", ""'rika',"", ""'imainaenron',"", ""...","Tue, 8 May 2001 08:37:00",2001-05-08 08:37:00,8
1,great,<4460514.1075857469666.JavaMail.evans@thyme>,"Wed, 21 Jun 2000 02:01:00 -0700 (PDT)",hunter.shively@enron.com,richard.tomaski@enron.com,Re: Jim Simpson,,,1.0,text/plain; charset=us-ascii,...,,\Hunter_Shively_Jun2001\Notes Folders\Sent,Shively-H,hshivel.nsf,normal,great,"[""['great']""]","Wed, 21 Jun 2000 02:01:00",2000-06-21 02:01:00,6
2,"oohh la la. who was your ""friend""? did you g...",<2160301.1075858147494.JavaMail.evans@thyme>,"Wed, 16 Aug 2000 03:03:00 -0700 (PDT)",matthew.lenhart@enron.com,shelliott@dttus.com,Re: Re[2]:,,,1.0,text/plain; charset=us-ascii,...,,\Matthew_Lenhart_Jun2001\Notes Folders\Sent,Lenhart-M,mlenhar.nsf,normal,oohh la la who was your friend did you guys ...,"[""['oohh',"", ""'la',"", ""'la',"", ""'friend',"", ""'...","Wed, 16 Aug 2000 03:03:00",2000-08-16 03:03:00,2
3,\nAttached are the two files with this week's ...,<22847680.1075863611080.JavaMail.evans@thyme>,"Wed, 15 Aug 2001 05:46:47 -0700 (PDT)",rika.imai@enron.com,"russell.ballato@enron.com, hicham.benjelloun@e...",FW: Nuclear Rolling Forecast,,,1.0,text/plain; charset=us-ascii,...,,"\ExMerge - Benson, Robert\Inbox\Large Messages",BENSON-R,rob benson 6-25-02.PST,normal,attached are the two files with this weeks nuc...,"[""['attach',"", ""'two',"", ""'file',"", ""'week',"",...","Wed, 15 Aug 2001 05:46:47",2001-08-15 05:46:47,5
4,lm:\nWhat are your thoughts going forward........,<15012282.1075852957298.JavaMail.evans@thyme>,"Wed, 3 Oct 2001 00:35:05 -0700 (PDT)",jennifer.fraser@enron.com,larry.may@enron.com,hello,,,1.0,text/plain; charset=us-ascii,...,,\LMAY2 (Non-Privileged)\Inbox,May-L,LMAY2 (Non-Privileged).pst,normal,lmwhat are your thoughts going forward also wh...,"[""['lmwhat',"", ""'thought',"", ""'go',"", ""'forwar...","Wed, 3 Oct 2001 00:35:05",2001-10-03 00:35:05,0


# Filter Emails

- Filter emails sent to employees with `portland` in the email address 

In [4]:
# emails_df["processed_text"] = emails_df["text"].apply(extract_text)
emails_df = emails_df[emails_df["to"].str.contains(r".*portland.*@enron.com", case=False, na=False)]

# Preparation

In [5]:
emails_df['tokens'] = emails_df['tokens'].apply(lambda x: x.split())

# Create a dictionary and a corpus
dictionary = Dictionary(emails_df['tokens'])
corpus = [dictionary.doc2bow(tokens) for tokens in emails_df['tokens']]

# LDA Model

In [6]:
# Number of topics
num_topics = 5

# Train the LDA model
# lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42)
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=1000, random_state=42, workers=6)

In [7]:
# Print topics
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.023*""'mw',"," + 0.014*""'schedul',"," + 0.014*""'deal',"," + 0.013*""'pleas',"," + 0.009*""'price',"," + 0.009*""'need',"," + 0.009*""'call',"," + 0.008*""'time',"," + 0.007*""'make',"," + 0.007*""'sell',","
Topic 1: 0.011*""'enron',"," + 0.008*""'pleas',"," + 0.007*""'work',"," + 0.007*""'new',"," + 0.006*""'join',"," + 0.006*""'portland',"," + 0.006*""'us',"," + 0.005*""'email',"," + 0.005*""'park',"," + 0.005*""'compani',","
Topic 2: 0.077*""'term',"," + 0.073*""'epmi',"," + 0.061*""'p',"," + 0.045*""'short',"," + 0.039*""'total',"," + 0.035*""'long',"," + 0.028*""'california',"," + 0.026*""'northwest',"," + 0.026*""'southwest',"," + 0.026*""'palo',","
Topic 3: 0.016*""'enron',"," + 0.009*""'pleas',"," + 0.006*""'team',"," + 0.006*""'question',"," + 0.005*""'pm',"," + 0.005*""'work',"," + 0.005*""'employe',"," + 0.005*""'meet',"," + 0.005*""'provid',"," + 0.005*""'would',","
Topic 4: 0.014*""'trade',"," + 0.011*""'inform',"," + 0.009*""'market',"," + 0.009*""'report',","

# Coherence

In [8]:
# Compute Coherence Score (c_v)
coherence_model_lda = CoherenceModel(model=lda_model, texts=emails_df['tokens'], dictionary=dictionary, coherence='c_v', processes=6)
coherence_score = coherence_model_lda.get_coherence()
print(f"Coherence Score (c_v): {coherence_score}")

Coherence Score (c_v): 0.5035425310078454


# Ranked Topics

In [9]:
# Step 1: Calculate topic importance (weights)
topic_importance = [0] * lda_model.num_topics
for doc in corpus:
    for topic_id, weight in lda_model[doc]:
        topic_importance[topic_id] += weight
topic_importance = [weight / len(corpus) for weight in topic_importance]  # Normalize by number of documents

# Step 2: Combine importance and terms into a DataFrame
num_words = 10  # Number of top words to show per topic
ranked_topics_data = []

for topic_id, importance in enumerate(topic_importance):
    topic_terms = lda_model.print_topic(topic_id, topn=num_words)
    ranked_topics_data.append({
        'Topic': topic_id,
        'Importance': importance,
        'Terms': topic_terms
    })

# Create DataFrame and sort by importance
ranked_topics_df = pd.DataFrame(ranked_topics_data).sort_values(by='Importance', ascending=False)
ranked_topics_df

Unnamed: 0,Topic,Importance,Terms
0,0,0.397583,"0.023*""""'mw',"","" + 0.014*""""'schedul',"","" + 0.0..."
3,3,0.218154,"0.016*""""'enron',"","" + 0.009*""""'pleas',"","" + 0...."
1,1,0.187429,"0.011*""""'enron',"","" + 0.008*""""'pleas',"","" + 0...."
4,4,0.126463,"0.014*""""'trade',"","" + 0.011*""""'inform',"","" + 0..."
2,2,0.060127,"0.077*""""'term',"","" + 0.073*""""'epmi',"","" + 0.06..."


- The first 3 topics reference energy with `mw` and `stwbom`
- Markets, trading, merger, palo verde, southwest, power common

In [10]:
# Add new columns for each term and weight
for i in range(1, num_words + 1):
    ranked_topics_df[f'Term {i}'] = None
    ranked_topics_df[f'Term {i} Weight'] = None

# Populate the new columns with terms and weights
for idx, row in ranked_topics_df.iterrows():
    terms_weights = row['Terms'].split(' + ')
    for i, term_weight in enumerate(terms_weights):
        weight, term = term_weight.split('*')
        ranked_topics_df.at[idx, f'Term {i+1}'] = term.strip("'\",")
        ranked_topics_df.at[idx, f'Term {i+1} Weight'] = float(weight)

ranked_topics_df.reset_index(inplace=True, drop=True)
ranked_topics_df.to_csv("../data/ranked_topics.csv", index=False)
ranked_topics_df


Unnamed: 0,Topic,Importance,Terms,Term 1,Term 1 Weight,Term 2,Term 2 Weight,Term 3,Term 3 Weight,Term 4,...,Term 6,Term 6 Weight,Term 7,Term 7 Weight,Term 8,Term 8 Weight,Term 9,Term 9 Weight,Term 10,Term 10 Weight
0,0,0.397583,"0.023*""""'mw',"","" + 0.014*""""'schedul',"","" + 0.0...",mw,0.023,schedul,0.014,deal,0.014,pleas,...,need,0.009,call,0.009,time,0.008,make,0.007,sell,0.007
1,3,0.218154,"0.016*""""'enron',"","" + 0.009*""""'pleas',"","" + 0....",enron,0.016,pleas,0.009,team,0.006,question,...,work,0.005,employe,0.005,meet,0.005,provid,0.005,would,0.005
2,1,0.187429,"0.011*""""'enron',"","" + 0.008*""""'pleas',"","" + 0....",enron,0.011,pleas,0.008,work,0.007,new,...,portland,0.006,us,0.006,email,0.005,park,0.005,compani,0.005
3,4,0.126463,"0.014*""""'trade',"","" + 0.011*""""'inform',"","" + 0...",trade,0.014,inform,0.011,market,0.009,report,...,manag,0.008,group,0.008,power,0.007,custom,0.007,servic,0.007
4,2,0.060127,"0.077*""""'term',"","" + 0.073*""""'epmi',"","" + 0.06...",term,0.077,epmi,0.073,p,0.061,short,...,long,0.035,california,0.028,northwest,0.026,southwest,0.026,palo,0.026


# Dominant Topic

In [11]:
def get_dominant_topic(lda_model, corpus):
    dominant_topics = []
    for doc_bow in corpus:
        topic_probs = lda_model.get_document_topics(doc_bow)
        dominant_topic = max(topic_probs, key=lambda x: x[1])[0]
        dominant_topics.append(dominant_topic)
    return dominant_topics

emails_df['dominant_topic'] = get_dominant_topic(lda_model, corpus)
# Showing that dominant_topic is added to the DataFrame and contains an integer value
emails_df['dominant_topic']

132       0
366       0
444       0
547       1
686       1
         ..
250245    0
250455    3
250758    1
250898    3
250906    4
Name: dominant_topic, Length: 1572, dtype: int64

# Visualize

In [12]:
# Save the model
lda_model.save('enron_lda_model')

# Visualize
vis = gensimvis.prepare(lda_model, corpus, dictionary)

# Save as HTML
pyLDAvis.save_html(vis, 'lda_viz_portland.html')

# Show
pyLDAvis.display(vis)
