In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import Word2Vec
from gensim.models import LdaModel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/makowskitomasz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/makowskitomasz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/makowskitomasz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/makowskitomasz/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
emails_df = pd.read_json('../data/emails.json', orient='records', lines=True)

In [4]:
emails_df.shape

(248113, 17)

In [5]:
emails_df['answered'].value_counts()

answered
0    240654
1      7459
Name: count, dtype: int64

In [6]:
my_set = set()

In [10]:
df = emails_df

In [11]:
df.columns

Index(['id', 'message_id', 'subject', 'sender', 'receiver', 'day_week', 'time',
       'time_zone', 'date', 'is_reply', 'responses', 'body',
       'number_of_receivers', 'adjusted_time', 'length', 'length_num',
       'answered'],
      dtype='object')

In [12]:
df['time_one_zone'] = df['adjusted_time'].dt.time

In [13]:
df.drop(['id', 'message_id', 'time', 'time_zone', 'length', 'adjusted_time', 'sender', 'receiver'], axis=1, inplace=True)

In [14]:
df.head(2)

Unnamed: 0,subject,day_week,date,is_reply,responses,body,number_of_receivers,length_num,answered,time_one_zone
0,RE: PPA - Notice of Intent to Terminate RE: PP...,Wed,2000-12-20,0,[],\nWhat is the date of the law referening the F...,1,2283,0,09:15:00
1,Re: FERC ruling on Kern expansion Re: FERC rul...,Tue,2001-04-10,1,[],\n---------------------- Forwarded by Stephani...,1,888,0,00:55:00


In [15]:
for index, row in df.iterrows():
    print(f'Subject: {row["subject"]}')
    print(f'Day of the week: {row["day_week"]}')
    print(f'Time: {row["time_one_zone"]}')
    print(f'Date: {row["date"]}')
    print(f'Is_reply: {row["is_reply"]}')
    print(f'Responses: {row["responses"]}')
    print(f'Body: {row["body"]}')
    print(f'Number of receivers: {row["number_of_receivers"]}')
    print(f'Length_num: {row["length_num"]}')
    print(f'Answered: {row["answered"]}')
    break

Subject: RE: PPA - Notice of Intent to Terminate RE: PPA - Notice of Intent to Terminate  RE: PPA - Notice of Intent to Terminate  RE: PPA - Notice of Intent to Terminate  RE: PPA - Notice of Intent to Terminate     RE: PPA - Notice of Intent to Terminate  RE: PPA - Notice of Intent to Terminate     Notice of Intent to Terminate
Day of the week: Wed
Time: 09:15:00
Date: 2000-12-20 00:00:00
Is_reply: 0
Responses: []
Body: 
What is the date of the law referening the Furnas right to contract for=20
energy and does it apply to their ability to contract for long term capacit=
y=20
without a bid?




"RKG - Renata Kogut Gurevich" <RKG@tozzini.com.br> on 12/20/2000 04:57:08 P=
M
To: <Richard.A.Lammers@enron.com>
cc: <Andreia.Almeida@enron.com>, <James.M.Bannantine@enron.com>,=20
<Joao.Carlos.Albuquerque@enron.com>, <John.Novak@enron.com>,=20
<Jose.Bestard@enron.com>, <laine.A.Powell@enron.com>,=20
<Peter.E.Weidler@enron.com>, <Rob.G.Gay@enron.com>,=20
<Gilbert.Landsberg@shell.com.br>, <Guido.

In [16]:
df.dtypes

subject                        object
day_week                       object
date                   datetime64[ns]
is_reply                        int64
responses                      object
body                           object
number_of_receivers             int64
length_num                      int64
answered                        int64
time_one_zone                  object
dtype: object

In [17]:
"""
Data Preprocessing:
    Tokenize the text data (subject and body).
    Remove stop words, punctuation, and special characters.
    Lemmatize or stem the words to reduce them to their base form.
    Encode categorical variables (day of the week, sender, receiver) using one-hot encoding or label encoding.
    Convert date and time features into a numerical format if necessary.
"""

'\nData Preprocessing:\n    Tokenize the text data (subject and body).\n    Remove stop words, punctuation, and special characters.\n    Lemmatize or stem the words to reduce them to their base form.\n    Encode categorical variables (day of the week, sender, receiver) using one-hot encoding or label encoding.\n    Convert date and time features into a numerical format if necessary.\n'

In [18]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [19]:
df['subject'] = df['subject'].apply(preprocess_text)

In [20]:
df['body'] = df['body'].apply(preprocess_text)

In [21]:
le = LabelEncoder()
df['day_week'] = le.fit_transform(df['day_week'])

In [22]:
df['time_one_zone'] = pd.to_datetime(df['time_one_zone'], format='%H:%M:%S').dt.time

In [23]:
df.head(3)

Unnamed: 0,subject,day_week,date,is_reply,responses,body,number_of_receivers,length_num,answered,time_one_zone
0,ppa notice intent terminate ppa notice intent ...,6,2000-12-20,0,[],date law referening furnas right contract for2...,1,2283,0,09:15:00
1,ferc ruling kern expansion ferc ruling kern ex...,5,2001-04-10,1,[],forwarded stephanie millercorpenron 04102001 0...,1,888,0,00:55:00
2,there change ferc,1,2001-04-16,0,[],read two new ferc commissioner also issue cali...,1,54,0,17:08:00


In [24]:
"""
NLP Activities:
    After preprocessing, you can delve into more advanced NLP techniques:
        Feature Extraction: Use techniques like TF-IDF (Term Frequency-Inverse Document Frequency) to convert text data into numerical representations.
        Word Embeddings: Consider using pre-trained word embeddings (e.g., Word2Vec, GloVe) to capture semantic relationships between words in the text data.
        Topic Modeling: Apply techniques like Latent Dirichlet Allocation (LDA) to identify underlying topics or themes in the emails.
        Sentiment Analysis: Analyze the sentiment of the email content to understand the overall tone or mood.
"""

tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(df['body'])
tfidf_features = tfidf.transform(df['body'])

In [25]:
df['tokenized_body'] = df['body'].apply(word_tokenize)
stop_words = set(stopwords.words('english'))
df['clean_body'] = df['tokenized_body'].apply(lambda tokens: [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words])

lemmatizer = WordNetLemmatizer()
df['lemmatized_body'] = df['clean_body'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

df['lemmatized_body_str'] = df['lemmatized_body'].apply(lambda tokens: ' '.join(tokens))

In [26]:
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
tfidf_features = tfidf_vectorizer.fit_transform(df['lemmatized_body_str'])
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_features, columns=feature_names)
df = pd.concat([df, tfidf_df], axis=1)

In [27]:
df = df.dropna(subset=['lemmatized_body'])
word2vec_model = Word2Vec(sentences=df['lemmatized_body'], vector_size=100, window=5, min_count=1, workers=4)

dictionary = gensim.corpora.Dictionary(df['lemmatized_body'])
corpus = [dictionary.doc2bow(text) for text in df['lemmatized_body']]
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

In [None]:
df['time_one_zone'] = df['time_one_zone'].astype(str)

df['time_one_zone'] = pd.to_datetime(df['time_one_zone'], format='%H:%M:%S')

df['hour'] = df['time_one_zone'].dt.hour
df['minute'] = df['time_one_zone'].dt.minute
df['second'] = df['time_one_zone'].dt.second

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

In [None]:
df.drop(['subject', 'body', 'tokenized_body', 'clean_body', 'lemmatized_body', 'lemmatized_body_str', 'date', 'time_one_zone'], axis=1, inplace=True)

In [None]:
df.shape

In [None]:
df.head(3)

Unnamed: 0,day_week,is_reply,responses,number_of_receivers,length_num,answered,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,ability,able,accept,...,yet,york,youd,youll,young,youre,youve,zone,hour_sin,hour_cos
9,1.0,0.0,[],1.0,50.0,0.0,0,0,0,0,...,0,0,0,0.0,0,0,0.0,0.139079,0.258819,-0.965926
22,3.0,0.0,[],1.0,449.0,0.0,0,0,0,0,...,0,0,0,0.0,0,0,0.0,0.0,-0.5,-0.866025
24,5.0,1.0,[],1.0,1.0,0.0,0,0,0,0,...,0,0,0,0.119962,0,0,0.126451,0.0,0.5,-0.866025


In [None]:
df.to_pickle('../data/all_enron_emails_preprocessed.pkl')