In [8]:
import pandas as pd
import spacy

In [10]:
df = pd.read_csv("./enron_spam_data.csv")

In [11]:
display(df)

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
...,...,...,...,...,...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29
33713,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30
33714,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30


In [6]:
df['Spam/Ham'].value_counts()

Spam/Ham
spam    17171
ham     16545
Name: count, dtype: int64

In [17]:
df['Message'].fillna(' ')

0                                                         
1        gary , production from the high island larger ...
2                   - calpine daily gas nomination 1 . doc
3        fyi - see note below - already done .\nstella\...
4        fyi .\n- - - - - - - - - - - - - - - - - - - -...
                               ...                        
33711    hello , welcome to gigapharm onlinne shop .\np...
33712    i got it earlier than expected and it was wrap...
33713    are you ready to rock on ? let the man in you ...
33714    learn how to last 5 - 10 times longer in\nbed ...
33715    hi : )\ndo you need some softwares ? i can giv...
Name: Message, Length: 33716, dtype: object

In [18]:
df['combined'] = df['Subject'].fillna(' ') + df['Message'].fillna(' ')

In [31]:
len(' '.join(df['combined']))

50773497

In [38]:
# too big! need to sample...

# first randomize the order
df_sample = df.sample(frac=0.01, ignore_index=True).copy()


In [39]:
len(' '.join(df_sample['combined']))

420408

## Spacy

In [41]:
from collections import Counter

nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1*10**6

text = ' '.join(df_sample['combined'])
doc = nlp(text)

tokens = [token.text for token in doc]
print("Total tokens:", len(tokens))
print("Unique tokens:", len(set(tokens)))

word_counts = Counter(tokens)
print("Most common words:", word_counts.most_common(10))

noun_counts = Counter(token.text for token in doc if token.pos_ == "NOUN")
print("Most common nouns:", noun_counts.most_common(10))


Total tokens: 96018
Unique tokens: 10403
Most common words: [('\n', 7000), ('.', 4803), ('-', 3946), (',', 3146), ('the', 2376), (':', 1867), ('to', 1820), ('/', 1431), ('and', 1299), ('of', 1117)]
Most common nouns: [('com', 206), ('enron', 199), ('subject', 143), ('gas', 124), ('pm', 113), ('|', 112), ('time', 106), ('message', 106), ('company', 100), ('information', 97)]


## Gensim

In [48]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/amarks-b/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize

  
documents = df_sample['combined'].values
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]
dictionary = corpora.Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)
for topic in lda_model.print_topics():
    print(topic)

## word2vec (in gensim)

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download punkt tokenizer if you haven't already
try:
    nltk.data.find('tokenizers/punkt')
except nltk.downloader.DownloadError:
    nltk.download('punkt')

# Your training data (list of sentences/documents)
data = df_sample['combined'].values

# 1. Tokenize the data (split sentences into words)
tokenized_data = [word_tokenize(sentence.lower()) for sentence in data]

# 2. Train the Word2Vec model
model = Word2Vec(sentences=tokenized_data, vector_size=100, window=5, min_count=1, workers=4)
# - sentences: The tokenized training data.
# - vector_size: Dimensionality of the word vectors (e.g., 100 features per word).
# - window: Maximum distance between the current and predicted word within a sentence.
# - min_count: Ignores all words with total frequency lower than this.
# - workers: Number of worker threads to train the model (for faster training).

# 3. Access word vectors (example)
word = "word2vec"
if word in model.wv:
    vector = model.wv[word]
    print(f"Vector for '{word}': {vector}")
else:
    print(f"Word '{word}' not found in the vocabulary.")

# 4. Find similar words (example)
similar_words = model.wv.most_similar("sentence", topn=3)
print(f"\nWords similar to 'sentence': {similar_words}")




# 5. Save the trained model (optional)
# model.save("word2vec_model.bin")
# To load the model later:
# from gensim.models import Word2Vec
# loaded_model = Word2Vec.load("word2vec_model.bin")

## Scikit-learn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
documents = ["This is the first document.", "This document is the second document.", "And this is the third one.", "Is this the first document?"]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
print(tfidf_matrix.toarray())
print(vectorizer.get_feature_names_out())

## Huggingface

In [None]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")
result = sentiment_pipeline("This is a fantastic movie!")
print(f"Sentiment: {result}")

question_answerer = pipeline("question-answering")
result = question_answerer(
    question="What is my name?",
    context="My name is Sarah and I live in London."
)
print(f"Answer: {result['answer']}")

In [None]:
## Word2vec