In [None]:
import ollama
import pandas as pd
from pathlib import Path
import email

# Data paths below are built relative to the directory the kernel was started in.
WORKING_DIRECTORY = Path.cwd()

# Ollama model tag passed to every generate() call in this notebook.
MODEL = 'llama3.2:1b'

# Pull the model before any generation is attempted.
ollama.pull(MODEL)

In [None]:
# obtain enron dataset and extract to enron_data folder
!wget -nc https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tar.gz
!mkdir enron_data
!tar -xzf enron_mail_20150507.tar.gz -C enron_data

In [None]:
def parse_file(path:Path) -> dict:
    """Parse a single raw email file located under a ``maildir`` directory.

    Args:
        path: Path to the email file. It must contain a ``maildir`` component;
            the component immediately after it is reported as the owner.

    Returns:
        A dict with keys ``'path'`` (the path as a string), ``'owner'`` (the
        path component following ``maildir``) and ``'message'`` (the email
        payload with newlines and tabs replaced by spaces, or ``None`` when the
        file could not be read or parsed into a single text payload).

    Raises:
        IndexError: If ``path`` has no ``maildir`` component, or ``maildir`` is
            its last component.
    """
    try:
        # The context manager closes the handle as soon as parsing is done, so
        # mapping this over a large corpus does not leak file descriptors.
        with open(path, 'r') as handle:
            payload = email.message_from_file(handle).get_payload()
        # A multipart message yields a list here; the resulting AttributeError
        # is caught below and treated as "no usable message".
        message = payload.replace('\n', ' ').replace('\t', ' ')
    except Exception:
        # Best-effort parsing: unreadable or undecodable files become None.
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate,
        # so a long batch run can be interrupted.
        message = None
    idx = [i+1 for i,x in enumerate(path.parts) if x=='maildir' ][0]
    return {
        'path':str(path),
        'owner':path.parts[idx],
        'message':message
    }


# Smoke-test the parser on a single known mailbox entry.
test_path = WORKING_DIRECTORY.joinpath('enron_data', 'maildir', 'ring-a', 'inbox', '1.')
parsed_sample = parse_file(test_path)
print(parsed_sample)


In [None]:

# Path.rglob() returns a one-shot generator: once a later cell has consumed it,
# re-running that cell would silently see zero files. Materialise the matches
# (names ending in '.') into a sorted list so downstream cells are re-runnable
# and always process the files in the same order.
email_paths = sorted((WORKING_DIRECTORY/'enron_data'/'maildir').rglob('*[.]'))


In [None]:
# Parse every discovered file into a {'path', 'owner', 'message'} record,
# then preview the first few records.
parsed_emails = [parse_file(email_path) for email_path in email_paths]
parsed_emails[:5]

In [None]:
# Tabulate the parsed records: one row per email file.
data_df = pd.DataFrame(
    parsed_emails,
    columns=['path', 'owner', 'message'],
)
data_df

In [None]:
# Number of emails whose message could not be parsed (stored as null).
print(data_df['message'].isna().sum())

# Drop, in place, every row that contains a null value.
data_df.dropna(inplace=True)

# Confirm that no null messages remain.
print(data_df['message'].isna().sum())

In [None]:
def tag_ollama(message):
    """Ask the configured Ollama model for a one-tag summary of an email.

    Args:
        message: Email text interpolated into the prompt.

    Returns:
        The value stored under the 'response' key of the generate() result.
    """
    prompt = f"""
    You are a bot summarizing an email into one tag. 
    This is the message: {message}
    Reply with one tag.
"""
    generation = ollama.generate(model=MODEL, prompt=prompt)
    return generation['response']

In [None]:
# Spot-check the tagger on one randomly drawn email.
sampled_row = data_df.sample()
message = sampled_row['message'].iloc[0]
response = tag_ollama(message)

In [None]:
# Show the sampled email followed by the tag the model produced for it.
print(message, response, sep='\n')

In [None]:
# Tag every remaining email; this issues one model call per row.
data_df['tag'] = data_df['message'].map(tag_ollama)