<a href="https://colab.research.google.com/github/maReins/enronEntropy/blob/main/enronEntropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reins AI Entropy Calculation over the Enron Corpus

Copyright (c) 2024 Reins AI, LLC

This project is licensed under the MIT License. See the LICENSE file for details.

## MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

In [None]:
#Extract Enron dataset from tar in gdrive and convert into CSV for further processing. Change paths where indicated.
import os
import tarfile
import email
import pandas as pd
from email.parser import Parser
from tqdm import tqdm  # for progress bar

from google.colab import drive
# Mount Google Drive at /content/gdrive; the archive, extraction and CSV paths
# used further down in this cell all live under that mount point.
drive.mount('/content/gdrive')

def extract_tar(tar_path, extract_path):
    """Extract a gzip-compressed tar archive into a directory.

    Args:
        tar_path: Path to the ``.tar.gz`` archive.
        extract_path: Directory the archive members are extracted into.

    Raises:
        tarfile.TarError: If the archive is unreadable, or (when extraction
            filters are available) a member is rejected as unsafe.
        OSError: If the archive cannot be opened or files cannot be written.
    """
    with tarfile.open(tar_path, 'r:gz') as tar:
        print("Extracting files...")
        if hasattr(tarfile, 'data_filter'):
            # The 'data' filter rejects members that would land outside
            # extract_path (absolute paths, '..', escaping links).
            tar.extractall(path=extract_path, filter='data')
        else:
            # Older interpreters without extraction filters: keep the legacy
            # behaviour instead of failing on the unknown keyword.
            tar.extractall(path=extract_path)
    print("Extraction complete.")

def parse_email(file_path):
    """Read one email file and return its main headers and body as a dict.

    The file is decoded as UTF-8 and undecodable bytes are dropped.

    Args:
        file_path: Path of the raw email file.

    Returns:
        A dict with the keys 'subject', 'from', 'to', 'date' and 'body'.
        Header values are None when the header is absent.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        message = Parser().parsestr(handle.read())

    # Header lookups first, body last, so the dict keeps its column order.
    record = {header: message[header] for header in ('subject', 'from', 'to', 'date')}
    record['body'] = get_email_body(message)
    return record

def get_email_body(parsed_email):
    """Return the plain-text body of a parsed email message.

    For a non-multipart message the payload is returned unchanged. For a
    multipart message every ``text/plain`` part is collected, including parts
    nested inside inner multipart containers (for example a
    ``multipart/alternative`` inside a ``multipart/mixed``), and the parts are
    joined with newlines.

    Args:
        parsed_email: An ``email.message.Message`` instance.

    Returns:
        The body text; an empty string when a multipart message has no
        ``text/plain`` part.
    """
    if not parsed_email.is_multipart():
        return parsed_email.get_payload()

    # walk() visits the whole MIME tree; iterating only get_payload() would
    # miss text/plain parts that sit below the first nesting level.
    return '\n'.join(
        part.get_payload()
        for part in parsed_email.walk()
        if part.get_content_type() == 'text/plain' and not part.is_multipart()
    )

def process_enron_data(root_dir):
    """Parse every email file below a directory into a DataFrame.

    The tree is walked with a tqdm progress bar (one tick per directory).
    Only file names ending in '.' are treated as emails. A file that fails
    to parse is reported on stdout and skipped.

    Args:
        root_dir: Root of the directory tree to scan.

    Returns:
        A pandas DataFrame with one row per successfully parsed email.
    """
    records = []
    walker = tqdm(os.walk(root_dir), desc="Processing emails")
    for folder, _subdirs, filenames in walker:
        for filename in filenames:
            if not filename.endswith('.'):
                continue
            full_path = os.path.join(folder, filename)
            try:
                records.append(parse_email(full_path))
            except Exception as exc:
                # Best effort: report the failure and carry on with the rest.
                print(f"Error processing {full_path}: {exc}")

    return pd.DataFrame(records)

# Paths
#add your path
tar_path = '/content/gdrive/My Drive/enron_mail_20150507.tar.gz'
extract_path = '/content/gdrive/My Drive/enron_extracted'
maildir_path = os.path.join(extract_path, 'maildir')

# Extract the tar file
extract_tar(tar_path, extract_path)

# Process the extracted emails
df = process_enron_data(maildir_path)

# Now you can use this DataFrame for your analysis
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()

# Save to CSV if needed
df.to_csv('/content/gdrive/My Drive/enron_emails.csv', index=False)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Extracting files...


In [None]:
#Re-parse the extracted email files from disk (no progress bar) and save the resulting DataFrame to enron_emails_clean.csv
def parse_email(file_path):
    """Load a single email file and extract its headers and body.

    The file is read as UTF-8 text; bytes that cannot be decoded are ignored.

    Args:
        file_path: Location of the raw email file.

    Returns:
        A dict keyed by 'subject', 'from', 'to', 'date' and 'body'. A missing
        header yields None for that key.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as source:
        raw_text = source.read()

    msg = Parser().parsestr(raw_text)

    row = {}
    for field in ('subject', 'from', 'to', 'date'):
        row[field] = msg[field]
    row['body'] = get_email_body(msg)
    return row

def get_email_body(parsed_email):
    """Extract the plain-text body from a parsed email message.

    A non-multipart message returns its payload as is. A multipart message
    returns all of its ``text/plain`` parts joined by newlines; the whole MIME
    tree is searched so that text nested inside inner multipart containers is
    not lost.

    Args:
        parsed_email: An ``email.message.Message`` instance.

    Returns:
        The body text; an empty string when a multipart message contains no
        ``text/plain`` part.
    """
    if parsed_email.is_multipart():
        text_parts = []
        # walk() descends into nested multiparts, unlike a single pass over
        # the top-level get_payload() list.
        for part in parsed_email.walk():
            if part.is_multipart():
                continue
            if part.get_content_type() == 'text/plain':
                text_parts.append(part.get_payload())
        return '\n'.join(text_parts)
    return parsed_email.get_payload()

def process_enron_data(root_dir):
    """Collect all email files under a directory tree into a DataFrame.

    File names ending in '.' are treated as emails. Files that raise while
    being parsed are reported on stdout and left out of the result.

    Args:
        root_dir: Top of the directory tree to scan.

    Returns:
        A pandas DataFrame with one row per email that parsed successfully.
    """
    parsed_rows = []
    for folder, _subdirs, filenames in os.walk(root_dir):
        email_names = (name for name in filenames if name.endswith('.'))
        for name in email_names:
            path = os.path.join(folder, name)
            try:
                row = parse_email(path)
            except Exception as err:
                # Keep going: one unreadable file should not abort the scan.
                print(f"Error processing {path}: {err}")
            else:
                parsed_rows.append(row)

    return pd.DataFrame(parsed_rows)

# Usage
root_dir = '/content/gdrive/My Drive/enron_extracted'  # Replace with your actual path
df = process_enron_data(root_dir)

# Now you can use this DataFrame for your analysis
print(df.head())
# DataFrame.info() writes its summary directly and returns None; calling it
# without print() avoids an extra "None" line in the cell output.
df.info()

# Save to CSV if needed
df.to_csv('/content/gdrive/My Drive/enron_emails_clean.csv', index=False)

In [None]:
#compute entropy and mutual information on emails by month

from collections import defaultdict
from datetime import timezone
from email.utils import parsedate_to_datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.feature_extraction.text import CountVectorizer

def _parse_email_date(raw):
    """Convert one RFC 2822 ``Date`` header string to a UTC datetime, or NaT."""
    if not isinstance(raw, str):
        return pd.NaT
    try:
        parsed = parsedate_to_datetime(raw)
        if parsed.tzinfo is None:
            # No usable offset in the header: treat the wall time as UTC.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)
    except (TypeError, ValueError, OverflowError):
        return pd.NaT


def load_enron_data(file_path):
    """Load the parsed-email CSV and convert its ``date`` column to datetimes.

    The CSV is expected to be the one written by the extraction cells, i.e.
    with the columns subject, from, to, date, body. The ``date`` values are
    email ``Date`` headers such as
    ``"Mon, 14 May 2001 16:39:00 -0700 (PDT)"``; they carry differing UTC
    offsets, so each one is parsed as an RFC 2822 date and normalised to UTC
    to obtain a single timezone-aware datetime column that supports ``.dt``.
    Values that cannot be parsed become NaT.

    Args:
        file_path: Path to the CSV file.

    Returns:
        The loaded DataFrame with ``date`` as a UTC datetime column.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If the CSV has no ``date`` column.
    """
    df = pd.read_csv(file_path)
    df['date'] = pd.to_datetime(
        df['date'].map(_parse_email_date), utc=True, errors='coerce'
    )
    return df

def calculate_entropy(text):
    """Return the Shannon entropy of the word distribution of a text.

    The text is tokenised with a default ``CountVectorizer`` and the entropy
    of the relative word frequencies is computed with ``scipy.stats.entropy``
    using its default base (natural logarithm, i.e. nats).

    Args:
        text: The text to analyse. Non-string values (for example NaN from an
            empty CSV cell) and texts without any countable token are
            accepted.

    Returns:
        The entropy as a float; 0.0 when the text is not a string, is blank,
        or contains no token the vectorizer keeps.
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0

    try:
        counts = CountVectorizer().fit_transform([text]).toarray()[0]
    except ValueError:
        # CountVectorizer raises ValueError for an empty vocabulary, e.g.
        # when the text holds only punctuation or one-character tokens.
        return 0.0

    return float(entropy(counts / counts.sum()))

def calculate_mutual_information(text1, text2):
    """Return the mutual information (in bits) between document and word.

    The two texts are tokenised with a shared default ``CountVectorizer`` and
    their word counts form a 2 x V contingency table (document x word). The
    table is normalised to a joint distribution p(d, w) and the result is

        I(D; W) = sum over d, w of p(d, w) * log2(p(d, w) / (p(d) * p(w)))

    The value is 0 when both texts have the same relative word frequencies
    and grows (to at most 1 bit) as their vocabularies diverge.

    NOTE(review): the previous implementation built the joint as
    ``outer(p1, p2)``, which is independent by construction, so the log ratio
    was always log2(1) and the result was 0 (or NaN whenever a word was
    missing from one text). Confirm that the contingency-table definition
    used here is the quantity intended for the analysis.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The mutual information as a float; 0.0 when either argument is not a
        non-blank string or when no countable token is found.
    """
    if not all(isinstance(t, str) and t.strip() for t in (text1, text2)):
        return 0.0

    try:
        counts = CountVectorizer().fit_transform([text1, text2]).toarray()
    except ValueError:
        # Empty vocabulary: neither text contains a token the vectorizer keeps.
        return 0.0

    joint = counts.astype(float) / counts.sum()
    p_doc = joint.sum(axis=1, keepdims=True)
    p_word = joint.sum(axis=0, keepdims=True)
    expected = p_doc * p_word

    # Zero-probability cells contribute nothing; masking them avoids log2(0).
    observed = joint > 0
    return float(np.sum(joint[observed] * np.log2(joint[observed] / expected[observed])))

def analyze_enron_data(df, start_year, end_year):
    """Compute per-month average entropy and mutual information of emails.

    Emails whose ``date`` year lies in ``[start_year, end_year]`` are grouped
    by calendar month. For every month the mean of ``calculate_entropy`` over
    the email bodies is stored, together with the mean of
    ``calculate_mutual_information`` over each pair of neighbouring bodies
    (in DataFrame row order). Rows with a missing body are ignored, and
    months without any usable email are omitted from both results.

    Args:
        df: DataFrame with a datetime ``date`` column and a ``body`` column.
        start_year: First year to include.
        end_year: Last year to include.

    Returns:
        A tuple ``(monthly_entropy, monthly_mi)`` of plain dicts keyed by
        ``'YYYY-MM'`` in chronological order. ``monthly_mi`` is 0.0 for a
        month that has a single email.
    """
    years = df['date'].dt.year
    in_range = df[(years >= start_year) & (years <= end_year)]

    monthly_entropy = {}
    monthly_mi = {}

    # Grouping on the formatted 'YYYY-MM' label avoids pandas frequency
    # aliases (the month-end alias 'M' is deprecated in recent releases);
    # groupby sorts the labels, which is chronological for this format.
    month_labels = in_range['date'].dt.strftime('%Y-%m')
    for month_key, group in in_range.groupby(month_labels):
        # Empty CSV cells come back as NaN and carry no text to analyse.
        bodies = group['body'].dropna().astype(str).tolist()
        if not bodies:
            continue

        monthly_entropy[month_key] = float(
            np.mean([calculate_entropy(body) for body in bodies])
        )

        if len(bodies) > 1:
            pair_mi = [
                calculate_mutual_information(current, following)
                for current, following in zip(bodies, bodies[1:])
            ]
            monthly_mi[month_key] = float(np.mean(pair_mi))
        else:
            monthly_mi[month_key] = 0.0

    return monthly_entropy, monthly_mi

def _label_month_axis(ax, title, ylabel):
    """Apply title, axis labels and rotated month tick labels to one axes."""
    ax.set_title(title)
    ax.set_xlabel('Month')
    ax.set_ylabel(ylabel)
    # Rotate the tick labels matplotlib already generated rather than
    # overwriting them with set_xticklabels(), which requires fixed ticks.
    ax.tick_params(axis='x', labelrotation=45)


def plot_metrics(monthly_entropy, monthly_mi):
    """Plot monthly average entropy and mutual information as two line charts.

    Each mapping is plotted against its own keys, so the x values always
    match the plotted y values.

    Args:
        monthly_entropy: Mapping of month label to average entropy.
        monthly_mi: Mapping of month label to average mutual information.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

    ax1.plot(list(monthly_entropy.keys()), list(monthly_entropy.values()), marker='o')
    _label_month_axis(ax1, 'Monthly Average Entropy', 'Entropy')

    ax2.plot(list(monthly_mi.keys()), list(monthly_mi.values()), marker='o', color='r')
    _label_month_axis(ax2, 'Monthly Average Mutual Information', 'Mutual Information')

    plt.tight_layout()
    plt.show()

# Main execution: load the cleaned CSV, analyse the chosen years, plot the result.
# Point file_path at the clean CSV written by the previous cell.
file_path = '/content/gdrive/My Drive/enron_emails_clean.csv'  # Replace with your actual file path
df = load_enron_data(file_path)

# Inclusive year range to analyse; replace with your desired start and end year.
start_year, end_year = 2000, 2002

monthly_entropy, monthly_mi = analyze_enron_data(
    df,
    start_year,
    end_year,
)
plot_metrics(monthly_entropy, monthly_mi)

FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_enron_dataset.csv'