### README
Place this notebook and the raw dataset archive (`enron1.tar.gz`) in the same directory.  
Extract the archive before running the notebook:  
```
tar -zxf enron1.tar.gz
```

In [1]:
import json
import os
import pandas
import re
import shutil
import sys

In [2]:
# Install dependencies
# pyarrow is installed with the interpreter that runs this kernel (sys.executable),
# so the package lands in the environment the notebook actually uses.
# NOTE(review): presumably needed as the engine behind DataFrame.to_parquet - confirm
!{sys.executable} -m pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-10.0.1-cp38-cp38-win_amd64.whl (20.3 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-10.0.1


In [3]:
# Directory holding the raw dataset
input_path = 'enron1'

# Root directory for everything this notebook generates
output_path = 'enron1_output'

# One sub-directory per export format, all directly below output_path
json_output_path, xml_output_path, csv_output_path, parquet_output_path = (
    os.path.join(output_path, folder)
    for folder in ('JSON', 'XML', 'CSV', 'PARQUET')
)

In [4]:
# Clean up
# The output directory does not exist on the first run; calling rmtree
# unconditionally would raise FileNotFoundError and abort the notebook.
if os.path.isdir(output_path):
    shutil.rmtree(output_path)

# Initialize
# Each call also creates output_path itself if it is missing.
os.makedirs(json_output_path)
os.makedirs(xml_output_path)
os.makedirs(csv_output_path)
os.makedirs(parquet_output_path)

In [5]:
def files_with(n):
    """Yield a descriptor for every ``.txt`` file whose name starts with *n*.

    Every immediate sub-directory of ``input_path`` is treated as a label;
    the files directly inside it are the candidates. Non-directory entries
    of ``input_path`` are ignored. Directory listings are sorted so the
    iteration order (and therefore the exported row order) is reproducible.

    Args:
        n: Filename prefix to select, e.g. ``'0'``.

    Yields:
        dict: ``'id'`` (filename without the trailing ``.txt``), ``'label'``
        (name of the containing sub-directory) and ``'path'`` (path to the
        file, starting with ``input_path``).
    """
    suffix = '.txt'
    for label in sorted(os.listdir(input_path)):
        label_dir = os.path.join(input_path, label)
        if not os.path.isdir(label_dir):
            continue
        for name in sorted(os.listdir(label_dir)):
            if name.startswith(n) and name.endswith(suffix):
                yield {
                    # Strip only the trailing extension; str.replace would
                    # also remove a '.txt' appearing inside the name.
                    'id': name[:-len(suffix)],
                    'label': label,
                    'path': os.path.join(label_dir, name)
                }

In [6]:
def sanitized_text(text):
    """Return *text* with non-word characters blanked and whitespace collapsed.

    Every character that is not a word character (letter, digit or
    underscore) is replaced by a space, then every run of two or more
    whitespace characters is collapsed into a single space. The result is
    not stripped, so it may start or end with one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; ``''`` when *text* is empty.
    """
    # Raw strings: '\W' and '\s' in a normal literal are invalid escape
    # sequences and trigger a DeprecationWarning / SyntaxWarning.
    # Remove all non-word characters
    text = re.sub(r'\W', ' ', text)
    # Remove redundant whitespaces
    text = re.sub(r'\s{2,}', ' ', text)
    return text

### 0. JSON

In [7]:
def dump_as_json_for_files_with(n):
    """Write one JSON file per non-empty email whose filename starts with *n*.

    Each selected file is read as ISO-8859-1 and sanitized. Unless the
    sanitized text is empty, an object with the keys ``id``, ``label`` and
    ``text`` is written to ``<json_output_path>/<id>.json``.

    Args:
        n: Filename prefix passed on to ``files_with``.
    """
    for entry in files_with(n):

        # Load and clean the raw email
        with open(entry['path'], 'r', encoding='ISO-8859-1') as source:
            cleaned = sanitized_text(source.read())

        # Nothing left after sanitizing: skip this email
        if not cleaned:
            continue

        record = {
            'id': entry['id'],
            'label': entry['label'],
            'text': cleaned
        }

        # Write the record next to the other JSON exports
        target = os.path.join(json_output_path, entry['id'] + '.json')
        with open(target, 'w') as sink:
            json.dump(record, sink)

In [8]:
# Export every email whose filename starts with '0' as its own JSON file
dump_as_json_for_files_with('0')

### 1. XML

In [9]:
def dump_as_xml_for_files_with(n):
    """Write one XML file per non-empty email whose filename starts with *n*.

    Each selected file is read as ISO-8859-1 and sanitized. Unless the
    sanitized text is empty, a single-line document of the form
    ``<body><id>..</id><label>..</label><text>..</text></body>`` is written
    to ``<xml_output_path>/<id>.xml``.

    Args:
        n: Filename prefix passed on to ``files_with``.
    """
    # Local import: only this exporter needs it
    from xml.sax.saxutils import escape

    for d in files_with(n):

        # Read data
        with open(d['path'], 'r', encoding='ISO-8859-1') as source:
            text = sanitized_text(source.read())

        # Proceed only if text is not empty
        # (test the variable, not the literal 'text', which is never empty)
        if not text:
            continue

        # Escape &, < and > so ids or labels containing them cannot break
        # the markup.
        document = ''.join([
            '<body>',
            '<id>', escape(d['id']), '</id>',
            '<label>', escape(d['label']), '</label>',
            '<text>', escape(text), '</text>',
            '</body>',
        ])

        # Export data
        # Without an XML declaration parsers assume UTF-8, so write UTF-8
        # explicitly instead of the platform default encoding.
        target_path = os.path.join(xml_output_path, d['id'] + '.xml')
        with open(target_path, 'w', encoding='utf-8') as target:
            target.write(document)

In [10]:
# Export every email whose filename starts with '1' as its own XML file
dump_as_xml_for_files_with('1')

### 2. CSV

In [11]:
def dump_as_csv_for_files_with(n):
    """Write all non-empty emails whose filename starts with *n* to one CSV.

    Each selected file is read as ISO-8859-1 and sanitized; emails whose
    sanitized text is empty are skipped. The remaining rows are written to
    ``<csv_output_path>/emails-<n>.csv`` with the columns ``id``, ``label``
    and ``text`` and without the index column.

    Args:
        n: Filename prefix passed on to ``files_with``; also used in the
            output filename.
    """
    # Collect plain dicts and build the frame once: DataFrame.append is
    # removed in pandas 2.0 and copies the whole frame on every call.
    rows = []

    for d in files_with(n):

        # Read data
        with open(d['path'], 'r', encoding='ISO-8859-1') as source:
            text = sanitized_text(source.read())

        # Proceed only if text is not empty
        if not text:
            continue

        rows.append({'id': d['id'], 'label': d['label'], 'text': text})

    # Passing columns keeps the header even when no row was collected
    df = pandas.DataFrame(rows, columns=['id', 'label', 'text'])

    # Export data
    df.to_csv(os.path.join(csv_output_path, 'emails-' + n + '.csv'), index=False)

In [12]:
# Export every email whose filename starts with '2' into emails-2.csv
dump_as_csv_for_files_with('2')

### 3. AVRO

In [13]:
# Exported as CSV (emails-3.csv) here
# To be transformed into AVRO format using Hive
dump_as_csv_for_files_with('3')

### 4. RCFILE

In [14]:
# Exported as CSV (emails-4.csv) here
# To be transformed into RCFILE format using Hive
dump_as_csv_for_files_with('4')

### 5. PARQUET

In [15]:
def dump_as_parquet_for_files_with(n):
    """Write all non-empty emails whose filename starts with *n* to one Parquet file.

    Each selected file is read as ISO-8859-1 and sanitized; emails whose
    sanitized text is empty are skipped. The remaining rows are written to
    ``<parquet_output_path>/emails-<n>.parquet`` with the columns ``id``,
    ``label`` and ``text``.

    Args:
        n: Filename prefix passed on to ``files_with``; also used in the
            output filename.
    """
    # Collect plain dicts and build the frame once: DataFrame.append is
    # removed in pandas 2.0 and copies the whole frame on every call.
    rows = []

    for d in files_with(n):

        # Read data
        with open(d['path'], 'r', encoding='ISO-8859-1') as source:
            text = sanitized_text(source.read())

        # Proceed only if text is not empty
        if not text:
            continue

        rows.append({'id': d['id'], 'label': d['label'], 'text': text})

    # Passing columns keeps the schema even when no row was collected
    df = pandas.DataFrame(rows, columns=['id', 'label', 'text'])

    # Export data
    df.to_parquet(os.path.join(parquet_output_path, 'emails-' + n + '.parquet'))

In [16]:
# Export every email whose filename starts with '5' into emails-5.parquet
dump_as_parquet_for_files_with('5')