Install Data.World

$ pip install git+git://github.com/datadotworld/data.world-py.git

Then, go to https://data.world/settings/advanced and get your API token
    
Then simply run the following in Terminal and enter one time Token

$ dw configure

In [1]:
import datadotworld as dw
import pandas as pd
import numpy as np
import pprint as pp
import os

In [2]:
#Load Meta Data from data.world
dataset = dw.load_dataset('sketchcity/city-of-houston-email-metadata-january-march-2017')
pp.pprint(dataset.describe())

{'description': 'Metadata from houstontx.gov email addresses\n'
                '\n'
                'Data includes:\n'
                '- Sender\n'
                '- To\n'
                '- CC\n'
                '- BCC\n'
                '- Sent date/time\n'
                '- Received date/time\n'
                '\n'
                'Assembled by Matt Chapman (hubblefisher@gmail.com) through '
                'FOIA request.',
 'homepage': 'https://data.world/sketchcity/city-of-houston-email-metadata-january-march-2017',
 'license': 'Public Domain',
 'name': 'sketchcity_city-of-houston-email-metadata-january-march-2017',
 'resources': [{'bytes': 1370801133,
                'format': 'csv',
                'mediatype': 'text/csv',
                'name': 'original/coh_email_metadata_1Q17.csv',
                'path': 'original/coh_email_metadata_1Q17.csv'}],
 'title': 'City of Houston Email Metadata: 1/1/17 - 2/23/17'}


In [3]:
#Import data
from io import BytesIO
import csv
df = pd.read_csv(BytesIO(dataset.raw_data['original/coh_email_metadata_1Q17.csv']))[0:1000000]

In [4]:
#View Data size, Null value counts and sample data
print(df.shape)
print(df.isnull().sum())
df.head()

(1000000, 6)
Sender           4
To            7158
CC          856428
BCC         971424
Sent            27
Received        27
dtype: int64


Unnamed: 0,Sender,To,CC,BCC,Sent,Received
0,Houston Parks and Recreation Department <Houst...,"Lathan, Debra - PRD <Debra.Lathan@houstontx.gov>",,,2017-02-14 06:55:54Z,2017-02-14 06:55:52Z
1,Houston Parks and Recreation Department <Houst...,"Lathan, Debra - PRD <Debra.Lathan@houstontx.gov>",,,2017-02-14 06:55:54Z,2017-02-14 06:55:52Z
2,"DIEP, BA <Ba.Diep@tsa.dhs.gov>",HOUcoordinationcenter <HOUcoordinationcenter@t...,,,2017-02-14 06:57:22Z,2017-02-14 06:57:31Z
3,has.nicealerts@houstontx.gov <has.nicealerts@h...,"Ulrich, David - HAS <David.Ulrich@houstontx.gov>",,,2017-02-14 06:57:54Z,2017-02-14 06:57:54Z
4,has.movi@has.net <has.movi@has.net>,"Lueders, Ulf - HAS <Ulf.Lueders@houstontx.gov>",,,2017-02-14 06:58:01Z,2017-02-14 06:58:01Z


In [5]:
#Delete Null rows & SystemMailboxMails in relevant columns
df.dropna(subset=['Sender','To','Sent','Received'], how='any', inplace=True)
df = df[df.Sender.str.contains("SystemMailbox") == False]
df = df[df.To.str.contains("SystemMailbox") == False]
df = df.reset_index(drop=True)
print(df.shape)
print(df.isnull().sum())

(986317, 6)
Sender           0
To               0
CC          843262
BCC         967868
Sent             0
Received         0
dtype: int64


In [6]:
#Dropping rows with incorrect times(some rows contain email ids in time columns) and deleting time columns
df = df.drop(df.index[[x for x in range(df.shape[0]) if (df['Sent'][x][:5]!='2017-' or df['Received'][x][:5]!='2017-')]]).reset_index(drop=True)
df.drop(['Sent', 'Received'], axis=1)
df.shape

(986290, 6)

In [7]:
#creating emails column containing all email ids present in an email
df['Emails'] = df[df.columns[0:4]].apply(lambda x: ','.join(x.astype(str)),axis=1)
#Extracting email ids as list in each row
import re
df['Emails'] = df['Emails'].apply(lambda x: re.findall(r'<(.*?)\>', x))

In [23]:
df['Emails']

0         [HoustonParksandRecreationDepartment@active.co...
1         [HoustonParksandRecreationDepartment@active.co...
2         [Ba.Diep@tsa.dhs.gov, HOUcoordinationcenter@ts...
3         [has.nicealerts@houstontx.gov, David.Ulrich@ho...
4             [has.movi@has.net, Ulf.Lueders@houstontx.gov]
5         [Francis.Roberto@houstontx.gov, Oscar.Segura@h...
6             [has.movi@has.net, Ulf.Lueders@houstontx.gov]
7             [btinfo@btol.com, Esther.Reyes@houstontx.gov]
8               [btinfo@btol.com, Meryl.Bote@houstontx.gov]
9              [btinfo@btol.com, Marla.Garza@houstontx.gov]
10                [btinfo@btol.com, Pat.Ogea@houstontx.gov]
11        [Old6WardHouston@yahoogroups.com, old6wardhous...
12        [passenger.terminal.expo@mail.ukipmemails.com,...
13        [Emily.Schaefer@houstontx.gov, daryl.h@houston...
14         [info@houstonlibrary.org, hplblog@houstontx.gov]
15        [Emily.Schaefer@houstontx.gov, daryl.h@houston...
16            [has.movi@has.net, Ulf.Lue