## Python statistics essential training - 03_08_email

Standard imports

In [1]:
import numpy as np
import scipy.stats
import pandas as pd

In [2]:
import matplotlib
import matplotlib.pyplot as pp

import pandas.plotting

from IPython import display
from ipywidgets import interact, widgets

%matplotlib inline

In [3]:
import re
import mailbox
import csv

### How I converted my mailbox.

In [4]:
mbox = mailbox.mbox('Sent.mbox')

The resulting object is array-like, with one entry per message. Each entry is dictionary like, with keys corresponding to metadata and data for each message.

In [5]:
mbox[0].keys()

KeyError: 'No message with key: 0'

The easiest way to get these data into Pandas is to build a CSV file from them. We use the module `csv` to write out the CSV file as we loop over the mailbox object. We save only subject, from, to, and date, and we write a simple header at the top with the names of columns.

In [None]:
with open('mbox.csv', 'w') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['subject','from','to','date'])
    
    for message in mbox:
        writer.writerow([message['subject'], message['from'], message['to'], message['date']])

All done! Thanks to Justin Ellis for inspiration with https://jellis18.github.io/post/2018-01-17-mail-analysis.

## Moving on!

In [6]:
messages = pd.read_csv('mbox-anonymized.csv')

In [7]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  1030 non-null   object
 1   from     1029 non-null   object
 2   to       1030 non-null   object
 3   date     1030 non-null   object
dtypes: object(4)
memory usage: 32.3+ KB


In [8]:
messages.head()

Unnamed: 0,subject,from,to,date
0,Why control quickly exactly capital.,"""Vallisneri, Michele (335S)"" <Michele.Vallisne...","""Trevor Charles"" <zrodriguez@hotmail.com>","Mon, 27 Nov 2017 14:18:46 -0800"
1,Work evidence from really threat sign store see.,Michele Vallisneri <[]>,"""Heather Hernandez"" <patriciagarcia@garcia-car...","Mon, 26 Mar 2018 18:38:16 -0700"
2,Lot where answer the law person.,"""Vallisneri, Michele (335S)"" <michele.vallisne...",Bryce Dudley <hailey63@frazier.com>,"Tue, 13 Feb 2018 22:54:50 +0000"
3,Difference hotel yard.,"""Vallisneri, Michele (335S)"" <Michele.Vallisne...",Kathryn Mathis <josephrebecca@grimes.com>,"Fri, 26 Jan 2018 11:40:04 -0800"
4,Long level mission energy candidate.,"""Vallisneri, Michele (335S)"" <michele.vallisne...","""Angela Mendoza"" <rkhan@yahoo.com>","Tue, 31 Oct 2017 00:13:00 +0000"


In [9]:
messages['from'][0]

'"Vallisneri, Michele (335S)" <Michele.Vallisneri@jpl.nasa.gov>'

In [10]:
re.search('<(.+)>',messages['from'][0])

<re.Match object; span=(29, 62), match='<Michele.Vallisneri@jpl.nasa.gov>'>

In [11]:
re.search('<(.+)>',messages['from'][0]).group(0)

'<Michele.Vallisneri@jpl.nasa.gov>'

In [12]:
re.search('<(.+)>',messages['from'][0]).group(1)

'Michele.Vallisneri@jpl.nasa.gov'

In [13]:
re.search('<(.+)>','Michele.Vallisneri@jpl.nasa.gov').group(1)

AttributeError: 'NoneType' object has no attribute 'group'

In [14]:
def clean_address(raw):
    match = re.search('<(.+)>',raw)
    
    if match is None:
        return raw
    else:
        return match.group(1)

In [15]:
clean_address(messages['from'][0])

'Michele.Vallisneri@jpl.nasa.gov'

In [16]:
messages['from'] = messages['from'].apply(clean_address)

TypeError: expected string or bytes-like object

In [None]:
%debug

> [0;32m/Users/craston/opt/anaconda3/lib/python3.7/re.py[0m(185)[0;36msearch[0;34m()[0m
[0;32m    183 [0;31m    """Scan through string looking for a match to the pattern, returning
[0m[0;32m    184 [0;31m    a Match object, or None if no match was found."""
[0m[0;32m--> 185 [0;31m    [0;32mreturn[0m [0m_compile[0m[0;34m([0m[0mpattern[0m[0;34m,[0m [0mflags[0m[0;34m)[0m[0;34m.[0m[0msearch[0m[0;34m([0m[0mstring[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    186 [0;31m[0;34m[0m[0m
[0m[0;32m    187 [0;31m[0;32mdef[0m [0msub[0m[0;34m([0m[0mpattern[0m[0;34m,[0m [0mrepl[0m[0;34m,[0m [0mstring[0m[0;34m,[0m [0mcount[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mflags[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> print(string)
nan


In [None]:
messages['from'] = messages['from'].dropna().apply(clean_address)
messages['to'] = messages['to'].dropna().apply(clean_address)

In [None]:
messages.head()

In [None]:
messages['date'][0]

In [None]:
pd.to_datetime(messages['date'][0]).tz_localize('UTC').tz_convert('America/Los_Angeles')

In [None]:
messages['date'] = messages['date'].apply(lambda s: pd.to_datetime(s).tz_localize('UTC').tz_convert('America/Los_Angeles'))

In [None]:
messages.date.head()

In [None]:
messages.date.min(), messages.date.max()

In [None]:
messages.date.dt.weekday_name.head()

In [None]:
messages['dayofweek'] = pd.Categorical(messages['date'].dt.weekday_name,
                                       ordered=True,
                                       categories=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])

In [None]:
messages['timeofday'] = messages['date'].dt.hour + messages['date'].dt.minute / 60

In [None]:
messages['nyear'] = messages['date'].dt.year + messages['date'].dt.dayofyear/365.25 + messages['timeofday']/24/365.25 

In [None]:
messages.plot.scatter('nyear','timeofday',s=2)

In [None]:
messages.nyear.hist()

In [None]:
messages.timeofday.hist()

In [None]:
messages.dayofweek.value_counts()

In [None]:
counts = messages.dayofweek.value_counts(sort=False)
counts.plot(kind='bar')