In [57]:
import numpy as np
import pandas as pd
import re
import string
from collections import defaultdict
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans

In [2]:
emails = pd.read_csv('hillary-clinton-emails/Emails.csv')
receivers = pd.read_csv('hillary-clinton-emails/EmailReceivers.csv')
persons = pd.read_csv('hillary-clinton-emails/Persons.csv')
aliases = pd.read_csv('hillary-clinton-emails/Aliases.csv')

In [3]:
emails.columns

Index(['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'MetadataFrom',
       'SenderPersonId', 'MetadataDateSent', 'MetadataDateReleased',
       'MetadataPdfLink', 'MetadataCaseNumber', 'MetadataDocumentClass',
       'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedCc',
       'ExtractedDateSent', 'ExtractedCaseNumber', 'ExtractedDocNumber',
       'ExtractedDateReleased', 'ExtractedReleaseInPartOrFull',
       'ExtractedBodyText', 'RawText'],
      dtype='object')

## Предобработка текста

Все тексты содержатся в колонках ExtractedBodyText и RawText. В ExtractedBodyText -- предобработанные создателями датасета тексты из RawText, но в них нет forward-ов и attachment-ов, поэтому лучше будем работать с текстами из колонки RawText, убрав из них заголовки писем и пометки US State Department, а также удалив пустые строки и пунктуацию.

In [4]:
raw_email_texts = emails['RawText'].dropna()

In [5]:
def remove_state_dept_info(email, axis):
    email = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', email)
    email = re.sub(r'UNCLASS.*?NO FOIA WAIVER\..(RELEASE IN.?(FULL|PART)?|STATE-\w+)?','',
                   email, flags=re.DOTALL)
    email = re.sub(r'^UNCLASSIFIED.*$', '', email, flags=re.MULTILINE)
    email = re.sub(r'^RELEASE IN (FULL|PART).*$', '', email, flags=re.MULTILINE)
    return email

In [6]:
def remove_email_header(email, axis):
    email = re.sub(r'^(From|Re|Sent|To|Cc|cc|CC|Importance|For|Attachments|Subject|Fw|Fwd).*$', '',
                   email, flags=re.MULTILINE)
    email = re.sub(r'B\d','',email)
    return email

In [7]:
def remove_empty_lines_and_punctuation(email, axis):
    email = '\n'.join([s for s in email.splitlines() if s])
    return ''.join([ch for ch in email if ch not in set(string.punctuation)])

In [8]:
email_texts_prepared = raw_email_texts.apply(remove_state_dept_info, axis=0)
email_texts_prepared = email_texts_prepared.apply(remove_email_header, axis=0)
email_texts_prepared = email_texts_prepared.apply(remove_empty_lines_and_punctuation, axis=0)

In [9]:
email_texts_prepared[3500]

'1213\n5pm\nSECRETARY OF STATE HILLARY RODHAM CLINTON\nTILE HUMAN RIGHTS AGENDA FOR THE 21sT CENTURY\nGEORGETOWN UNIVERSITY\nWASHINGTON DC\nDECEMBER 14 2009\n1\n2\n3\n4\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n1\n16\n17\n\n18'

In [10]:
bigrams = defaultdict(int)
for email in email_texts_prepared:
    tokens = re.split(r'\s+', email)
    if len(tokens) > 1:
        token1, token2 = tokens[0], tokens[1]
        bigrams[(token1, token2)] += 1
        for token in tokens[2:]:
            token1 = token2
            token2 = token
            bigrams[(token1, token2)] += 1

In [11]:
sorted_bigrams = list(sorted(bigrams.items(), key=lambda x: -x[1]))

In [56]:
sorted_bigrams[:30]

[(('of', 'the'), 13497),
 (('in', 'the'), 9082),
 (('Original', 'Message'), 7190),
 (('to', 'the'), 6197),
 (('on', 'the'), 4716),
 (('and', 'the'), 4060),
 (('for', 'the'), 3854),
 (('that', 'the'), 3546),
 (('with', 'the'), 3084),
 (('to', 'be'), 2977),
 (('at', 'the'), 2526),
 (('will', 'be'), 2357),
 (('by', 'the'), 2289),
 (('the', 'US'), 2172),
 (('is', 'a'), 2083),
 (('from', 'the'), 2055),
 (('United', 'States'), 1894),
 (('of', 'a'), 1828),
 (('the', 'United'), 1762),
 (('as', 'a'), 1754),
 (('has', 'been'), 1711),
 (('in', 'a'), 1638),
 (('is', 'the'), 1454),
 (('would', 'be'), 1418),
 (('for', 'a'), 1396),
 (('it', 'is'), 1353),
 (('as', 'the'), 1307),
 (('have', 'been'), 1239),
 (('want', 'to'), 1236),
 (('of', 'State'), 1235)]

In [51]:
all_words = []
for email in email_texts_prepared:
    tokens = re.split(r'\s+', email)
    all_words.extend([token for token in tokens if len(token) > 0])

In [54]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(all_words)
finder.apply_freq_filter(120)

In [55]:
finder.nbest(bigram_measures.pmi, 30) 

[('Sinn', 'Fein'),
 ('AGIS', 'DoS'),
 ('OFFICE', 'TIME'),
 ('DAS', 'AGIS'),
 ('Human', 'Rights'),
 ('Source', 'Comment'),
 ('Saudi', 'Arabia'),
 ('per', 'cent'),
 ('Northern', 'Ireland'),
 ('Lona', 'Valmoro'),
 ('Private', 'Residence'),
 ('Sullivan', 'Jacob'),
 ('SECRETARY', 'CLINTON'),
 ('Class', 'CONFIDENTIAL'),
 ('Reason', '14B'),
 ('prime', 'minister'),
 ('En', 'route'),
 ('D', 'MillsCDstategov'),
 ('Jacob', 'J'),
 ('14B', '14D'),
 ('Street', 'NW'),
 ('Tea', 'Party'),
 ('Tony', 'Blair'),
 ('Middle', 'East'),
 ('civil', 'society'),
 ('North', 'Korea'),
 ('West', 'Bank'),
 ('Special', 'Assistant'),
 ('Operations', 'Center'),
 ('sensitive', 'source')]

In [58]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(email_texts_prepared.values)

In [65]:
names = vectorizer.get_feature_names()

In [88]:
k_means = KMeans(n_clusters=15)
k_means.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=15, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [92]:
pred_labels = k_means.predict(X)

In [108]:
for class_label in range(15):
    print('Class {}:'.format(class_label))
    class_members = email_texts_prepared.values[pred_labels==class_label]
    chosen = np.random.choice(np.arange(class_members.shape[0]))
    print(class_members[chosen])

Class 0:

Class 1:
Saturday April 11 2009 659 PM
Delivered Any news
detailstxt
Your message was delivered to the recipient
Class 2:
The position is not clearly defined
14B14D
Fyi below
Mills Cheryl D MillsCDstategov
Tuesday February 23 2010 738 AM
FW Rio GroupHaiti
Classified by DAS AGIS DoS on 08272015 — Class CONFIDENTIAL — Reason 14B 14D —
Declassify on 02232020
Original Message
Thomas A
14B
1 4C
Original Message
Thomas A
14B
14D
JulissaCheryl Per Foreign Minister Fernandez
Original Message
Simons Paul E Santiago
14D
Including Tom Shannon and Paul Simons who know more
Original Message 
What is the position To whom does it report What is the function
cdm
Original Message
Julissa
Class 3:
RELEASE IN
FULL
When am I supposed to call Libyans
Original Message
Class 4:
Jerusalem Post Prime Minister Netanyahu met with Likud MKs and deputy ministers to discuss the US
proposal for a 90day construction freeze November 21 In the meeting Netanyahu told the Likud officials I
have a responsibility

the to in of original message and for on is this will you draft at
delivered recipient your was message the to detailstxt pverveer saturday april 11 659 2009 pm
14d 14b agis declassify das dos class classified on confidential reason 14b14d 08272015 the by
release full in part original message call you to tomorrow will can the at talk
the to israel of palestinian israeli in and said palestinians talks jerusalem netanyahu that peace
the of to and in on for is that with we said be will as
to you original message call the can and will for me do we in if
abedin huma pm 2009 2010 abedinhstategov hrod17clintonemailcom am sullivan sunday sullivanjjstategov jacob message original to
pm secretarys am office room depart arrive en route residence meeting private department state daily
original message to ok you call can do the and for him will we it
the to and you of in is this for we on that it message with
he to the and his in is that you of original message him was on
the honduras zelaya to and