In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.collocations import *
from nltk.tokenize import *
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD

import matplotlib.pyplot as plt
%matplotlib inline

### a, b

In [2]:
# Read the four CSV tables of the e-mail dataset from the working directory,
# in the same order as the names on the left-hand side.
aliases, email_receivers, emails, persons = (
    pd.read_csv(csv_name)
    for csv_name in ('Aliases.csv', 'EmailReceivers.csv', 'Emails.csv', 'Persons.csv')
)

In [4]:
# Preview the first rows of the aliases table (columns: Id, Alias, PersonId).
aliases.head()

Unnamed: 0,Id,Alias,PersonId
0,1,111th congress,1
1,2,agna usemb kabul afghanistan,2
2,3,ap,3
3,4,asuncion,4
4,5,alec,5


In [5]:
# Preview the first rows of the receivers table (columns: Id, EmailId, PersonId).
email_receivers.head()

Unnamed: 0,Id,EmailId,PersonId
0,1,1,80
1,2,2,80
2,3,3,228
3,4,3,80
4,5,4,80


In [6]:
# Preview the first three e-mails; the frame is wide, so the display elides middle columns.
emails.head(3)

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\nU.S. Department of State\nCase N...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",UNCLASSIFIED\nU.S. Department of State\nCase N...
2,3,C05739547,CHRIS STEVENS,;H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739547...,F-2015-04841,...,B6,"Mills, Cheryl D <MillsCD@state.gov>","Abedin, Huma","Wednesday, September 12, 2012 11:52 AM",F-2015-04841,C05739547,05/14/2015,RELEASE IN PART,Thx,UNCLASSIFIED\nU.S. Department of State\nCase N...


In [9]:
# Print (rather than display) one raw e-mail so that its newlines render as line breaks.
print(emails.RawText[1])

UNCLASSIFIED
U.S. Department of State
Case No. F-2015-04841
Doc No. C05739546
Date: 05/13/2015
STATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM.
SUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER.
RELEASE IN
PART B6
From:
Sent:
To:
Subject:
Attachments:
B6
Thursday, March 3, 2011 9:45 PM
H: Latest How Syria is aiding Qaddafi and more... Sid
hrc memo syria aiding libya 030311.docx; hrc memo syria aiding libya 030311.docx
CONFIDENTIAL
March 3, 2011
For: Hillary
From: Sid
Re: Syria aiding Qaddafi
This memo has two parts. Part one is the report that Syria is providing air support for Qaddafi. Part two is a note
to Cody from Lord David Owen, former UK foreign secretary on his views of an increasingly complex crisis. It
seems that the situation is developing into a protracted civil war with various nations backing opposing sides
with unforeseen consequences. Under these circumstances the crucial challenge is to deprive Qaddafi of his
strategic depth—his support bo

### c. 

Предобработка: приводим слова к нижнему регистру (чтобы уменьшить количество различных слов) и убираем метаданные письма.

In [14]:
# List every column of the emails table.
emails.columns

Index(['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'MetadataFrom',
       'SenderPersonId', 'MetadataDateSent', 'MetadataDateReleased',
       'MetadataPdfLink', 'MetadataCaseNumber', 'MetadataDocumentClass',
       'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedCc',
       'ExtractedDateSent', 'ExtractedCaseNumber', 'ExtractedDocNumber',
       'ExtractedDateReleased', 'ExtractedReleaseInPartOrFull',
       'ExtractedBodyText', 'RawText'],
      dtype='object')

In [3]:
# Lower-case prefixes that mark a line of a raw e-mail as metadata rather than body text.
# This is a tuple because it is handed straight to `str.startswith`, which accepts a
# tuple of alternative prefixes.
metadata_prefixes = (
    # Document header and release stamps.
    'unclassified', 'case no', 'doc no', 'date', 'state', 'subject to agreement', 'release in',
    'part', 'from', 'sent', 'to', 'subject:', 'attachments', 'u.s. department of state',
    # Weekday names (lines holding a "sent" timestamp).
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    # Month names. 'september' used to be misspelled 'semptember', so lines starting
    # with that month were never filtered out.
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september',
    'october', 'november', 'december',
    # Memo header fields.
    'for:', 'from:', 're:',
    # NOTE(review): this is the literal three characters backslash, 'x', '0' -- not an
    # escape sequence. If a control character such as form feed ('\x0c') was intended,
    # this entry never matches; confirm against the raw data.
    '\\x0',
)

In [4]:
# Lower-case each raw e-mail, split it into lines, drop every line that starts with one of
# the metadata prefixes, and glue the surviving lines back together with single spaces.
emails['Cleaned_text'] = emails['RawText'].apply(
    lambda raw_text: " ".join(
        line
        for line in raw_text.lower().split('\n')
        if not line.startswith(metadata_prefixes)
    )
)

In [5]:
# Inspect the cleaned version of the same e-mail that was printed in raw form above.
emails.Cleaned_text[1]



### d.

In [56]:
# Count word bigrams only (ngram_range=(2, 2)): one row per e-mail, one column per bigram.
vectorizer = CountVectorizer(ngram_range=(2, 2))
document_bigrams_matrix = vectorizer.fit_transform(emails['Cleaned_text'])

In [54]:
# Sum the bigram counts over all documents (axis=0) and take the column index of the
# largest total. The count matrix is named `document_bigrams_matrix`; the previous
# reference to `bigram_document_matrix` was undefined on a fresh kernel.
# The result stays 2-D (1x1), hence the `[0, 0]` indexing used downstream.
most_frequent_idx = np.argmax(document_bigrams_matrix.sum(axis=0), axis=1)

In [60]:
# `vocabulary_` maps bigram -> column index; invert it to look up the bigram by its column.
index_to_bigram = {column: bigram for bigram, column in vectorizer.vocabulary_.items()}
index_to_bigram[most_frequent_idx[0, 0]]

'of the'

### e.

In [80]:
# Concatenate all cleaned e-mails into one string, tokenize it, and collect word pairs
# that co-occur within a window of 10 tokens.
joined_text = "\t\n".join(emails.Cleaned_text)
finder = BigramCollocationFinder.from_words(
    wordpunct_tokenize(joined_text),
    window_size=10,
)

In [77]:
# Discard pairs seen fewer than 3 times, then show the 10 pairs ranked highest by PMI.
finder.apply_freq_filter(3)
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder.nbest(bigram_measures.pmi, 10)

[('-mukhabarat', 'el-khabeya'),
 ('15/10/10', '18:22'),
 ('1865', '281447'),
 ('2.ahmad', 'bukatela'),
 ('2ccm-', '86804491_1261527777'),
 ('3.muhammad', 'al-zahawi'),
 ('432', '1660'),
 ('451', '4.4.1'),
 ('abhisit', 'vejjajiva'),
 ('abuzaid', 'dorda')]

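Судя по выдаче, с наибольшим PMI вместе встречаются в основном имена и фамилии (например, «abhisit vejjajiva»), а также редкие числовые токены.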

### f.

In [10]:
# TF-IDF representation of the cleaned e-mails, with the vocabulary capped at 2000 terms.
# Note: this rebinds `vectorizer`, which previously held the bigram CountVectorizer.
vectorizer = TfidfVectorizer(max_features=2000)
corpora = vectorizer.fit_transform(emails['Cleaned_text'])

In [None]:
# Project the TF-IDF vectors to 2-D for plotting.
# - `random_state` is fixed so the embedding is reproducible across kernel restarts.
# - `.toarray()` yields a plain ndarray; `.todense()` returns `np.matrix`, which recent
#   scikit-learn releases refuse as estimator input.
tsne = TSNE(random_state=0)
X_tsne = tsne.fit_transform(corpora.toarray())

In [None]:
# Cluster the TF-IDF vectors into 3 groups; `y` holds one cluster label per e-mail.
# - `random_state` makes the centroid initialisation (and thus the labels) reproducible.
# - `n_init=10` is spelled out so the result does not depend on the version-specific default.
kmeans_model = KMeans(n_clusters=3, n_init=10, random_state=0)
y = kmeans_model.fit_predict(corpora)

In [None]:
# Three colours sampled from the 'hot' colormap, one per cluster label.
color_dict = plt.cm.hot(np.linspace(0.3, 0.9, 3))
# Index the palette with the label array to get one colour per point.
point_colors = color_dict[y]

plt.figure()
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=point_colors)
plt.show()

### g.

### 1.

Решающее дерево в разреженном пространстве очень большой размерности. Пространство очень разреженное, поэтому при построении дерева на мини-батче все значения признака могут оказаться одинаковыми, и дерево будет переобучаться.

### 2.

kfold: 20 20 20 20 15/5 
kfold: 19/1 19/1 19/1 19/1 19/1
accuracy:
1) 0.95: 0.75, 0.95 * 4 0.