In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
WORKING_DIR = '.' #os.path.dirname(__file__)
DATA_DIR = os.path.join(WORKING_DIR, 'data', 'hillary-clinton-emails')
EMAILS_FILE = os.path.join(DATA_DIR, 'Emails.csv')
EMAILS_RECEIVERS_FILE = os.path.join(DATA_DIR, 'EmailReceivers.csv')

## A-B
Загрузите датасет с kaggle: https://www.kaggle.com/kaggle/hillary-clinton-emails
Изучите, из чего состоит датасет.

**Aliases.csv** - список контактов Хиллари Клинтон

**EmailReceivers.csv** - соответствие писем и отправителей

**Emails.csv** - текст письма и его характеристики

**Persons.csv** - имена отправителей

## C
Предобработайте тексты как сочтете правильным для первых экспериментов. Опишите, как вы его предобрабатываете, и почему так в блокноте в markdown ячейке

Кажется, что в любом письме важны в первую очередь получатель, отправитель. Поэтому возьмем их в качестве признаков вместе с текстами. (Заголовки пригодятся ниже для анализа.)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
def get_df():
    df_emails = pd.read_csv(EMAILS_FILE, sep=',', encoding='utf-8')
    df_emails_receivers = pd.read_csv(EMAILS_RECEIVERS_FILE, sep=',', encoding='utf-8')
    # Чистим пропуски
    df_emails = df_emails.dropna(subset=['SenderPersonId','ExtractedBodyText','ExtractedSubject'])
    receivers = []
    for id in df_emails.Id:
        ids = df_emails_receivers.loc[df_emails_receivers.EmailId == id].PersonId.values
        if len(ids) == 0:
            receivers.append(-1)
        else:
            receivers.append(ids[0])
    return pd.DataFrame({
            'sender': list(map(int, df_emails.SenderPersonId)),
            'receiver': receivers,
            'subject': df_emails.ExtractedSubject,
            'text': df_emails.ExtractedBodyText
        })

In [5]:
df = get_df()
df.head()

Unnamed: 0,receiver,sender,subject,text
2,228,32,Re: Chris Stevens,Thx
5,185,80,Meet The Right Wing Extremist Behind Anti-Musl...,Pis print.\n-•-...-^\nH < hrod17@clintonernail...
8,80,87,FVV: Secretary's remarks,FYI
10,80,87,AbZ and Hb3 on Libya and West Bank/Gaza,Fyi\nB6\n— —
12,80,87,hey,Fyi


Немного пофильтруем тексты

In [6]:
import re
df.text = [re.sub('[^a-zA-Z0-9]', ' ', text) for text in df.text]
df.head()

Unnamed: 0,receiver,sender,subject,text
2,228,32,Re: Chris Stevens,Thx
5,185,80,Meet The Right Wing Extremist Behind Anti-Musl...,Pis print H hrod17 clintonernailco...
8,80,87,FVV: Secretary's remarks,FYI
10,80,87,AbZ and Hb3 on Libya and West Bank/Gaza,Fyi B6
12,80,87,hey,Fyi


## D
Выясните, какие биграммы чаще всего встречаются в датасете

In [9]:
import operator

bigram_vectorizer = CountVectorizer(ngram_range=(2,2), stop_words = 'english', min_df = 10).fit(df.text)
sorted(bigram_vectorizer.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)[:10]

[('york times', 442),
 ('york ny', 441),
 ('york new', 440),
 ('years ago', 439),
 ('year old', 438),
 ('xpress mail', 437),
 ('world war', 436),
 ('women issues', 435),
 ('white house', 434),
 ('west bank', 433)]

## E
Попробуйте выделить коллокации из двух слов по PMI с помощью nltk (примеры можно найти по ссылке: http://www.nltk.org/howto/collocations.html)

In [10]:
joined_text = ' '.join(df.text)

In [11]:
import nltk.collocations
from nltk.tokenize import word_tokenize

bgm = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(word_tokenize(joined_text))
finder.apply_freq_filter(3)
finder.nbest(bgm.pmi, 10)

[('029', 'mo'),
 ('ABOUL', 'GHEIT'),
 ('Appointment', 'Affidavit'),
 ('Buenos', 'Aires'),
 ('CHIEF', 'NEGOTIATOR'),
 ('Cardinal', 'Stadium'),
 ('Ensuring', 'Connections'),
 ('Fixed', 'LendingTree'),
 ('GUIDO', 'WESTERWELLE'),
 ('JASSIM', 'JABR')]

## F + H
Выполните любую несложную кластеризацию писем (не тратьте на этот шаг много времени)
Поработайте с признаками и методом кластеризации так, чтобы кластеры выглядели наиболее интерпретируемыми.

In [12]:
def extract_features(df_raw):
    X_raw = TfidfVectorizer(max_df=500, min_df=10).fit_transform(df_raw.text)
    X_raw = np.hstack((X_raw.toarray(), df.sender.values.reshape(X_raw.shape[0],1)))
    X_raw =  np.hstack((X_raw, df.receiver.values.reshape(X_raw.shape[0],1)))
    return X_raw

In [13]:
X = extract_features(df)
X.shape

(5414, 3133)

In [14]:
from sklearn.cluster import KMeans
CLUSTERS_COUNT = 6

model = model = KMeans(n_clusters=CLUSTERS_COUNT, random_state=1)
preds = model.fit_predict(X)

## G
Придумайте, как визуализировать содержание кластеров. Например, можно выводить самые частые слова из каждого кластера (но, вероятно, это не самая удачная идея). Визуализируйте ту кластеризацию, которая у вас уже получилась.

In [15]:
df_preds = df.copy()
df_preds['preds'] = preds

In [16]:
' '.join([str(df_preds[df_preds.preds == i].shape[0]) for i in range(CLUSTERS_COUNT)])

'2931 291 656 1066 371 99'

Посмотрим на тексты

In [35]:
top_dict = dict()
for i in range(CLUSTERS_COUNT):
    unigram_vectorizer = CountVectorizer(stop_words = 'english').fit(df_preds[df_preds.preds == i].text)
    word_counts = sorted(unigram_vectorizer.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)[:10]
    top_dict[i] = word_counts
df_top = pd.DataFrame(top_dict)
df_top

Unnamed: 0,0,1,2,3,4,5
0,"(zwire, 10306)","(zi, 2118)","(zx, 16244)","(zzity, 7766)","(zia, 2259)","(zurich, 1040)"
1,"(zuma, 10305)","(zealand, 2117)","(zumbi, 16243)","(zone, 7765)","(zelaya, 2258)","(youth, 1039)"
2,"(zuckerman, 10304)","(younger, 2116)","(zuma, 16242)","(zipping, 7764)","(youths, 2257)","(young, 1038)"
3,"(zones, 10303)","(young, 2115)","(zulciernain, 16241)","(zevon, 7763)","(youth, 2256)","(yes, 1037)"
4,"(zone, 10302)","(yesterday, 2114)","(zones, 16240)","(zeleya, 7762)","(younger, 2255)","(years, 1036)"
5,"(zintan, 10301)","(yes, 2113)","(zone, 16239)","(zela, 7761)","(york, 2254)","(year, 1035)"
6,"(zimbabwean, 10300)","(years, 2112)","(zobmbz, 16238)","(zein, 7760)","(yonight, 2253)","(yang, 1034)"
7,"(zia, 10299)","(year, 2111)","(ziyang, 16237)","(zealand, 7759)","(yohannes, 2252)","(wsjournal, 1033)"
8,"(zhu, 10298)","(yeah, 2110)","(zionists, 16236)","(zartman, 7758)","(yesterday, 2251)","(wrote, 1032)"
9,"(zhou, 10297)","(xmas, 2109)","(zionist, 16235)","(yuval, 7757)","(yesss, 2250)","(wrong, 1031)"


In [29]:
top_dict = dict()
for i in range(CLUSTERS_COUNT):
    bigram_vectorizer = CountVectorizer(ngram_range=(2,2), stop_words = 'english').fit(df_preds[df_preds.preds == i].text)
    word_counts = sorted(bigram_vectorizer.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)[:10]
    top_dict[i] = word_counts
df_top = pd.DataFrame(top_dict)
df_top

Unnamed: 0,0,1,2,3,4,5
0,"(zwire com, 46192)","(zi a79, 4094)","(zx htmi, 83688)","(zzity ii, 25042)","(zia use, 4357)","(zurich lisle, 1614)"
1,"(zuma making, 46191)","(zealand pacific, 4093)","(zumbi dos, 83687)","(zone years, 25041)","(zelaya people, 4356)","(youth behavior, 1613)"
2,"(zuma jibril, 46190)","(younger brother, 4092)","(zuma add, 83686)","(zone north, 25040)","(youths treat, 4355)","(young abused, 1612)"
3,"(zuckerman ed, 46189)","(young woman, 4091)","(zulciernain tahir, 83685)","(zone authorized, 25039)","(youths times, 4354)","(yes stephens, 1611)"
4,"(zuckerman dinner, 46188)","(yesterday young, 4090)","(zones schools, 83684)","(zipping state, 25038)","(youths detained, 4353)","(years happened, 1610)"
5,"(zones aware, 46187)","(yesterday second, 4089)","(zone weeks, 83683)","(zevon wonderful, 25037)","(youths color, 4352)","(years earlier, 1609)"
6,"(zone taken, 46186)","(yesterday process, 4088)","(zone suicide, 83682)","(zeleya unpersuadable, 25036)","(youth won, 4351)","(years case, 1608)"
7,"(zone support, 46185)","(yesterday internal, 4087)","(zone security, 83681)","(zeleya going, 25035)","(youth rehabilitation, 4350)","(years ann, 1607)"
8,"(zone region, 46184)","(yesterday effective, 4086)","(zone said, 83680)","(zeleya appeared, 25034)","(youth happen, 4349)","(year scoresheet, 1606)"
9,"(zone libya, 46183)","(yes ii, 4085)","(zone possible, 83679)","(zela bar, 25033)","(youth development, 4348)","(year looking, 1605)"


Посмотрим на заголовки писем

In [20]:
top_dict = dict()
for i in range(CLUSTERS_COUNT):
    unigram_vectorizer = CountVectorizer(stop_words = 'english').fit(df_preds[df_preds.preds == i].subject)
    word_counts = sorted(unigram_vectorizer.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)[:10]
    top_dict[i] = word_counts
df_top = pd.DataFrame(top_dict)
df_top

Unnamed: 0,0,1,2,3,4,5
0,"(zuckerman, 2017)","(zelaya, 484)","(zak, 894)","(zones, 1590)","(zones, 709)","(zealand, 175)"
1,"(zest, 2016)","(youknow, 483)","(youth, 893)","(zelikow, 1589)","(yohannes, 708)","(year, 174)"
2,"(zelaya, 2015)","(yoga, 482)","(youknow, 892)","(zelaya, 1588)","(year, 707)","(yeah, 173)"
3,"(zebari, 2014)","(year, 481)","(year, 891)","(youtube, 1587)","(www, 706)","(ye, 172)"
4,"(zardari, 2013)","(xmas, 480)","(xmas, 890)","(young, 1586)","(wsj, 705)","(yang, 171)"
5,"(zach, 2012)","(wpost, 479)","(wyden, 889)","(yohannes, 1585)","(wrong, 704)","(xo, 170)"
6,"(yu, 2011)","(work, 478)","(wsj, 888)","(yesterday, 1584)","(wpost, 703)","(world, 169)"
7,"(young, 2010)","(wjc, 477)","(wpost, 887)","(yeo, 1583)","(worst, 702)","(wjc, 168)"
8,"(york, 2009)","(wishing, 476)","(world, 886)","(year, 1582)","(world, 701)","(views, 167)"
9,"(ynet, 2008)","(wing, 475)","(workers, 885)","(yaryura, 1581)","(work, 700)","(video, 166)"
