In [124]:
import datetime
import json
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import *
from collections import defaultdict
from sklearn.metrics import pairwise_distances
from tqdm.auto import tqdm
import pprint

In [129]:
pd.set_option('display.max_rows', 1000)

In [17]:
def cluster_event_match(labels, preds):
    data = pd.DataFrame()
    data["pred"] = preds
    data["label"] = labels

    match = data.groupby(["label", "pred"], sort=False).size().reset_index(name="a")
    b, c = [], []
    for idx, row in match.iterrows():
        b_ = ((data["label"] != row["label"]) & (data["pred"] == row["pred"]))
        b.append(b_.sum())
        c_ = ((data["label"] == row["label"]) & (data["pred"] != row["pred"]))
        c.append(c_.sum())
    match["b"] = pd.Series(b)
    match["c"] = pd.Series(c)
    # recall = nb true positive / (nb true positive + nb false negative)
    match["r"] = match["a"] / (match["a"] + match["c"])
    # precision = nb true positive / (nb true positive + nb false positive)
    match["p"] = match["a"] / (match["a"] + match["b"])
    match["f1"] = 2 * match["r"] * match["p"] / (match["r"] + match["p"])
    match = match.sort_values("f1", ascending=False)
    # macro_average_f1 = match.drop_duplicates("label").f1.mean()
    # macro_average_precision = match.drop_duplicates("label").p.mean()
    # macro_average_recall = match.drop_duplicates("label").r.mean()

    match = match.drop_duplicates("label")
    sizes = match[['a', 'b', 'c']].sum(axis=1)
    proportions = sizes / sizes.sum()
    micro_average_f1 = (match.f1 * proportions).sum()
    micro_average_precision = (match.p * proportions).sum()
    micro_average_recall = (match.r * proportions).sum()
    return micro_average_precision, micro_average_recall, micro_average_f1, match

In [100]:
def print_statistics(tweet_ids, new_preds2, tweets_with_label):
    results = pd.DataFrame()
    results['tweet_id'] = tweet_ids
    results['pred'] = new_preds2
    tweet_with_label_set = dict(zip(tweets_with_label['tweet_id'].astype(str), tweets_with_label['label']))

    rows = []
    tweets_with_labels = 0
    tweets_without_labels = 0
    label_to_tweets = defaultdict(list)
    all_preds = set(results['tweet_id'])
    for t, c in zip(results['tweet_id'], results['pred']):
        if t in tweet_with_label_set:
            rows.append([t, tweet_with_label_set[t], c])
            tweets_with_labels += 1
            label_to_tweets[tweet_with_label_set[t]].append(t)
    for t in tweet_with_label_set:
        if t not in all_preds:
            rows.append([t, tweet_with_label_set[t], -1])
            tweets_without_labels += 1

    output = pd.DataFrame(rows, columns=['tweet_id', 'label', 'pred'])

    ami = adjusted_mutual_info_score(output['label'], output['pred'])
    ari = adjusted_rand_score(output['label'], output['pred'])
    nmi = normalized_mutual_info_score(output['label'], output['pred'])

    print('adjusted_mutual_info', ami)
    print('adjusted_rand', ari)
    print('Normalized Mutal Info:', nmi)
    macro_p, macro_r, macro_f1, match = cluster_event_match(output['label'], output['pred'])
    print('micro_p:', macro_p)
    print('micro_r:', macro_r)
    print('micro_f1:', macro_f1)
    print('#labeled tweets:', tweets_with_labels, '#unlabeled tweets:', tweets_without_labels)
    print('#labels:', len(set(output['label'])), '#preds:', len(set(output['pred'])))
    return match

In [131]:
results_df = pd.read_csv('ensemble_lsh0-7_kmeans/2012-10-11/events_n_clusters2000.csv')
results_df

Unnamed: 0,tweet_id,label
0,256182325438738432,0.0
1,256182325413543936,115101.0
2,256182325598117889,115101.0
3,256182325451309057,117697.0
4,256182325354844161,115101.0
...,...,...
1540259,256544709412024320,307722.0
1540260,256544709231652864,264069.0
1540261,256544709328138241,264069.0
1540262,256544709332332544,264069.0


In [132]:
start_date_str = '2012-10-11'
end_date_str = '2012-10-12'
tweets_with_label = pd.read_csv(f'event_2012_relevant_tweets_{start_date_str}_{end_date_str}.tsv',
                                sep='\t', header=None, names=['label', 'tweet_id'])

In [133]:
results_df = results_df.drop_duplicates(['tweet_id'], keep='first')
tweet_ids = results_df['tweet_id'].astype(str)
new_preds2 = results_df['label']
print_statistics(tweet_ids, new_preds2, tweets_with_label)




adjusted_mutual_info 0.34971213725274036
adjusted_rand 0.22539874456183873
Normalized Mutal Info: 0.4670552244469603
micro_p: 0.260938794811693
micro_r: 0.5224033672629073
micro_f1: 0.2611243408454601
#labeled tweets: 1739 #unlabeled tweets: 0
#labels: 40 #preds: 209


Unnamed: 0,label,pred,a,b,c,r,p,f1
335,276,242017.0,1,0,0,1.0,1.0,1.0
23,478,113115.0,1,0,0,1.0,1.0,1.0
337,21,264069.0,134,95,11,0.924138,0.585153,0.716578
53,503,126629.0,19,11,5,0.791667,0.633333,0.703704
357,274,288314.0,1,0,1,0.5,1.0,0.666667
173,425,192860.0,2,0,3,0.4,1.0,0.571429
261,83,230208.0,90,154,4,0.957447,0.368852,0.532544
24,77,138992.0,23,35,8,0.741935,0.396552,0.516854
131,106,144038.0,7,0,16,0.304348,1.0,0.466667
158,244,168843.0,7,0,16,0.304348,1.0,0.466667


In [23]:
results_df.groupby('label').count().describe()

Unnamed: 0,tweet_id,text
count,21962.0,21962.0
mean,70.133139,70.133139
std,108.162121,108.162121
min,1.0,1.0
25%,9.0,9.0
50%,44.0,44.0
75%,100.0,100.0
max,3205.0,3205.0


In [134]:
label_count = results_df.groupby('label')['tweet_id'].count().reset_index()
filtered_labels = label_count[label_count['tweet_id'] >= 10]

In [135]:
label_count2 = tweets_with_label.groupby('label')['tweet_id'].count().reset_index()
filtered_labels2 = label_count2[label_count2['tweet_id'] >= 10]

In [136]:
df = results_df.drop_duplicates(['tweet_id'], keep='first')
df = df[df['label'].isin(set(filtered_labels['label']))]
tweet_ids = df['tweet_id'].astype(str)
new_preds2 = df['label']
match = print_statistics(tweet_ids, new_preds2, tweets_with_label[tweets_with_label['label'].isin(set(filtered_labels2['label']))])




adjusted_mutual_info 0.31809909572785794
adjusted_rand 0.20958635088023947
Normalized Mutal Info: 0.3864150514014421
micro_p: 0.2687492301093215
micro_r: 0.4467231737863624
micro_f1: 0.29185557800534195
#labeled tweets: 1512 #unlabeled tweets: 189
#labels: 28 #preds: 40


In [137]:
match

Unnamed: 0,label,pred,a,b,c,r,p,f1
27,503,126629.0,19,9,5,0.791667,0.678571,0.730769
172,21,264069.0,134,93,11,0.924138,0.590308,0.72043
130,83,230208.0,90,153,4,0.957447,0.37037,0.534125
15,77,138992.0,23,34,8,0.741935,0.403509,0.522727
77,244,168843.0,7,0,16,0.304348,1.0,0.466667
64,106,144038.0,7,0,16,0.304348,1.0,0.466667
58,0,140495.0,95,97,152,0.384615,0.494792,0.432802
101,427,212594.0,71,117,70,0.503546,0.37766,0.431611
42,424,118804.0,9,0,40,0.183673,1.0,0.310345
59,419,140495.0,46,146,74,0.383333,0.239583,0.294872


In [138]:
label_tweets = tweets_with_label[tweets_with_label['label'] == 21]['tweet_id']
label_text = results_df[results_df['tweet_id'].isin(set(label_tweets))]
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
label_text

Unnamed: 0,tweet_id,label
1465114,256530226207662080,264069.0
1465780,256530356273045505,264069.0
1465951,256530385624788992,264069.0
1465955,256530389701648385,264069.0
1466048,256530406558539776,264069.0
1466069,256530402435543040,264069.0
1466148,256530419393114112,264069.0
1466151,256530419401494528,264069.0
1466282,256530444420530176,225595.0
1466306,256530448409313280,264069.0


In [112]:
preds = results_df[results_df['label'] == 146423]
preds

Unnamed: 0,tweet_id,label,text
948550,256424089558859776,146423,Appeals court reverses ban on Samsung's Galaxy...
949549,256424278361272320,146423,Court reverses Galaxy Nexus sales ban in the U...
949612,256424294878433283,146423,Apple's US injunction on Samsung Galaxy Nexus ...
950063,256424374750564352,146423,Court reverses Galaxy Nexus sales ban in the U...
950525,256424454501048320,146423,Court reverses Galaxy Nexus sales ban in the U...
...,...,...,...
1084206,256453952743280640,146423,US Appeals Court Overturns Samsung Galaxy Nexu...
1084219,256453952747474944,146423,Appeals Court Overturns Sales Ban on Samsung G...
1084326,256453977833631744,146423,[WIRED_GADGET] U.S. Appeals Court Lifts Galaxy...
1084473,256454003238531072,146423,Gadget Lab: U.S. Appeals Court Lifts Galaxy Ne...


In [116]:
label_text[label_text['tweet_id'].isin(set(preds['tweet_id']))]['text'].tolist()

["Appeals court reverses ban on Samsung's Galaxy Nexus: The latest ruling from an appeals court reverses a decisio... http://t.co/sbx6LqEH",
 'Court reverses Galaxy Nexus sales ban in the US http://t.co/MtEApbcN',
 "Apple's US injunction on Samsung Galaxy Nexus reversed by appeals court http://t.co/u0RrOG1V",
 'Court reverses Galaxy Nexus sales ban in the US:   A United States appeals court has overturned the injunction b... http://t.co/eH579Qeo',
 'Court reverses Galaxy Nexus sales ban in the US:   A United States appeals court has overturned the injunction b... http://t.co/FK8XbWFN',
 'Engadget Court reverses Galaxy Nexus sales ban in the US:   A United States appeals court has overturned the inj... http://t.co/Xj3q9TZW',
 'Court reverses Galaxy Nexus sales ban in the US via @engadget http://t.co/Vgc0mWFn',
 '#TheHapticProject Court reverses Galaxy Nexus sales ban in the US http://t.co/byU3JpFp @engadget',
 'Court reverses Galaxy Nexus sales ban in the US:   A United States appeals c

In [120]:
label_text[~label_text['tweet_id'].isin(set(preds['tweet_id']))]['text'].tolist()

['Samsung Can Resume Selling Galaxy Nexus After Court Reverses Injunction $GOOG by @stevekovach http://t.co/hsPCMbb4',
 'Samsung Can Resume Selling Galaxy Nexus After Court Reverses Injunction (GOOG) http://t.co/AhwiJCvM',
 'Apple’s earned injunction on Samsung Galaxy Nexus reversed by appeals court: The injunction on the Samsung Galax... http://t.co/CbxqnzoE',
 'Apple’s earned injunction on Samsung Galaxy Nexus reversed by appeals court http://t.co/hhcFWmap',
 'Apple’s earned injunction on Samsung Galaxy Nexus reversed by appeals court http://t.co/Eud8p0Uz #tnw',
 '#tech Samsung Can Resume Selling Galaxy Nexus After Court Reverses Injunction http://t.co/xJGuM04F',
 'US appeals court overturns preliminary injunction banning sale of Samsung Galaxy Nexus smartphone http://t.co/J0yIrvD5',
 'Appeals court reverses sales ban on Samsung smartphone: WASHINGTON (Reuters) - A U.S. appeals court overturned a... http://t.co/VuhAmCdE',
 'Appeals court reverses sales ban on Samsung smartphone: WASH

In [99]:
for label, group_df in df.groupby('label'):
    vectorizer = CountVectorizer(stop_words='english', binary=True)
    count_X = vectorizer.fit_transform(group_df['text'])
    sorted_words = count_X.sum(axis=0).A.argsort()
    vectorizer.inverse_transform(sorted_words)[0]
    
    centroid_word_count = np.zeros_like(count_X.A)
    centroid_word_count[:, sorted_words[0][:30]] = 1
    cohesion = jaccard_score(count_X.T, centroid_word_count.T, average='macro')
    print('label:', label, 'cohesion:', cohesion, 'size:', count_X.shape[0])
#     break

label: 60510 cohesion: 0.375 size: 4
label: 60590 cohesion: 0.31666666666666665 size: 4
label: 60618 cohesion: 0.6666666666666666 size: 2
label: 60656 cohesion: 0.537037037037037 size: 2
label: 60702 cohesion: 0.7857142857142857 size: 3
label: 60982 cohesion: 0.543859649122807 size: 3
label: 61025 cohesion: 0.6052631578947368 size: 2
label: 61139 cohesion: 0.463768115942029 size: 3
label: 61200 cohesion: 0.5434782608695652 size: 2
label: 61486 cohesion: 0.8157894736842105 size: 2
label: 61534 cohesion: 0.5384615384615385 size: 2
label: 61550 cohesion: 0.6086956521739131 size: 2
label: 61639 cohesion: 0.7777777777777778 size: 4
label: 61940 cohesion: 0.04329273094752971 size: 18
label: 62011 cohesion: 0.0489778676883119 size: 16
label: 62437 cohesion: 0.11075245285771601 size: 9
label: 62491 cohesion: 0.30434782608695654 size: 4
label: 76143 cohesion: 0.35555555555555557 size: 3
label: 76474 cohesion: 0.07216893728521635 size: 11
label: 76547 cohesion: 0.7307692307692308 size: 2
label: 

label: 179947 cohesion: 0.2386029411764706 size: 5
label: 179964 cohesion: 0.5 size: 2
label: 180163 cohesion: 0.6140350877192983 size: 3
label: 180215 cohesion: 0.4523809523809524 size: 3
label: 180428 cohesion: 0.4010989010989011 size: 13
label: 180477 cohesion: 0.5789473684210527 size: 2
label: 180655 cohesion: 0.5476190476190477 size: 2
label: 180897 cohesion: 0.625 size: 2


In [72]:
sorted_words = count_X.sum(axis=0).A.argsort()
# vectorizer.inverse_transform(sorted_words)[0]
count_X.shape[1]
centroid_word_count = np.zeros_like(count_X.A)
centroid_word_count[:, sorted_words[0][:20]] = 1



In [92]:
jaccard_score(count_X.T, centroid_word_count.T, average='macro')

0.02294023285927256

In [83]:
for row, cent in zip(count_X.A, centroid_word_count):
    a = set(np.where(row)[0])
    b = set(np.where(cent)[0])
    print(len(a&b) / len(a|b))
    break

0.07142857142857142


In [81]:
np.where(cent)

(array([  0, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
        148, 149, 150, 151, 152, 153, 154], dtype=int64),)

In [74]:
centroid_word_count.shape

(30, 219)

In [64]:
sorted_words

array([[  0, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 135, 146,
        148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 147, 134, 133,
        132, 107, 108, 217, 110, 111, 113, 114, 115, 116, 117, 118, 119,
        120, 121, 123, 124, 125, 126, 127, 128, 129, 130, 131, 158, 106,
        159, 161, 192, 193, 194, 195, 196, 198, 199, 200, 201, 202, 191,
        203, 205, 206, 207, 210, 211, 212, 213, 214, 215, 216, 204, 190,
        189, 188, 162, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175,
        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 160,
        105, 109, 103,  28,  29,  30,  31,  33,  34,  35,  36,  37,  38,
         27,  39,  41,  42,  43,  44,  46,  47,  48, 104,  50,  51,  40,
         52,  26,  24,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
         25,  11,  13,  14,  15,  17,  18,  19,  20,  21,  22,  23,  12,
         53,  49,  55,  78,  79,  54,  81,  82,  83,  84,  85,  86,  87,
         77,  88,  90,  92,  93,  94,  95,  97,  98