In [2]:
from os import listdir
from os.path import isfile, join
import sys 
import random
import itertools

import jsonl_parser
import text_preprocess
import text_features
import data_seperation

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn import metrics

def load_raw_data(folder_path, trend_list, lang, n_sample=40):
    """Load records for the given trends and keep only one language.

    For every id in ``trend_list`` the file ``<folder_path>/<id>.jsonl`` is
    looked up; ids without a matching file are silently skipped. At most
    ``n_sample`` of the files that exist are parsed with
    ``jsonl_parser.load_jsonl`` and their records concatenated.

    Args:
        folder_path: Directory that holds the ``<id>.jsonl`` files.
        trend_list: Iterable of trend ids (strings, file names without the
            ``.jsonl`` extension).
        lang: Value a record's ``'lang'`` field must equal to be kept.
        n_sample: Maximum number of files to load. Defaults to 40, the limit
            that used to be hard-coded. ``None`` loads every file found.

    Returns:
        list: Records whose ``'lang'`` field equals ``lang``, in load order.
        Records that have no ``'lang'`` field are dropped.
    """
    candidate_paths = (join(folder_path, trend_id + ".jsonl") for trend_id in trend_list)
    list_file = [path for path in candidate_paths if isfile(path)]

    data = []

    # Slicing clamps to the number of files actually found, so no explicit
    # comparison of n_sample against the list length is needed.
    for path in list_file[:n_sample]:
        data.extend(jsonl_parser.load_jsonl(path))

    # Emit a blank line once loading is finished (kept from the original
    # so the cell output is unchanged).
    print()

    # Membership test first: a record without 'lang' is skipped instead of
    # aborting the whole load with a KeyError.
    return [entry for entry in data if 'lang' in entry and entry['lang'] == lang]

def print_top_word(model, features_name, num_words):
    """Print the highest-weighted feature names of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight vectors that support ``argsort()``.
        features_name: Sequence mapping a feature index to its name.
        num_words: How many top features to list per topic.

    One line ``Topic #<k>: w1 / w2 / ...`` is printed per topic, followed by
    a single blank line after the last topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending; reverse it and keep the leading entries to
        # get the indices of the largest weights first.
        top_indices = weights.argsort()[::-1][:num_words]
        top_names = [features_name[idx] for idx in top_indices]
        print("Topic #%d: " % topic_no + " / ".join(top_names))

    print()

## Data Loading

In [3]:
# Location of the per-trend .jsonl files. Kept in one named constant so the
# machine-specific path only has to be changed in a single place.
DATASET_DIR = '/Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset/'

# Annotated subset selected with the 'news' key; its 'id' column supplies the
# trend ids that load_raw_data turns into "<id>.jsonl" file names.
annotated = data_seperation.get_data('news')
trend_list = annotated['id'].tolist()

# Only records whose 'lang' field is 'en' are returned.
data = load_raw_data(DATASET_DIR, trend_list, 'en')
df = pd.DataFrame(data)

# Strip hyperlinks from the text before any feature extraction.
df['text'] = df['text'].apply(text_preprocess.remove_hyperlink)

# Explicit copy: adding the 'label' column below must not write into a view
# of df (the original slice triggered pandas' SettingWithCopyWarning).
df_trend = df[['text', 'trend_hash']].copy()

# trend_label is indexed by trend hash below; presumably a mapping from hash
# to label id -- confirm against text_features.trend_mapping.
trend_label = text_features.trend_mapping(set(df_trend['trend_hash'].tolist()))

# Column-wise lookup instead of a row-wise apply(axis=1); indexing is kept so
# an unknown hash still raises rather than silently yielding NaN.
df_trend['label'] = df_trend['trend_hash'].map(lambda trend_hash: trend_label[trend_hash])

Loaded 269 records from /Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset/02a6571d75e06298c8fb794f4a4eb707.jsonl
Loaded 677 records from /Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset/083b78b643ed8ee8b3dd55a5f3560828.jsonl
Loaded 995 records from /Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset/01901b07f92e9c67ce8c3bde06b23e79.jsonl
Loaded 1039 records from /Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset/026ab72f6d6042233ee6c1662237edcd.jsonl
Loaded 1157 records from /Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset/1569f8f32be7ae04898403c550b47b82.jsonl
Loaded 415 records from /Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset/1566af974068cee541d4b8112637586f.jsonl
Loaded 775 records from /Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset/18db5565f70fc1e2abed15cfca8bde15.jsonl
Load

## Latent Dirichlet Allocation Topic Modeling

### Unigram feature

In [8]:
# Term-frequency features built with n-gram size 1.
# NOTE(review): generate_term_freq presumably returns (sparse matrix,
# feature names) -- inferred from the .toarray() call and from how
# `features` is used as a name lookup in print_top_word; confirm.
tf, features = text_features.generate_term_freq(df['text'], 1)

# Dense copy of the unigram matrix (not used further in this cell).
tf_uni_array = tf.toarray()

# 22-topic online LDA with a fixed seed; fit() returns the fitted estimator,
# so construction and fitting are chained.
lda = LatentDirichletAllocation(
    n_components=22,
    max_iter=10,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)

# Show the 10 highest-weighted unigrams of each topic.
print_top_word(model=lda, features_name=features, num_words=10)

Topic #0: corinne / rae / bailey / rt / 2011 / javajazz2011 / jjf / santana / paris / fourplay
Topic #1: warner / bros / rt / tmz / bad / francis / following / issued / stateme / lawrence
Topic #2: just / balenciaga / rt / know / going / don / oh / want / got / ll
Topic #3: balenciaga / time / fashion / really / miranda / kerr / week / face / lol / little
Topic #4: great / eric / cantona / day / big / pele / star / rt / amazing / football
Topic #5: blade / runner / video / love / tomorrow / rt / william / director / music / cut
Topic #6: royal / faces / balenciaga / spring / long / hit / jacket / late / lambskin / magazine
Topic #7: smart / covers / ipad / ipad2 / garageband / rt / imovie / case / cameras / thinner
Topic #8: ap / says / live / rt / team / end / press / manager / united / night
Topic #9: best / bag / think / photo / np / links / ron / summer / picture / online
Topic #10: fandango / movie / tickets / wedding / deal / livingsocial / social / living / deals / tix
Topic #11

### Bigram feature 

In [9]:
# Term-frequency features built with n-gram size 2.
# NOTE(review): generate_term_freq presumably returns (sparse matrix,
# feature names) -- inferred from the .toarray() call and from how
# `features` is used as a name lookup in print_top_word; confirm.
tf, features = text_features.generate_term_freq(df['text'], 2)

# Dense copy of the bigram matrix (not used further in this cell).
tf_bi_array = tf.toarray()

# Same LDA configuration as the unigram run: 22 topics, online learning,
# fixed seed. fit() returns the fitted estimator, so the calls are chained.
lda = LatentDirichletAllocation(
    n_components=22,
    max_iter=10,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)

# Show the 10 highest-weighted bigrams of each topic.
print_top_word(model=lda, features_name=features, num_words=10)

Topic #0: apple announced / today apple / announced new / new apps / ipad today / apps ipad / ipad run / run th / orlando bloom / imovie garageband
Topic #1: blade runner / warner bros / sheen half / just fired / bros just / fired charlie / tmz warner / rt tmz / men warner / issued following
Topic #2: movie tickets / tickets fandango / fandango movie / living social / livingsocial fandango / social deal / deal fandango / fandango livingsocial / social fandango / dealsplus movie
Topic #3: barry bannan / end season / bannan loan / leeds united / new ipad / balenciaga spain / loan leeds / prince andrew / season avfc / spain symposium
Topic #4: prince andrew / wedding nears / royal wedding / faces woes / andrew faces / woes royal / distraction wedding / andrew woes / trade role / woes distraction
Topic #5: blade runner / rt justinbieber / directors cut / weekend going / justinbieber tomorrow / just weekend / nsn3d theatres / theatres just / tomorrow time / cut nsn3d
Topic #6: william hague

## K-Means clustering

In [13]:
# Ground-truth trend labels; the number of distinct labels is used as the
# number of clusters.
labels = df_trend['label'].tolist()
true_k = len(set(labels))

# NOTE(review): generate_tfidf presumably returns (sparse TF-IDF matrix,
# term names) -- inferred from .toarray() and the terms[ind] lookup; confirm.
tfidf, terms = text_features.generate_tfidf(df['text'])

# Dense copy of the TF-IDF matrix (not used further in this cell).
tfidf_array = tfidf.toarray()

# random_state is fixed so the clustering and the scores below are
# reproducible: with n_init=1 a single random k-means++ initialisation
# decides the result. The seed matches the one used for the LDA models.
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
            verbose=1, random_state=0)
km.fit(tfidf)

# External cluster-quality scores against the annotated trend labels.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
                    % metrics.adjusted_rand_score(labels, km.labels_))

# For every centroid, term indices sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# One line per cluster listing its 10 heaviest terms, in the same
# "Cluster k: t1 / t2 / ... /" format as before.
for i in range(true_k):
    top_terms = ''.join(' %s /' % terms[ind] for ind in order_centroids[i, :10])
    print("Cluster %d:%s" % (i, top_terms))


Initialization complete
Iteration 0, inertia 10721.41939090688
Iteration 1, inertia 6062.335294125611
Iteration 2, inertia 5927.968285357919
Iteration 3, inertia 5887.125945254181
Iteration 4, inertia 5880.284696633037
Iteration 5, inertia 5878.813430121415
Iteration 6, inertia 5878.1954664912455
Iteration 7, inertia 5877.467910555596
Iteration 8, inertia 5876.960196059383
Iteration 9, inertia 5876.531074959638
Iteration 10, inertia 5876.488878892703
Iteration 11, inertia 5876.457545525773
Iteration 12, inertia 5876.418531502592
Iteration 13, inertia 5876.401203582092
Iteration 14, inertia 5876.3899629706375
Iteration 15, inertia 5876.361774533491
Iteration 16, inertia 5876.321374530544
Iteration 17, inertia 5876.3187199693475
Converged at iteration 17: strict convergence.
Homogeneity: 0.837
Completeness: 0.763
V-measure: 0.799
Adjusted Rand-Index: 0.578
Cluster 0: mike / destefano / comedian / rip / rt / sad / punchlinemag / oh / missed / died /
Cluster 1: bros / warner / tmz / issued