In [None]:
from os import listdir
from os.path import isfile, join
import sys 
import random
import itertools

import jsonl_parser
import text_preprocess
import text_features
import data_seperation

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn import metrics

def load_raw_data(folder_path, trend_list, lang, n_sample=40):
    """Load the records of the given trends and keep one language only.

    For every name in ``trend_list`` the file ``<folder_path>/<name>.jsonl``
    is looked up; names without a matching file are skipped.  The records of
    the first ``n_sample`` existing files are concatenated (in ``trend_list``
    order) and then filtered on their ``'lang'`` field.

    Args:
        folder_path: Directory that holds the per-trend ``.jsonl`` files.
        trend_list: Iterable of trend names (file names without extension).
        lang: Value a record's ``'lang'`` field must equal to be kept.
        n_sample: Maximum number of existing trend files to read.  Defaults
            to 40, the value that used to be hard-coded; ``None`` reads all
            of them.

    Returns:
        list: The records produced by ``jsonl_parser.load_jsonl`` whose
        ``'lang'`` equals ``lang``.  Empty when no file exists.
    """
    # all_file = [join(folder_path, f) for f in listdir(folder_path) if isfile(join(folder_path, f))]
    candidate_paths = [join(folder_path, name + ".jsonl") for name in trend_list]
    existing_paths = [path for path in candidate_paths if isfile(path)]

    data = []
    # Slicing already clamps to the number of available files, so no
    # separate bounds check on n_sample is needed.
    for path in existing_paths[:n_sample]:
        data.extend(jsonl_parser.load_jsonl(path))

    # .get() so a record without a 'lang' field is dropped instead of
    # aborting the whole load with a KeyError.
    return [entry for entry in data if entry.get('lang') == lang]

def print_top_word(model, features_name, num_words):
    """Print the highest-weighted feature names of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable with one weight
            array per topic.
        features_name: Sequence mapping a feature index to its name.
        num_words: How many top-weighted names to list per topic.

    One ``Topic #<n>: name / name / ...`` line is printed per topic, in
    descending weight order, followed by a single blank line.
    """
    for topic_no, weights in enumerate(model.components_):
        # Ascending argsort, reversed, then truncated: the indices of the
        # num_words largest weights, biggest first.
        ranked = weights.argsort()[::-1][:num_words]
        top_terms = [features_name[pos] for pos in ranked]
        print("Topic #%d: " % topic_no + " / ".join(top_terms))

    print()

## Data Loading

In [None]:
# Trends returned for the 'news' category; their ids select which raw
# .jsonl files get loaded.
annotated = data_seperation.get_data('news')
trend_list = annotated['id'].tolist()

# NOTE(review): machine-specific absolute path — point this at the local
# copy of the dataset before running.
DATASET_DIR = '/Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset/'

data = load_raw_data(DATASET_DIR, trend_list, 'en')

df = pd.DataFrame(data)

# Number of records per trend.  Printed explicitly: a bare expression in
# the middle of a cell is evaluated but never displayed.
print(df['trend_hash'].value_counts())

# presumably strips URLs from the text — confirm against
# text_preprocess.remove_hyperlink.
df['text'] = df['text'].apply(text_preprocess.remove_hyperlink)

# Explicit copy: a 'label' column is added below, and assigning into a
# column slice of df triggers pandas' SettingWithCopyWarning.
df_trend = df[['text', 'trend_hash']].copy()

trend_label = text_features.trend_mapping(set(df_trend['trend_hash'].tolist()))

# Column-wise lookup instead of a row-wise apply(axis=1); same labels, and
# it still yields an (empty) column when the frame has no rows.
df_trend['label'] = df_trend['trend_hash'].map(lambda trend: trend_label[trend])

## Latent Dirichlet Allocation Topic Modeling

### Unigram feature

In [None]:
# Term-frequency features of the text column.  The second argument (1) is
# presumably the n-gram size, in line with this section's "Unigram"
# heading — TODO confirm against text_features.generate_term_freq.
tf, features = text_features.generate_term_freq(df['text'], 1)

# Array form of the term-frequency matrix, kept under its own name.
tf_uni_array = tf.toarray()

# 22-topic LDA using the online learning method with a fixed seed.  fit()
# returns the estimator itself, so construction and fitting are chained.
lda = LatentDirichletAllocation(
    n_components=22,
    max_iter=10,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)

# List the 10 highest-weighted terms of every topic.
print_top_word(lda, features, 10)

### Bigram feature 

In [None]:
# Term-frequency features of the text column.  The second argument (2) is
# presumably the n-gram size, in line with this section's "Bigram"
# heading — TODO confirm against text_features.generate_term_freq.
tf, features = text_features.generate_term_freq(df['text'], 2)

# Array form of the term-frequency matrix, kept under its own name.
tf_bi_array = tf.toarray()

# 22-topic LDA using the online learning method with a fixed seed.  fit()
# returns the estimator itself, so construction and fitting are chained.
lda = LatentDirichletAllocation(
    n_components=22,
    max_iter=10,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)

# List the 10 highest-weighted terms of every topic.
print_top_word(lda, features, 10)

## K-Means clustering

In [None]:
# Reference labels for evaluation; the number of distinct labels is used
# as the number of clusters.
labels = df_trend['label'].tolist()
true_k = len(set(labels))

tfidf, terms = text_features.generate_tfidf(df['text'])

# Array form of the TF-IDF matrix, kept under its own name.
tfidf_array = tfidf.toarray()

# random_state is pinned because n_init=1 means a single random k-means++
# initialisation decides the outcome; without a seed the scores and
# clusters printed below differ on every run.
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
            verbose=1, random_state=0)
km.fit(tfidf)

# Agreement between the cluster assignment and the reference labels.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
                    % metrics.adjusted_rand_score(labels, km.labels_))

# Per cluster, feature indices sorted by descending centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# One line per cluster listing its 10 highest-weighted terms.
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s /' % terms[ind], end='')
    print()
