In [1]:
from os import listdir
from os.path import isfile, join
import sys 
import random
import itertools

import jsonl_parser
import text_preprocess
import text_features
import data_seperation

import pandas as pd
import numpy as np

def load_raw_data(folder_path, trend_list, lang, verbose):
    # all_file = [join(folder_path, f) for f in listdir(folder_path) if isfile(join(folder_path, f))]
    # n_sample = 40

    list_file = [join(folder_path, f + ".jsonl") for f in trend_list if isfile(join(folder_path, f +  ".jsonl"))]

    data = []

    # if n_sample > len(trend_list):
    #     n_sample = len(trend_list)

    # for f in list_file[:n_sample]:
    for f in list_file:
        data.extend(jsonl_parser.load_jsonl(f, verbose))

    print() 

    data_lang = [entry for entry in data if entry['lang'] == lang]

    return data_lang

def print_top_word(model, features_name, num_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " / ".join(features_name[i] for i in topic.argsort()[:-num_words - 1:-1])
        print(message)
    
    print()

## Data Loading

### Load annotated data

In [9]:
trend_type = None

annotated = data_seperation.get_data(trend_type)

trend_type = annotated[['id', 'type']]
trend_hash = annotated['id'].tolist()
trend_name = annotated['name'].tolist()
trend_dict = dict(zip(trend_hash, trend_name))


Traceback (most recent call last):
  File "/Users/khoanguyen/Workspace/git/twitter-trend/jsonl_parser.py", line 12, in load_jsonl
    data.append(json.loads(line.rstrip('\n|\r')))
  File "/Users/khoanguyen/miniconda3/envs/twitter/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/Users/khoanguyen/miniconda3/envs/twitter/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/Users/khoanguyen/miniconda3/envs/twitter/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Unterminated string starting at: line 1 column 662 (char 661)



### Load Tweet Data

In [None]:
data = load_raw_data('/Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset-full/', trend_hash, 'en', False)

df = pd.DataFrame(data)

### Mapping Trend from annotated dataframe into Tweet data 

In [None]:
df['trend_name'] = df['trend_hash'].map(trend_dict)

df['trend_hash'].value_counts()

df['text'] = df['text'].apply(lambda x: text_preprocess.remove_hyperlink(x))

df_trend = df[['text', 'trend_hash', 'trend_name']]

trend_label = text_features.trend_mapping(set(df_trend['trend_hash'].tolist()))

df_trend = (df_trend.merge(trend_type, left_on='trend_hash', right_on='id').reindex(columns=['text', 'trend_hash', 'trend_name', 'type']))

df_trend['label'] = df_trend.apply(lambda row: trend_label[row.trend_hash], axis=1)

### Removing trend with less than 10 tweets (if neccessary)

In [None]:
df_trend = df_trend.groupby('label')
df_trend = df_trend.filter(lambda x: len(x) > 10)

## Text Features 

### Unigram 

In [11]:
tf_uni, features_uni = text_features.generate_term_freq(df['text'], 1)

tf_uni_array = tf_uni.toarray()


### Bigram 

In [5]:
tf_bi, features_bi = text_features.generate_term_freq(df_trend['text'], 2)

tf_bi_array = tf_bi.toarray()


### TF-IDF

In [6]:
tfidf, terms = text_features.generate_tfidf(df_trend['text'])

tfidf_array = tfidf.toarray()

### TF-IDF without removing special character and keep true casing - Loose TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vector = TfidfVectorizer(lowercase=False,
                                token_pattern=r'\S+',
                                stop_words='english')

loose_tfidf = tfidf_vector.fit_transform(df_trend['text'])

### Latent Dirichlet Allocation - 2gram

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=len(trend_name),
                                    max_iter=10, 
                                    learning_method='online', 
                                    learning_offset=50.,
                                    random_state=0)

lda.fit(tf_bi)

print_top_word(lda, features, 10)

## K-Means clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics

labels = df_trend['label'].tolist()
true_k = len(set(labels))

km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
            verbose=1)
km.fit(tfidf)

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
                    % metrics.adjusted_rand_score(labels, km.labels_))

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s /' % terms[ind], end='')
    print()


## Classification:

### Linear SVM with TFIDF 

In [7]:
from sklearn.svm import LinearSVC


class_name = df_trend['trend_name'].tolist()

cross_validate = 10

entries = []

svc_model = LinearSVC()

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(tfidf, class_name, df_trend.index, test_size=0.25, random_state=0, stratify=class_name)
svc_model.fit(X_train, y_train)

y_pred = svc_model.predict(X_test)

from sklearn import metrics

print(metrics.classification_report(y_test, y_pred, target_names=df_trend['trend_name'].unique()))

8      1.00      0.99        42
                  Bulgari       0.97      1.00      0.98        60
                  Diabate       0.97      0.96      0.97       206
                  #tweutr       0.97      0.97      0.97        68
                  Ashthon       1.00      1.00      1.00        28
                 Nia Long       1.00      1.00      1.00        33
            Jurassic Park       0.99      0.99      0.99       180
            Nike Eyebrows       1.00      0.96      0.98        48
   #welcometoirelandbiebs       0.99      1.00      1.00       127
               Eat Bulaga       0.93      0.68      0.79        19
                Simon Rex       1.00      0.98      0.99        62
                    Papac       0.98      1.00      0.99       252
        Two Movie Tickets       0.97      0.98      0.97       213
                 Hartnell       1.00      0.98      0.99        46
                   Carice       1.00      1.00      1.00        49
               #biwinning     

### Cross 10-fold validation

In [11]:
from sklearn.model_selection import cross_val_score

cfv_model = LinearSVC()

class_name = df_trend['trend_name'].tolist()

cross_validate = 10

entries = []

accuracies = cross_val_score(cfv_model, tfidf, class_name, scoring='accuracy', cv=cross_validate)

for fold_idx, accuracy in enumerate(accuracies):
    entries.append((fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['fold_idx', 'accuracy'])

print(cv_df.accuracy.mean())

0.9401641108662446


### Naive Bayes with TFIDF

In [22]:
from sklearn.naive_bayes import MultinomialNB

MNB_model = MultinomialNB()
MNB_model.fit(X_train, y_train)

y_pred = MNB_model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=df_trend['trend_name'].unique()))

0      0.36      0.53        42
                  Bulgari       1.00      0.73      0.85        60
                  Diabate       0.86      0.98      0.91       206
                  #tweutr       0.98      0.91      0.95        68
                  Ashthon       1.00      0.46      0.63        28
                 Nia Long       1.00      0.79      0.88        33
            Jurassic Park       0.95      0.99      0.97       180
            Nike Eyebrows       1.00      0.52      0.68        48
   #welcometoirelandbiebs       0.96      0.97      0.96       127
               Eat Bulaga       0.00      0.00      0.00        19
                Simon Rex       1.00      0.84      0.91        62
                    Papac       0.83      1.00      0.91       252
        Two Movie Tickets       0.82      0.98      0.90       213
                 Hartnell       1.00      0.24      0.39        46
                   Carice       1.00      0.90      0.95        49
               #biwinning     

### Cross 10-fold validation

In [23]:
accuracies = cross_val_score(MNB_model, tfidf, class_name, scoring='accuracy', cv=cross_validate)

for fold_idx, accuracy in enumerate(accuracies):
    entries.append((fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['fold_idx', 'accuracy'])

print(cv_df.accuracy.mean())

0.837088716505825


### Linear SVM with loose TF-IDF

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

class_name = df_trend['trend_name'].tolist()

X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(loose_tfidf, class_name, df_trend.index, test_size=0.25, random_state=0, stratify=class_name)

from sklearn.svm import LinearSVC
new_svc_model = LinearSVC()

new_svc_model.fit(X_train, y_train)

y_pred = new_svc_model.predict(X_test)

from sklearn import metrics

print(metrics.classification_report(y_test, y_pred, target_names=df_trend['trend_name'].unique()))

4      0.69      0.79        42
                  Bulgari       0.95      0.95      0.95        60
                  Diabate       0.97      0.90      0.93       206
                  #tweutr       0.97      0.97      0.97        68
                  Ashthon       0.97      1.00      0.98        28
                 Nia Long       1.00      1.00      1.00        33
            Jurassic Park       1.00      0.98      0.99       180
            Nike Eyebrows       0.92      0.94      0.93        48
   #welcometoirelandbiebs       0.95      0.97      0.96       127
               Eat Bulaga       0.79      0.58      0.67        19
                Simon Rex       1.00      0.92      0.96        62
                    Papac       0.98      0.98      0.98       252
        Two Movie Tickets       0.95      0.98      0.96       213
                 Hartnell       1.00      0.93      0.97        46
                   Carice       0.98      1.00      0.99        49
               #biwinning     

### Multinomial Naive Bayes with loose TF-IDF

In [7]:
from sklearn.naive_bayes import MultinomialNB

new_MNB_model = MultinomialNB()
new_MNB_model.fit(X_train, y_train)

y_pred = new_MNB_model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=df_trend['trend_name'].unique()))

0      0.12      0.21        42
                  Bulgari       1.00      0.52      0.68        60
                  Diabate       0.79      0.94      0.86       206
                  #tweutr       0.96      0.78      0.86        68
                  Ashthon       1.00      0.25      0.40        28
                 Nia Long       1.00      0.61      0.75        33
            Jurassic Park       0.89      0.97      0.93       180
            Nike Eyebrows       1.00      0.15      0.25        48
   #welcometoirelandbiebs       0.98      0.84      0.91       127
               Eat Bulaga       0.00      0.00      0.00        19
                Simon Rex       1.00      0.76      0.86        62
                    Papac       0.68      0.98      0.80       252
        Two Movie Tickets       0.83      0.98      0.89       213
                 Hartnell       1.00      0.09      0.16        46
                   Carice       1.00      0.69      0.82        49
               #biwinning     