In [1]:
from os import listdir
from os.path import isfile, join
import sys 
import random
import itertools

import jsonl_parser
import text_preprocess
import text_features
import data_seperation

import pandas as pd
import numpy as np

def load_raw_data(folder_path, trend_list, lang, verbose):
    # all_file = [join(folder_path, f) for f in listdir(folder_path) if isfile(join(folder_path, f))]
    # n_sample = 40

    list_file = [join(folder_path, f + ".jsonl") for f in trend_list if isfile(join(folder_path, f +  ".jsonl"))]

    data = []

    # if n_sample > len(trend_list):
    #     n_sample = len(trend_list)

    # for f in list_file[:n_sample]:
    for f in list_file:
        data.extend(jsonl_parser.load_jsonl(f, verbose))

    print() 

    data_lang = [entry for entry in data if entry['lang'] == lang]

    return data_lang

def print_top_word(model, features_name, num_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " / ".join(features_name[i] for i in topic.argsort()[:-num_words - 1:-1])
        print(message)
    
    print()

## Data Loading

### Load annotated data

In [9]:
trend_type = None

annotated = data_seperation.get_data(trend_type)

trend_type = annotated[['id', 'type']]
trend_hash = annotated['id'].tolist()
trend_name = annotated['name'].tolist()
trend_dict = dict(zip(trend_hash, trend_name))


Traceback (most recent call last):
  File "/Users/khoanguyen/Workspace/git/twitter-trend/jsonl_parser.py", line 12, in load_jsonl
    data.append(json.loads(line.rstrip('\n|\r')))
  File "/Users/khoanguyen/miniconda3/envs/twitter/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/Users/khoanguyen/miniconda3/envs/twitter/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/Users/khoanguyen/miniconda3/envs/twitter/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Unterminated string starting at: line 1 column 662 (char 661)



### Load Tweet Data

In [None]:
data = load_raw_data('/Users/khoanguyen/Workspace/dataset/twitter-trending/TT-classification/dataset-full/', trend_hash, 'en', False)

df = pd.DataFrame(data)

### Mapping Trend from annotated dataframe into Tweet data 

In [None]:
df['trend_name'] = df['trend_hash'].map(trend_dict)

df['trend_hash'].value_counts()

df['text'] = df['text'].apply(lambda x: text_preprocess.remove_hyperlink(x))

df_trend = df[['text', 'trend_hash', 'trend_name']]

trend_label = text_features.trend_mapping(set(df_trend['trend_hash'].tolist()))

df_trend = (df_trend.merge(trend_type, left_on='trend_hash', right_on='id').reindex(columns=['text', 'trend_hash', 'trend_name', 'type']))

df_trend['label'] = df_trend.apply(lambda row: trend_label[row.trend_hash], axis=1)

### Removing trend with less than 10 tweets (if neccessary)

In [None]:
df_trend = df_trend.groupby('label')
df_trend = df_trend.filter(lambda x: len(x) > 10)

## Text Features 

### Unigram 

In [11]:
tf_uni, features_uni = text_features.generate_term_freq(df['text'], 1)

tf_uni_array = tf_uni.toarray()


### Bigram 

In [5]:
tf_bi, features_bi = text_features.generate_term_freq(df_trend['text'], 2)

tf_bi_array = tf_bi.toarray()


### TF-IDF

In [6]:
tfidf, terms = text_features.generate_tfidf(df_trend['text'])

tfidf_array = tfidf.toarray()

### TF-IDF without removing special character and keep true casing - Loose TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vector = TfidfVectorizer(lowercase=False,
                                token_pattern=r'\S+',
                                stop_words='english')

loose_tfidf = tfidf_vector.fit_transform(df_trend['text'])

## Tweet Topology Classification

### Linear SVM using loose TF-IDF

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

class_name = df_trend['type'].tolist()

X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(loose_tfidf, class_name, df_trend.index, test_size=0.25, random_state=0, stratify=class_name)

from sklearn.svm import LinearSVC
new_svc_model = LinearSVC()

new_svc_model.fit(X_train, y_train)

y_pred = new_svc_model.predict(X_test)

from sklearn import metrics

print(metrics.classification_report(y_test, y_pred, target_names=df_trend['type'].unique()))

               precision    recall  f1-score   support

ongoing-event       0.99      0.96      0.98      2524
         meme       0.99      0.96      0.97     18618
         news       0.98      0.96      0.97     18757
commemorative       0.97      0.99      0.98     44677

     accuracy                           0.98     84576
    macro avg       0.98      0.97      0.97     84576
 weighted avg       0.98      0.98      0.98     84576



### Mutinominal Nayes Bay with loose TF-IDF

In [12]:
from sklearn.naive_bayes import MultinomialNB

new_MNB_model = MultinomialNB()
new_MNB_model.fit(X_train, y_train)

y_pred = new_MNB_model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=df_trend['type'].unique()))

               precision    recall  f1-score   support

ongoing-event       1.00      0.38      0.55      2524
         meme       0.98      0.80      0.88     18618
         news       0.97      0.81      0.88     18757
commemorative       0.84      0.99      0.91     44677

     accuracy                           0.89     84576
    macro avg       0.95      0.75      0.81     84576
 weighted avg       0.91      0.89      0.89     84576

