In [1]:
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from vncorenlp import VnCoreNLP
from unicodedata import normalize as unl
from sklearn.feature_extraction.text import (
    TfidfVectorizer,
    CountVectorizer,
    TfidfTransformer
)

In [2]:
# Location of the VnCoreNLP jar used for word segmentation.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
VNCORENLP_JAR = '/home/phucpx/vinbdi/Data-Interpretation/models/VnCoreNLP/VnCoreNLP-1.1.1.jar'

# Only the "wseg" annotator is requested; '-Xmx2g' is passed as the max heap size.
annotator = VnCoreNLP(VNCORENLP_JAR, annotators="wseg", max_heap_size='-Xmx2g')

In [3]:
def clean_text(text):
    """Normalize, word-segment, de-emoji and lowercase one raw text.

    Steps, in order:
      1. NFKC-normalize ``text``.
      2. Segment it with the module-level ``annotator`` (``annotator.tokenize``)
         and join every token of every returned sentence with single spaces.
      3. Delete all characters matched by the emoji pattern below.
      4. Lowercase the result and strip surrounding whitespace.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, space-joined, lowercased string.
    """
    normalized = unl('NFKC', text)

    # tokenize() is iterated as sentences, each an iterable of tokens;
    # flatten them while keeping their order.
    tokens = [token for sentence in annotator.tokenize(normalized) for token in sentence]
    joined = ' '.join(tokens)

    # NOTE(review): the last range (U+24C2..U+1F251) is very wide — besides
    # emojis it also spans e.g. the CJK ideograph blocks, so such characters
    # are removed as well. Confirm this is intended.
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

    # Drop every run of matched characters.
    without_emoji = emoji_pattern.sub(r'', joined)

    return without_emoji.lower().strip()

In [4]:
# Parse the annotated corpus (UTF-8 JSON) into memory.
with open('./data/data_merged_0308_fixed_capu_fixed_syserr_2-RAW-fixed.json', 'r', encoding='utf-8') as corpus_file:
    data = json.load(corpus_file)

# Each entry under document -> sentences is read for its 'content' field below.
sentences = data['document']['sentences']

# One cleaned string per sentence, in file order: docs[i] comes from sentences[i].
docs = [clean_text(entry['content']) for entry in sentences]

# Number of cleaned documents.
len(docs)

23990

In [5]:
# Bag-of-words term counts over the cleaned documents. min_df=5 asks
# scikit-learn to ignore terms that occur in fewer than 5 documents.
count_vectorizer = CountVectorizer(min_df=5)
# wm: sparse document-term count matrix, one row per entry of docs.
wm = count_vectorizer.fit_transform(docs)

In [6]:
# Row labels Doc0..Doc{n-1}. The row count is read from the matrix shape
# instead of iterating over every sparse row just to enumerate them.
doc_names = ['Doc{:d}'.format(idx) for idx in range(wm.shape[0])]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# list(...) keeps feat_names a plain list, as the old API returned.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feat_names = list(count_vectorizer.get_feature_names_out())
else:
    feat_names = count_vectorizer.get_feature_names()



In [7]:
# Build the preview frame directly from the sparse count matrix. Calling
# wm.toarray() would allocate a dense array holding every (document, feature)
# cell only to render a truncated table.
sample_df = pd.DataFrame.sparse.from_spmatrix(wm, index=doc_names, columns=feat_names)
sample_df

Unnamed: 0,00,000,01,02,0đ,10,100,1000,1005,1007,...,ầm_ầm,ẩn,ổn,ổn_áp,ổn_định,ủa,ủng_hộ,ức_chế,ứng,ứng_dụng
Doc0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc23985,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc23986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc23987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc23988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# TF-IDF weighting over the same cleaned documents, with the same min_df=5
# document-frequency cut-off as the count vectorizer.
tfidf_vectorizer = TfidfVectorizer(min_df=5)

# tfidfm: sparse document-term TF-IDF matrix, one row per entry of docs.
tfidfm = tfidf_vectorizer.fit_transform(docs)

In [9]:
# Row labels Doc0..Doc{n-1}. The row count is read from the matrix shape
# instead of iterating over every sparse row just to enumerate them.
doc_names = ['Doc{:d}'.format(idx) for idx in range(tfidfm.shape[0])]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# list(...) keeps feat_names a plain list, as the old API returned.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feat_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feat_names = tfidf_vectorizer.get_feature_names()



In [10]:
# Build the preview frame directly from the sparse TF-IDF matrix. Calling
# tfidfm.toarray() would allocate a dense array holding every
# (document, feature) cell only to render a truncated table.
sample_df = pd.DataFrame.sparse.from_spmatrix(tfidfm, index=doc_names, columns=feat_names)

sample_df

Unnamed: 0,00,000,01,02,0đ,10,100,1000,1005,1007,...,ầm_ầm,ẩn,ổn,ổn_áp,ổn_định,ủa,ủng_hộ,ức_chế,ứng,ứng_dụng
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc23985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc23986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc23987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc23988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# NOTE: despite the variable name, this selects the row at index 6.
first_doc_vector = tfidfm[6]

# Transpose the 1 x n_features row so every vocabulary entry becomes a row
# carrying its TF-IDF weight for this document.
tfidf_column = first_doc_vector.T.todense()
df = pd.DataFrame(tfidf_column, index=feat_names, columns=["tfidf"])

# Highest-weighted entries first.
dff = df.sort_values(by="tfidf", ascending=False)

dff.head(20)

Unnamed: 0,tfidf
cần,0.453013
nhieu,0.407048
nhà_phát_triển,0.377377
nâng_cấp,0.31591
hệ_thống,0.302533
đơ,0.297919
lúc,0.244516
bị,0.183033
đc,0.176298
vào,0.169541


In [12]:
# Densify the selected TF-IDF row to inspect its raw values.
first_doc_vector[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [13]:
# Number of entries in feat_names (the vectorizer vocabulary size).
len(feat_names)

2762

In [14]:
def get_label(instance):
    """Collect the 'target' of every tag attached to one sentence record.

    Args:
        instance: Mapping with a 'tags' entry; each tag is a mapping that
            provides 'polarity' and 'target'.

    Returns:
        The list of tag targets in tag order. An empty list is returned when
        'tags' is falsy, and also as soon as any tag has a falsy 'polarity'
        (targets gathered before that tag are discarded).
    """
    tags = instance['tags']
    if not tags:
        return []

    collected = []
    for tag in tags:
        if tag['polarity']:
            collected.append(tag['target'])
        else:
            # A single tag without polarity invalidates the whole sentence.
            return []

    return collected

In [15]:
with open('./data/data_merged_0308_fixed_capu_fixed_syserr_2-RAW-fixed.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# The file is only needed for parsing; everything below runs after it is closed.
sentences = data['document']['sentences']

# texts[i] / terms[i] describe sentences[i]. Later cells index the TF-IDF
# matrix by the same position i, so no sentence may be dropped here.
texts, terms = [], []

for idx, sent in enumerate(tqdm(sentences, desc="Processing")):
    term = get_label(sent)
    try:
        text = sent['content'].lower()
        term = [t.lower() for t in term]
    except (AttributeError, KeyError, TypeError) as err:
        # A bare `except:` used to skip the sentence here, which silently
        # shifted every later index. Report the problem and keep an empty
        # placeholder instead so positions stay aligned.
        print('Sentence {:d} is malformed ({!r}); keeping an empty placeholder'.format(idx, err))
        text, term = '', []
    texts.append(text)
    terms.append(term)

Processing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23990/23990 [00:00<00:00, 612197.25it/s]


In [16]:
# Minimum TF-IDF weight a vocabulary entry must exceed in a document to be
# kept as a candidate term for that document.
THRESHOLD = 0.3

# Row i of tfidfm is paired with texts[i] below; fail loudly if they diverge
# instead of producing terms for the wrong sentences.
assert tfidfm.shape[0] == len(texts), "tfidfm rows and texts are out of sync"

statis_terms = []

for i in tqdm(range(len(texts))):
    vector = tfidfm[i].toarray()[0]
    term = []

    # Vectorized threshold test: visit only the qualifying columns (in
    # ascending index order) rather than checking every feature in Python.
    for j in np.flatnonzero(vector > THRESHOLD):
        # Vocabulary entries may join several words with '_'; split them
        # back into individual words.
        term.extend(feat_names[j].replace('_', ' ').split())

    statis_terms.append(term)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23990/23990 [00:09<00:00, 2481.39it/s]


In [17]:
# Break every term phrase into whitespace-separated words, producing one flat
# word list per sentence (termsx[i] corresponds to terms[i]).
termsx = [
    [word for phrase in phrases for word in phrase.split()]
    for phrases in terms
]

In [48]:
def calculate_scores(texts, statis_terms, terms):
    """Score the word overlap between extracted and reference terms.

    For every position ``i`` in ``range(len(texts))`` the score is
    ``|A & B| / |A | B|`` where ``A`` is the set of ``statis_terms[i]`` and
    ``B`` the set of ``terms[i]``. Positions whose ``terms[i]`` is empty are
    skipped and contribute no score.

    Args:
        texts: Sequence used only for its length.
        statis_terms: Per-position lists of extracted words.
        terms: Per-position lists of reference words.

    Returns:
        A tuple ``(mean_score, scores)``: ``np.mean`` of the per-position
        scores, and the list of those scores.
    """
    scores = []

    for idx in tqdm(range(len(texts)), desc="Calculate scores"):
        reference_words = terms[idx]
        extracted = set(statis_terms[idx])
        reference = set(reference_words)

        # Positions without reference words are excluded from the average.
        if len(reference_words) == 0:
            continue

        scores.append(len(extracted & reference) / len(extracted | reference))

    return np.mean(scores), scores

In [49]:
# calculate_scores returns (mean, per-item list); only the mean is kept here,
# so `scores` holds a single number despite its plural name.
scores, _ = calculate_scores(texts, statis_terms, termsx)

Calculate scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23990/23990 [00:00<00:00, 395343.92it/s]


In [50]:
# Mean overlap score over all items that have at least one reference word.
scores

0.4095408775658988