# Tạo Dictionary và Corpus

In [17]:
# pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import pandas as pd
import numpy as np
# Seed NumPy's legacy global random generator so NumPy-based randomness is repeatable across runs.
np.random.seed(400)

In [18]:
# Load the training split straight from the project's GitHub repository.
df_train = pd.read_csv(
    'https://raw.githubusercontent.com/namvnvx/DS107_Project/main/Demo_data/train_data.csv'
)

# Tweet texts (features) and their class labels, taken from the 'tweet' and 'class' columns.
X_train, y_train = df_train['tweet'].values, df_train['class'].values

In [19]:
# Download the WordNet corpus; the WordNetLemmatizer used later in this notebook needs it.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
# pandas is already imported in the first cell; this repeated import is harmless.
import pandas as pd
# English Snowball stemmer; lemmatize_stemming() reads this module-level name.
stemmer = SnowballStemmer("english")

In [21]:
def lemmatize_stemming(text):
    """Return the Snowball stem of the verb lemma of ``text``.

    The token is first lemmatized with WordNet as a verb (``pos='v'``);
    the resulting lemma is then stemmed with the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

# Tokenize and lemmatize
def pre_process(data):
    """Tokenize, filter and normalize every text in ``data``.

    Each text is tokenized with gensim's ``simple_preprocess``. Tokens that are
    gensim stopwords or have three characters or fewer are discarded; the
    remaining tokens are passed through ``lemmatize_stemming``.

    Returns a list with one list of normalized tokens per input text,
    in the same order as ``data``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        [
            lemmatize_stemming(word)
            for word in gensim.utils.simple_preprocess(document)
            if len(word) > 3 and word not in stopwords
        ]
        for document in data
    ]

In [22]:
# Build the tokenized documents: one list of normalized tokens per tweet
document_word = pre_process(X_train)
# Build the gensim dictionary (token <-> integer id mapping) from those documents
dictionary = gensim.corpora.Dictionary(document_word)

In [23]:
# Dictionary contents (token -> id) before filtering
print(dictionary.token2id)



In [24]:
# Run this cell to prune the dictionary in place: keep only tokens that occur in at
# least 15 documents and in no more than 10% of all documents, then keep at most the
# 100000 most frequent of the remaining tokens.
dictionary.filter_extremes(
    no_below=15,
    no_above=0.1,
    keep_n=100000,
)

In [25]:
# Dictionary contents (token -> id) after filtering
print(dictionary.token2id)

{'attempt': 0, 'balanc': 1, 'chang': 2, 'crisi': 3, 'recoveri': 4, 'someth': 5, 'viru': 6, 'write': 7, 'away': 8, 'elain': 9, 'mike': 10, 'news': 11, 'penni': 12, 'read': 13, 'stop': 14, 'counter': 15, 'time': 16, 'advic': 17, 'assist': 18, 'attorney': 19, 'avail': 20, 'hotlin': 21, 'iowa': 22, 'legal': 23, 'question': 24, 'diamond': 25, 'follow': 26, 'million': 27, 'remov': 28, 'twitter': 29, 'mayb': 30, 'mouth': 31, 'prevent': 32, 'prove': 33, 'shoot': 34, 'treat': 35, 'fact': 36, 'medium': 37, 'outlet': 38, 'tell': 39, 'mean': 40, 'posit': 41, 'test': 42, 'censor': 43, 'govern': 44, 'scientist': 45, 'stori': 46, 'turn': 47, 'institut': 48, 'work': 49, 'world': 50, 'death': 51, 'alert': 52, 'column': 53, 'futur': 54, 'mail': 55, 'post': 56, 'think': 57, 'today': 58, 'western': 59, 'advantag': 60, 'andrew': 61, 'budget': 62, 'control': 63, 'cuomo': 64, 'forc': 65, 'seiz': 66, 'state': 67, 'take': 68, 'total': 69, 'anyon': 70, 'donald': 71, 'know': 72, 'invit': 73, 'join': 74, 'result'

▶ Sau khi lọc, từ điển đã giảm rất nhiều: các từ xuất hiện trong ít hơn 15 câu hoặc trong hơn 10% tổng số câu đã bị loại bỏ.

In [26]:
# Build the bag-of-words corpus: one list of (token_id, count) pairs per document
bow_corpus = [dictionary.doc2bow(doc) for doc in document_word]

In [27]:
# Show one sample document in its bag-of-words form
bow_doc_x = bow_corpus[0]

# Each entry is a (token id, number of occurrences) pair.
for word_id, occurrences in bow_doc_x:
    print(f"Word {word_id} (\"{dictionary[word_id]}\") appears {occurrences} time.")

Word 0 ("attempt") appears 1 time.
Word 1 ("balanc") appears 1 time.
Word 2 ("chang") appears 1 time.
Word 3 ("crisi") appears 1 time.
Word 4 ("recoveri") appears 1 time.
Word 5 ("someth") appears 1 time.
Word 6 ("viru") appears 1 time.
Word 7 ("write") appears 1 time.


In [28]:
# Save the bag-of-words corpus and the dictionary to files
import pickle

# A context manager guarantees the file handle is flushed and closed,
# even if pickling raises part-way through.
with open('bow_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(bow_corpus, corpus_file)

# The dictionary is stored with gensim's own persistence format.
dictionary.save('dictionary.gensim')

# Vector hóa dữ liệu và Phân cụm (Clustering)

In [29]:
# Vectorizer: map every token of every document to its dictionary id.
# Tokens that are no longer in the filtered dictionary come back as -1
# (doc2idx's default unknown-word index), as visible in the output below.
sequences = [dictionary.doc2idx(text) for text in document_word]
sequences

[[7, 5, 3, 2, 4, 0, 1, -1, 6],
 [10, 12, 9, -1, 8, 14, 13, 11],
 [16, 15, -1, -1],
 [19, 17, 22, 23, 21, 20, 18, -1, 24],
 [25, -1, 11, 28, 29, 27, 26],
 [30, -1, 34, -1, -1, 31, 33, 32, 35],
 [39, 37, -1, 36, -1, -1, 11, 38],
 [42, 41, -1, 40],
 [13, 46, 44, -1, 43, 17, 45, 47],
 [49, 48, 50, -1],
 [51, -1],
 [57, 56, 54, 58, 59, 55, 53, -1, 52],
 [61, 64, 68, 60, -1, 3, 66, 69, 63, 67, 62, 65],
 [70, 72, -1, 71],
 [76, 73, 74, 77, 75, -1],
 [-1, 78, 79],
 [-1, 82, 83, -1, 84, -1, 81, -1, 80],
 [90, 87, 92, 85, 88, 92, -1, 94, 93, 86, 89, 91],
 [50, -1],
 [95, 96, -1, -1, 95, 97, 98],
 [102, 6, -1, 99, 101, 32, 100],
 [-1, 104, 106, 105, 42, 41, -1, 103],
 [111, 108, -1, 112, 110, 109, 107],
 [49, 114, 113, 115, 8, 6, 116],
 [-1, 118, 117],
 [119, 121, 122, 123, 34, 120, -1, -1, 119, 42, 41],
 [128, 129, 124, 127, 126, -1, 125],
 [130, 105, 133, 42, 134, 132, 131, -1, 135, -1],
 [141, 139, 142, 138, 137, 136, 140],
 [145, 146, 143, 58, 39, 16, 147, -1, 144, 148, -1, -1],
 [152, 149, 1

In [30]:
# Padding - append zeros so that every sequence has the same length
from keras.preprocessing.sequence import pad_sequences

# NOTE(review): the pad value 0 is also a real dictionary id (the printed token2id maps
# 'attempt' to 0) and -1 marks out-of-dictionary tokens, so padding cannot be told apart
# from a real token downstream — confirm whether ids should be shifted before padding.
padded_sequences = pad_sequences(sequences, padding='post')
padded_sequences

array([[   7,    5,    3, ...,    0,    0,    0],
       [  10,   12,    9, ...,    0,    0,    0],
       [  16,   15,   -1, ...,    0,    0,    0],
       ...,
       [ 434, 1495, 4013, ...,    0,    0,    0],
       [3358,   72,   -1, ...,    0,    0,    0],
       [ 193,   35,  131, ...,    0,    0,    0]], dtype=int32)

In [31]:
# Wrap the padded id matrix in a DataFrame: one row per document, one column per token position
vectorized_data = pd.DataFrame(data=padded_sequences, index=None)
vectorized_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,7,5,3,2,4,0,1,-1,6,0,0,0,0,0,0,0,0,0,0
1,10,12,9,-1,8,14,13,11,0,0,0,0,0,0,0,0,0,0,0
2,16,15,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,19,17,22,23,21,20,18,-1,24,0,0,0,0,0,0,0,0,0,0
4,25,-1,11,28,29,27,26,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114730,690,11,231,11,158,9,1165,36,-1,0,0,0,0,0,0,0,0,0,0
114731,621,1457,195,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
114732,434,1495,4013,498,801,314,75,0,0,0,0,0,0,0,0,0,0,0,0
114733,3358,72,-1,1850,1252,150,-1,1280,0,0,0,0,0,0,0,0,0,0,0


In [32]:
# Length check after padding: the longest row of the padded matrix.
# The result is stored as `max_len` so the built-in max() is not rebound to an
# integer for the rest of the kernel session; default=0 covers an empty matrix.
max_len = max((len(vectorized) for vectorized in padded_sequences), default=0)

max_len

19

In [33]:
# Final data: place the class labels in a 'class' column next to the padded feature columns
y_1 = pd.DataFrame(data=y_train, columns=['class'])
result = pd.concat([vectorized_data,y_1], axis=1)
result

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,class
0,7,5,3,2,4,0,1,-1,6,0,0,0,0,0,0,0,0,0,0,0
1,10,12,9,-1,8,14,13,11,0,0,0,0,0,0,0,0,0,0,0,0
2,16,15,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,19,17,22,23,21,20,18,-1,24,0,0,0,0,0,0,0,0,0,0,2
4,25,-1,11,28,29,27,26,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114730,690,11,231,11,158,9,1165,36,-1,0,0,0,0,0,0,0,0,0,0,0
114731,621,1457,195,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
114732,434,1495,4013,498,801,314,75,0,0,0,0,0,0,0,0,0,0,0,0,1
114733,3358,72,-1,1850,1252,150,-1,1280,0,0,0,0,0,0,0,0,0,0,0,0


In [34]:
from sklearn.cluster import KMeans

# Cluster the padded id vectors into 5 groups.
# No random_state is passed here; presumably the initialization then draws from NumPy's
# global RNG seeded in the first cell — verify before relying on repeatable clusters.
# NOTE(review): the features are raw dictionary ids, so Euclidean distances follow id
# ordering rather than word similarity — confirm this is intended.
kmeans = KMeans(n_clusters=5)
kmeans_fit = kmeans.fit(vectorized_data)
# Cluster index assigned to each training row
labels = kmeans.predict(vectorized_data)
# Coordinates of the fitted cluster centers
centroids = kmeans.cluster_centers_



In [35]:
# One-column frame holding the cluster label of every row
a = pd.DataFrame(data=labels, columns=['cluster'])
# Drop any 'cluster' column left by a previous run of this cell, so re-running it
# replaces the assignments instead of appending a duplicate 'cluster' column.
df_a = pd.DataFrame(data=result).drop(columns=['cluster'], errors='ignore')
# Features + class + cluster, aligned on the shared default row index
result = pd.concat([df_a, a], axis=1)

In [36]:
# Display the combined frame (features, class and cluster columns)
result

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,class,cluster
0,7,5,3,2,4,0,1,-1,6,0,...,0,0,0,0,0,0,0,0,0,0
1,10,12,9,-1,8,14,13,11,0,0,...,0,0,0,0,0,0,0,0,0,0
2,16,15,-1,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,19,17,22,23,21,20,18,-1,24,0,...,0,0,0,0,0,0,0,0,2,0
4,25,-1,11,28,29,27,26,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114730,690,11,231,11,158,9,1165,36,-1,0,...,0,0,0,0,0,0,0,0,0,0
114731,621,1457,195,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
114732,434,1495,4013,498,801,314,75,0,0,0,...,0,0,0,0,0,0,0,0,1,4
114733,3358,72,-1,1850,1252,150,-1,1280,0,0,...,0,0,0,0,0,0,0,0,0,1


In [37]:
# Distinct cluster labels, in order of first appearance
a['cluster'].unique()

array([0, 2, 4, 1, 3], dtype=int32)

In [38]:
# Share of rows assigned to each cluster
total_rows = len(result['cluster'])
for cluster_id in a['cluster'].unique():
    members = result[result['cluster'] == cluster_id]
    n_members = len(members)
    share = (n_members / total_rows) * 100
    print(f'\n_{cluster_id}_ : {n_members}/{total_rows} row --- {share:.3f} %')


_0_ : 46200/114735 row --- 40.267 %

_2_ : 24108/114735 row --- 21.012 %

_4_ : 14711/114735 row --- 12.822 %

_1_ : 15001/114735 row --- 13.074 %

_3_ : 14715/114735 row --- 12.825 %


In [39]:
# Persist the features, class labels and cluster assignments to a CSV file.
# to_csv is called with its defaults, so the row index is written as the first column.
df = pd.DataFrame(data=result)
df.to_csv('Vectorized_Clustered_TrainData.csv')