<a href="https://colab.research.google.com/github/panghanwu/machine_learning_Elwing/blob/main/embedding_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

語言是人類在有意及無意之間訂定出來溝通方式，衍伸出抽象的概念及情緒含意，而難以有系統性的歸類。使用深度學習訓練出詞向量（word vector）模型是一種有效的逼近方法。

詞向量維度 128~512

In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

In [2]:
import glob

glob.glob('/root/.keras/datasets/aclImdb/train/pos/*')[0:6]

['/root/.keras/datasets/aclImdb/train/pos/987_8.txt',
 '/root/.keras/datasets/aclImdb/train/pos/8053_8.txt',
 '/root/.keras/datasets/aclImdb/train/pos/3319_9.txt',
 '/root/.keras/datasets/aclImdb/train/pos/4298_10.txt',
 '/root/.keras/datasets/aclImdb/train/pos/2363_7.txt',
 '/root/.keras/datasets/aclImdb/train/pos/12424_9.txt']

In [3]:
import os
import pandas as pd

def read(path):
  with open(path, 'r', encoding='utf-8') as f:
    content = f.read()
  return content

# get path
dn = os.path.dirname(dataset)
pattern = os.path.join(dn, 'aclImdb', 'train', 'pos', '*')
pos = glob.glob(pattern)
pattern = os.path.join(dn, 'aclImdb', 'train', 'neg', '*')
neg = glob.glob(pattern)
sentiments = [1]*len(pos) + [0]*len(neg)

contents = map(read, pos+neg)

pd.DataFrame(contents)

Unnamed: 0,0
0,The movie is great and I like the story. I pre...
1,This is of of Sammo's great early comedy films...
2,"As a long time Red Sox fan, I just had to go s..."
3,This movie awed me so much that I watch it at ...
4,This adaptation of M.R. James's short story 'A...
...,...
24995,It is unbelievable that a script as cliché and...
24996,A film like Amazing Grace and Chuck is a perfe...
24997,This is the biggest load of crap that I have s...
24998,"I awake suddenly, aware that I'm drooling onto..."


In [4]:
# map (similar as pd apply)
'-'.join(map(str, [1,2,3,4,5,6]))

'1-2-3-4-5-6'

In [5]:
def get_data(t):
  dn = os.path.dirname(dataset)
  pattern = os.path.join(dn, "aclImdb", t, "pos", "*.txt")
  pos = glob.glob(pattern)
  pattern = os.path.join(dn, "aclImdb", t, "neg", "*.txt")
  neg = glob.glob(pattern)
  sentiments = [1] * len(pos) + [0] * len(neg)
  contents = map(read, pos + neg)
  df = pd.DataFrame({
      "contents":contents,
      "sentiment":sentiments
  })
  return df

train_df = get_data('train')
test_df = get_data('test')
test_df

Unnamed: 0,contents,sentiment
0,"It kept me on the edge of my seat. True, the s...",1
1,Born in 1946 I was about eight years old when ...,1
2,In the 60's Cleveland television audiences cou...,1
3,I think it is a brilliant show with cool talki...,1
4,John Wayne's first starring role just blew me ...,1
...,...,...
24995,I am not a big fan of the Spielberg/Cruise ver...,0
24996,The director infuses this film with false dept...,0
24997,"As a flying and war movie buff, this ranks at ...",0
24998,I rented this movie under the impression that ...,0


In [6]:
# parameter
TOK = 3000
LEN = 512
EM = 128

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=TOK)
# fit_on_texts: fit_transform的fit
tok.fit_on_texts(train_df['contents'])
# seq: 把所有單字換成數字的序列
x_train_seq = tok.texts_to_sequences(train_df['contents'])
x_test_seq = tok.texts_to_sequences(test_df['contents'])

pd.DataFrame(x_train_seq)
# One-hot encoding will cause massive loads on RAM.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1776,1777,1778,1779,1780,1781,1782,1783,1784,1785,1786,1787,1788,1789,1790,1791,1792,1793,1794,1795,1796,1797,1798,1799,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,1,17,6,84,2,10,37,1,62.0,10.0,2781.0,11.0,17.0,71.0,82.0,17.0,138.0,1.0,2753.0,1189.0,17.0,2.0,705.0,17.0,10.0,40.0,373.0,1.0,879.0,10.0,37.0,3.0,632.0,2.0,79.0,3.0,280.0,62.0,7.0,7.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,11,6,4,4,84,399,209,105,11.0,215.0,3.0,2103.0,4.0,2537.0,1.0,2782.0,1.0,290.0,106.0,6.0,2186.0,16.0,1474.0,844.0,2.0,87.0,70.0,15.0,3.0,129.0,4.0,24.0,62.0,41.0,86.0,24.0,993.0,938.0,24.0,444.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,14,3,193,55,764,334,10,40,66.0,5.0,137.0,64.0,1.0,17.0,9.0,13.0,84.0,134.0,47.0,67.0,112.0,27.0,192.0,409.0,926.0,36.0,1.0,764.0,808.0,47.0,68.0,84.0,660.0,4.0,46.0,4.0,58.0,511.0,764.0,1838.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,11,17,69,35,73,12,10,103,9.0,30.0,219.0,277.0,3.0,288.0,30.0,208.0,10.0,166.0,9.0,30.0,208.0,10.0,166.0,9.0,2.0,10.0,207.0,166.0,1.0,102.0,403.0,2.0,144.0,9.0,6.0,3.0,17.0,12.0,284.0,22.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,11,1250,4,1976,1477,343,62,646,36.0,3.0,13.0,83.0,614.0,20.0,695.0,696.0,8.0,20.0,1.0,114.0,293.0,1305.0,2974.0,467.0,10.0,216.0,12.0,9.0,13.0,109.0,2444.0,171.0,20.0,2974.0,467.0,2.0,869.0,5.0,199.0,9.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,9,6,1295,12,3,226,14,1593,2.0,337.0,1753.0,97.0,94.0,98.0,265.0,57.0,1.0,389.0,28.0,1.0,411.0,8.0,11.0,17.0,163.0,303.0,37.0,3.0,309.0,1177.0,353.0,1511.0,295.0,1.0,75.0,2001.0,392.0,1643.0,1734.0,1298.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24996,3,19,37,477,1695,2,6,3,401.0,459.0,4.0,86.0,1.0,314.0,8.0,11.0,701.0,40.0,149.0,76.0,9.0,33.0,112.0,119.0,2.0,1992.0,5.0,1034.0,2.0,1034.0,80.0,989.0,7.0,7.0,11.0,19.0,2026.0,16.0,3.0,114.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24997,11,6,1,1124,4,592,12,10,25.0,107.0,8.0,3.0,193.0,55.0,1.0,233.0,55.0,10.0,1796.0,3.0,17.0,35.0,73.0,13.0,1861.0,146.0,501.0,300.0,2.0,47.0,6.0,424.0,54.0,210.0,5.0,11.0,17.0,546.0,5.0,64.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24998,10,1081,1884,12,143,1644,1,1106,2.0,920.0,42.0,3.0,2265.0,2322.0,2660.0,135.0,13.0,10.0,2765.0,119.0,10.0,566.0,58.0,416.0,39.0,2503.0,29.0,4.0,58.0,97.0,9.0,27.0,1155.0,7.0,7.0,54.0,10.0,20.0,704.0,134.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [8]:
# tok.word_index
tok.index_word[6]

'is'

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# padding: 截長or補短
# truncating: pre 12345 -> 45, post 12345 -> 12 
x_train_pad = pad_sequences(x_train_seq, LEN)
x_test_pad = pad_sequences(x_test_seq, LEN)
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,17,6,84,2,10,37,1,62,10,2781,11,17,71,82,17,138,1,2753,1189,17,2,705,17,10,40,373,1,879,10,37,3,632,2,79,3,280,62,7,7
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,158,33,25,98,144,325,81,8,2576,1983,8,70,10,479,204,107,192,425,1211,14,605,1614,81,8,151,296,99,96,7,7,11,6,28,6,15,98,39,1474,844,334
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,76,5,810,1624,6,639,3,212,64,45,22,23,264,15,3,17,5,103,16,126,1301,39,2665,82,258,45,22,23,3,2353,334,2,57,50,258,45,22,23,764,334
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,157,2,254,82,15,125,39,430,1755,119,3,125,289,4,3,746,437,8,1,390,4,632,71,19,259,10,255,95,196,1768,8,65,203,93,11,19,6,3,212,64
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,71,1670,231,9,97,25,410,3,368,84,1250,14,9,6,9,29,761,3,114,2,3,224,50,5,267,1,1306,59,25,74,2373,7,7,10,199,9,690,43,4,155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,18,89,448,1,94,65,903,14,70,1,862,274,77,126,122,45,22,116,39,141,24,2150,5,2220,43,57,50,10,654,26,97,199,69,145,104,631,4,58,110,142
24996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2770,34,1072,11,7,7,1,19,6,287,61,238,4,155,379,7,7,45,22,178,5,64,3,84,19,41,1,4,322,1227,16,881,302,7,7,440,22,23,54,1260
24997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,44,573,704,113,2,6,52,361,349,58,461,2,10,848,676,122,1,17,100,41,317,32,531,72,66,801,5,199,9,3,577,18,161,97,604,11,78,21,103,9
24998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,53,3,168,669,68,457,8,1,67,2,26,56,12,234,29,1,795,77,27,2356,54,28,77,147,86,370,9,6,2,26,56,67,398,1,1779,7,7,871,74,2808


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Flatten, Dropout

layers = [
      # 1 to 3000 (token) + 0 (padding)
      # param 3001 * 128
      Embedding(TOK+1, EM, mask_zero=True, input_length=LEN),
      Flatten(),
      Dense(128, activation='relu'),
      Dropout(0.25),
      Dense(2, activation='softmax')
]

model = Sequential(layers)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 512, 128)          384128    
_________________________________________________________________
flatten (Flatten)            (None, 65536)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               8388736   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 8,773,122
Trainable params: 8,773,122
Non-trainable params: 0
_________________________________________________________________


In [11]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

model.compile(loss=SparseCategoricalCrossentropy(),
       optimizer=Adam(),
       metrics=["accuracy"])

y_train = train_df["sentiment"]
y_test = test_df["sentiment"]

callbacks = [
    ModelCheckpoint("model.h5", save_best_only=True),
    EarlyStopping(patience=5, restore_best_weights=True)
]
model.fit(x_train_pad, 
     y_train,
     batch_size=100,
     epochs=50,
     validation_split=0.1,
     callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


<tensorflow.python.keras.callbacks.History at 0x7fb3cfae4828>

In [12]:
model.evaluate(x_test_pad, y_test)



[0.30636370182037354, 0.8694400191307068]

In [13]:
"""Smaller model to avoid overfitting"""
from tensorflow.keras.layers import GlobalAveragePooling1D

layers = [
  Embedding(TOK+1, EM, mask_zero=True, input_length=LEN),
  GlobalAveragePooling1D(),
  Dense(2, activation="softmax")  
]
model = Sequential(layers)
model.summary()
# This model is actually a linear model (no activation).

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 512, 128)          384128    
_________________________________________________________________
global_average_pooling1d (Gl (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
Total params: 384,386
Trainable params: 384,386
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.compile(loss=SparseCategoricalCrossentropy(),
       optimizer=Adam(),
       metrics=["accuracy"])

callbacks = [
    ModelCheckpoint("model.h5", save_best_only=True),
    EarlyStopping(patience=5, restore_best_weights=True)
]
model.fit(x_train_pad, 
     y_train,
     batch_size=100,
     epochs=50,
     validation_split=0.1,
     callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


<tensorflow.python.keras.callbacks.History at 0x7fb3ccee5588>

In [15]:
model.evaluate(x_test_pad, y_test)



[0.2877269685268402, 0.8821600079536438]

In [16]:
"""Transfer"""
layers = [
      Embedding(TOK+1, EM, mask_zero=True),
      GlobalAveragePooling1D()
]

w = model.layers[0].get_weights()
infer = Sequential(layers)
layers[0].set_weights(w)
infer.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         384128    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
Total params: 384,128
Trainable params: 384,128
Non-trainable params: 0
_________________________________________________________________


In [17]:
infer.layers[0].get_weights()

[array([[-0.02218994,  0.00415281, -0.04009752, ...,  0.01570335,
          0.01934442, -0.03692819],
        [ 0.01228991, -0.00661693, -0.05040083, ..., -0.06080383,
          0.04308648,  0.02012643],
        [-0.0095779 , -0.0284681 ,  0.00342008, ..., -0.03096187,
          0.05158565,  0.04923393],
        ...,
        [-0.1728997 , -0.14244229, -0.17750655, ..., -0.18114968,
          0.14085631,  0.16097908],
        [-0.13334316, -0.08936356, -0.1524125 , ..., -0.15768898,
          0.12056544,  0.07930259],
        [-0.00436962,  0.04573396, -0.00558579, ..., -0.00042681,
          0.03731621,  0.01035265]], dtype=float32)]

In [18]:
# 2500, 512
# [512 list, 512 list]
target = 'like'
# [1 list]
pre = infer.predict([[tok.word_index[target]]])
print(pre.shape)
print(pre[0])

(1, 128)
[ 0.0080252  -0.02995979  0.04983772 -0.01563751  0.00305136  0.01015622
  0.03438783 -0.02202568 -0.04169523 -0.02366203 -0.0009949   0.01002031
  0.01515875  0.02979215  0.0080432   0.0057831   0.01696806  0.04761397
 -0.04691017  0.02165945 -0.00587998  0.00931133  0.02309883  0.03182927
 -0.04906029  0.01026152 -0.01076967  0.02629391  0.0337213  -0.03321799
  0.0031515   0.05292618  0.03292002 -0.0581564  -0.02957845  0.03056128
 -0.04184957 -0.03976836 -0.0417236   0.01729242 -0.00209633  0.01507044
 -0.02631672 -0.01934501  0.0210089  -0.00767471  0.02637418  0.04384295
  0.04369488 -0.00324084  0.04800795 -0.00501041  0.0384395   0.03132186
 -0.01161936 -0.02700825  0.02187215 -0.01637232 -0.00180008  0.04083262
 -0.0038489   0.00554547 -0.03265176 -0.04943574 -0.03201077 -0.00017197
 -0.01198093 -0.02868249  0.00455019  0.00852276  0.02849007  0.02419498
 -0.00625032  0.04185596  0.0168527  -0.0008918   0.03547523  0.00821815
 -0.00297699 -0.04017522  0.05617436 -0.02