In [369]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.model_selection import train_test_split

from tensorflow.keras.datasets import imdb
from tensorflow.keras.datasets import reuters
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Flatten, Embedding, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.utils import pad_sequences, to_categorical

In [125]:
word_index = imdb.get_word_index()
index_word = {v:k for k,v in word_index.items()}

In [126]:
(X_, y_), (X_test, y_test) = imdb.load_data()

In [127]:
index_word.get(X_[0][0])

'the'

In [128]:
def imdb_comment(x):
    comment = ''
    for word in x:
        comment += index_word.get(word) + ' '
    return comment

In [129]:
imdb_comment(X_[0])

"the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room titillate it so heart shows to years of every never going villaronga help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but pratfalls to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other tricky in of seen over landed for anyone of gilmore's br show's to whether from than out themselves history he name half some br of 'n odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but wh

In [130]:
y_[0]

1

In [131]:
X_.shape

(25000,)

In [132]:
pd.Series(X_).apply(len).max()

2494

In [133]:
word_set = []
for word in X_:
    word_set.extend(word)

In [134]:
word_cnt = Counter(word_set)

In [135]:
word_cnt.most_common(1000)

[(4, 336148),
 (5, 164097),
 (6, 163040),
 (7, 145847),
 (8, 135708),
 (9, 107313),
 (10, 101871),
 (11, 93934),
 (12, 79058),
 (13, 77142),
 (14, 75974),
 (15, 69787),
 (16, 48195),
 (17, 46927),
 (18, 44335),
 (19, 44122),
 (20, 43564),
 (21, 42594),
 (22, 39095),
 (23, 34188),
 (24, 30610),
 (25, 29877),
 (26, 29425),
 (27, 29366),
 (28, 27726),
 (29, 26952),
 (30, 26948),
 (31, 26513),
 (1, 25000),
 (32, 23953),
 (33, 23507),
 (34, 22539),
 (35, 21538),
 (36, 21139),
 (37, 20599),
 (38, 20586),
 (39, 20494),
 (40, 20272),
 (41, 18407),
 (42, 17994),
 (43, 17759),
 (44, 17371),
 (45, 17153),
 (46, 17092),
 (48, 16790),
 (47, 16790),
 (49, 15743),
 (50, 15735),
 (51, 15349),
 (52, 15100),
 (53, 14246),
 (54, 14175),
 (55, 14062),
 (56, 13274),
 (57, 12690),
 (58, 12682),
 (59, 12657),
 (60, 12650),
 (61, 12492),
 (62, 12238),
 (63, 12041),
 (64, 11915),
 (65, 11892),
 (66, 11734),
 (67, 11460),
 (68, 11376),
 (69, 11287),
 (70, 11120),
 (71, 10783),
 (72, 10742),
 (73, 10644),
 (74, 

In [136]:
words = [words for words, _ in word_cnt.most_common(1000)]

In [137]:
X_[0][0]

1

In [138]:
def common_word(x):
    for idx,word in enumerate(x):
        if word not in words: ## most_common(1000)
            x[idx] = 0
    return x

In [139]:
def remove_zero(x):
    return np.array(x)[np.array(x) > 0]

In [140]:
X_ = pd.Series(X_).apply(common_word,1).apply(remove_zero,1)

In [141]:
np.ceil(X_.apply(len).mean()).astype(int)

183

In [142]:
def comment_slice(x):
    if len(x) < 183:
        tmp = [0] * (183 - (len(x)))
        x = np.append(x, tmp)
    return x[:183]

In [143]:
X_ = X_.apply(comment_slice,1)

In [144]:
X_test = pd.Series(X_test).apply(common_word,1).apply(remove_zero,1).apply(comment_slice,1)

In [145]:
X_test.apply(len)

0        183
1        183
2        183
3        183
4        183
        ... 
24995    183
24996    183
24997    183
24998    183
24999    183
Length: 25000, dtype: int64

In [146]:
X_train, X_validation, y_train, y_validation = train_test_split(X_, y_)

In [157]:
X_train_trans = X_train.values[0].reshape(-1,183)
for i in X_train.values[1:]:
    X_train_trans = np.append(X_train_trans,i.reshape(-1,183),0)

In [159]:
X_validation_trans = X_validation.values[0].reshape(-1,183)
for i in X_validation.values[1:]:
    X_validation_trans = np.append(X_validation_trans,i.reshape(-1,183),0)

In [160]:
model = Sequential(
[Dense(1024, activation = 'relu', input_shape=(183,)),
BatchNormalization(),
Dense(253, activation = 'relu'),
BatchNormalization(),
Dense(32, activation = 'relu'),
BatchNormalization(),
Dense(1, activation = 'sigmoid')])




In [161]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics='accuracy')




In [162]:
model.fit(X_train_trans, y_train, epochs=20, validation_data=(X_validation_trans,y_validation))

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1b24952dcd0>

In [186]:
np.where(model.predict(X_train_trans[[1557]]) > 0.5, 1, 0)[0][0], y_train[1557]



(1, 1)

In [188]:
np.where(model.predict(X_test[1557].reshape(1,-1)) > 0.5, 1, 0)[0][0], y_test[1557]



(1, 1)

In [223]:
comment = """
On 8 April, there will still be six full Premier League match rounds to play, with the possibility of some rearranged games, meaning it is likely the relegation places will not have been decided.

So any decision that ends in a points deduction is virtually certain to be subject to an appeal, even as a pre-emptive strike by the clubs - and that brings the potential for chaos as there is no guarantee any appeal would be heard before the final games of the campaign on 19 May.

Those final-day fixtures? Burnley v Nottingham Forest and Arsenal v Everton.

In terms of actually playing matches, that is when the season ends. But in terms of regulation, the 2023-24 season remains 'live' until the annual general meeting in June when the relegated clubs transfer their certificates. It is this point at which everything has to be determined.
"""

In [221]:
def remove_words(x):
    x = x.replace('.','')
    x = x.replace(',','')
    x = x.replace('"','')
    x = x.replace('?','')
    x = x.replace('!','')
    x = x.replace("'",'')
    return x

In [236]:
def imdb_predict(model, comment):
    model = model
    word = []
    for i in comment.split(' '):
        i = remove_words(i)
        if word_index.get(i) != None:
            word.append(word_index.get(i))
    word = common_word(word)
    word = remove_zero(word)
    word = comment_slice(word)
    word = word.reshape(1,-1)
    return np.where(model.predict(word,verbose=0)[0] > 0.5, '긍정', '부정')[0]

In [237]:
imdb_predict(model,comment)

'부정'

In [238]:
from tensorflow.keras.layers import Embedding

In [248]:
model1 = Sequential(
[
    Embedding(1000, 8, input_length=183),
    Flatten(),
    Dense(128,activation='relu'),
    BatchNormalization(),
    Dense(1,activation='sigmoid')
])

In [251]:
model1.compile(optimizer='adam',loss='binary_crossentropy',metrics='accuracy')

In [253]:
model1.predict(np.random.randint(1,1001,183).reshape(1,-1))



array([[0.49550107]], dtype=float32)

In [304]:
(X_, y_), (X_test, y_test) = imdb.load_data()
X = np.append(X_, X_test, 0)
y = np.append(y_, y_test)

In [305]:
# pad_sequences(X_test, 200)

In [306]:
word_index = imdb.get_word_index()
index_word = {v:k for k,v in word_index.items()}

In [307]:
for idx1, ind_word in enumerate(X):
    for idx2, val in enumerate(ind_word):
        X[idx1][idx2] = index_word.get(X[idx1][idx2])

In [308]:
word_list = []
for words in X:
    word_list.extend(words)

In [309]:
word_freq = Counter(word_list).most_common(1000)

In [310]:
word_to_index = {v:k for k, (v,_) in enumerate(word_freq,1)}

In [311]:
index_to_word = {v:k for k,v in word_to_index.items()}

In [312]:
for idx1, ind_word in enumerate(X):
    for idx2, val in enumerate(ind_word):
        if word_to_index.get(X[idx1][idx2]) != None:
            X[idx1][idx2] = word_to_index.get(X[idx1][idx2])
        else:
            X[idx1][idx2] = 0

In [316]:
X = pd.Series(X).apply(lambda x:np.array(x)[np.array(x)>0]).values

In [318]:
X = pad_sequences(X,200)

In [319]:
X_, X_test, y_, y_test = train_test_split(X,y)

In [322]:
X_train, X_validation, y_train, y_validation = train_test_split(X_, y_)

In [324]:
y_train = y_train.reshape(-1,1)
y_validation = y_validation.reshape(-1,1)
y_test = y_test.reshape(-1,1)

In [329]:
model_embed = Sequential(
[
    Embedding(1001, 8, input_length=200),
    Dropout(0.5),
    Flatten(),
    Dense(256,activation='relu'),
    BatchNormalization(),
    Dense(128,activation='relu'),
    BatchNormalization(),
    Dense(10,activation='relu'),
    BatchNormalization(),
    Dense(1,activation='sigmoid')
])

In [330]:
model_embed.compile(optimizer='adam',loss='binary_crossentropy',metrics='accuracy')

In [331]:
model_embed.fit(X_train, y_train, epochs=30, validation_data=(X_validation, y_validation))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x1b267d16350>

In [332]:
model_embed.evaluate(X_test,y_test)



[0.4440276622772217, 0.8420799970626831]

In [352]:
(X_, y_), (X_test, y_test) = reuters.load_data()
X = np.append(X_, X_test, 0)
y = np.append(y_, y_test)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz


In [353]:
word_index = imdb.get_word_index()
index_word = {v:k for k,v in word_index.items()}

In [354]:
for idx1, ind_word in enumerate(X):
    for idx2, val in enumerate(ind_word):
        X[idx1][idx2] = index_word.get(X[idx1][idx2])

In [355]:
word_list = []
for words in X:
    word_list.extend(words)

In [356]:
word_freq = Counter(word_list).most_common(1000)

In [357]:
word_to_index = {v:k for k, (v,_) in enumerate(word_freq,1)}

In [358]:
index_to_word = {v:k for k,v in word_to_index.items()}

In [359]:
for idx1, ind_word in enumerate(X):
    for idx2, val in enumerate(ind_word):
        if word_to_index.get(X[idx1][idx2]) != None:
            X[idx1][idx2] = word_to_index.get(X[idx1][idx2])
        else:
            X[idx1][idx2] = 0

In [360]:
X = pd.Series(X).apply(lambda x:np.array(x)[np.array(x)>0]).values

In [361]:
X = pad_sequences(X,200)

In [362]:
y = to_categorical(y)

In [363]:
X_, X_test, y_, y_test = train_test_split(X,y)

In [364]:
X_train, X_validation, y_train, y_validation = train_test_split(X_, y_)

In [380]:
model_reuter = Sequential(
[
    Embedding(1001, 20, input_length=200),
    Conv1D(40,10,activation='relu'),
    MaxPooling1D(5),
    Flatten(),
    Dense(256,activation='relu'),
    BatchNormalization(),
    Dense(128,activation='relu'),
    BatchNormalization(),
    Dense(46,activation='softmax')
])

In [377]:
model_reuter.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 200, 8)            8008      
                                                                 
 conv1d_3 (Conv1D)           (None, 198, 20)           500       
                                                                 
 max_pooling1d_3 (MaxPoolin  (None, 99, 20)            0         
 g1D)                                                            
                                                                 
 flatten_11 (Flatten)        (None, 1980)              0         
                                                                 
 dense_38 (Dense)            (None, 256)               507136    
                                                                 
 batch_normalization_24 (Ba  (None, 256)               1024      
 tchNormalization)                                   

In [381]:
model_reuter.compile(optimizer='adam',loss='binary_crossentropy',metrics='accuracy')

In [382]:
model_reuter.fit(X_train, y_train, epochs=30, validation_data=(X_validation, y_validation))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x1b244fdac10>

In [383]:
model_reuter.evaluate(X_test, y_test)



[0.07436802983283997, 0.691485583782196]