<br>

# Training a logistic regression model for document classification

In [20]:
# Load the movie-review CSV into a DataFrame.
# NOTE(review): absolute path under /content/drive/MyDrive - presumably a Colab
# Google Drive mount; adjust when running elsewhere.
df = pd.read_csv(
    '/content/drive/MyDrive/01NLP/data/movie_data.csv',
    encoding='utf-8',
)

In [21]:
# (rows, columns) of the frame just loaded.
df.shape

(50000, 2)

In [22]:
# Keep only the first 10,000 rows to keep the experiment small.
# The explicit .copy() makes the subset an independent frame instead of a slice
# of the full 50,000-row frame, so the later column assignment on `df` works on
# its own data and the larger frame's buffer can be released.
df = df[0:10000].copy()

In [23]:
import re
def preprocessor(text):
    """Clean one raw review string for bag-of-words feature extraction.

    Steps, in order:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons such as ``:-)``, ``;-(``, ``=-D``, ``:-P``, ``:D``
         from the tag-free text (before lowercasing, so ``D``/``P`` keep their case).
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, space-separated, with the ``-`` nose
         removed, directly after the cleaned text.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned, lowercased text followed by the normalized emoticons.
    """
    # Raw strings keep the regex escapes (\), \(, \W) from being parsed as
    # (invalid) Python string escapes, which newer interpreters warn about.
    text = re.sub(r'<[^>]*>', '', text)  # strip HTML-like tags
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)  # :-) ;-( =-D :-P :D :-(
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [24]:
# Clean every review with preprocessor(); this overwrites the raw text in the
# 'review' column, so re-running the cell re-cleans already-cleaned text.
df['review'] = df['review'].map(preprocessor) # or apply

In [25]:
import nltk

# Download the NLTK 'stopwords' corpus that stopwords.words() reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [26]:
from nltk.corpus import stopwords

# English stop-word list from NLTK.
stop = stopwords.words('english')
# Passed to TfidfVectorizer through its stop_words argument.

In [27]:
from nltk.stem.porter import PorterStemmer

# Single Porter stemmer instance shared by tokenizer_porter().
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return a list of the Porter stem of each token."""
    return list(map(porter.stem, text.split()))
# Passed to TfidfVectorizer through its tokenizer argument.

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set. Stratifying on 'sentiment' keeps the
# label proportions the same in both parts; random_state pins the shuffle.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=1,
    stratify=df['sentiment'],
)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

TRAIN size: 9000
TEST size: 1000


In [29]:
# Pull features (review text) and labels (sentiment) out of each split;
# .values converts the pandas column to an array.
X_train, y_train = df_train['review'].values, df_train['sentiment'].values
X_test, y_test = df_test['review'].values, df_test['sentiment'].values

In [30]:
# Peek at the first training review; at this point X_train still holds cleaned text.
X_train[0]

'in the previews the 40 year old virgin boasts the image of another immature sex romp about a 40 ish lonely guy who suddenly feels the urge to do the deed simply because he hasn t too many past bad experiences have dampened his enthusiasm to the point that he avoids women completely and then the unexpected happens he falls in love what s more there s a movie out about it and it s called the 40 year old virgin the virgin of the title is andy stitzer steve carell who is indeed 40 works as an employee at an electronics store and collects vintage action figures which are displayed all throughout his nice bachelor pad for all to see he has a lovely home theater system and watches survivor with his two kind elderly neighbors he s a pretty picturesque definition of the lonely guy who needs to go out more and talk to more women now here s the real novelty with this picture it does the impossible task of actually dealing with its subject matter in a cute mature fashion this is a movie that coul

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer # not TfidfTransformer which uses TF from CountVectorizer as input

# Turn each review into a TF-IDF feature vector (text -> numeric features).
# Stop-word filtering and Porter stemming are configured on the vectorizer itself.
# token_pattern=None: the default token pattern is not used when a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a UserWarning.
# NOTE(review): the stop-word list is unstemmed while the tokenizer stems, so some
# stop words may slip through - confirm whether that is intended.
tfidf = TfidfVectorizer(stop_words=stop, tokenizer=tokenizer_porter, token_pattern=None)
# Fit on the text column of df_train rather than on X_train: X_train is rebound
# to the numeric matrix here, so reading from df_train keeps this cell re-runnable.
# NOTE(review): .toarray() materialises the sparse result as a dense array, which
# is memory-hungry for a large vocabulary.
X_train = tfidf.fit_transform(df_train['review'].values).toarray() # X_train: dense array



In [38]:
# (number of training reviews, number of TF-IDF features).
X_train.shape

(9000, 35042)

In [39]:
# Vectorize the held-out reviews with the already-fitted vectorizer (transform
# only, no refit). Reading the text from df_test rather than from X_test keeps
# this cell re-runnable, since X_test is rebound to the numeric matrix here.
X_test = tfidf.transform(df_test['review'].values).toarray() # X_test: dense array

In [40]:
# Should have the same number of columns as the training matrix.
X_test.shape

(1000, 35042)

In [44]:
# Same row as inspected earlier, now as its TF-IDF feature vector.
X_train[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# logistic regression: classification

In [41]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with scikit-learn's default settings,
# trained on the TF-IDF features.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [42]:
lr.score(X_test, y_test) # mean accuracy on the held-out test set

0.87

In [None]:
# nn model dense layers

In [43]:
from tensorflow.keras import models
from tensorflow.keras import layers

# Small feed-forward binary classifier on the TF-IDF vectors:
# two ReLU hidden layers (16 and 8 units), each followed by 30% dropout,
# then a single sigmoid unit.
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(8, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),  # output: probability of class 1
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                560688    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dropout_1 (Dropout)         (None, 8)                 0         
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 560833 (2.14 MB)
Trainable params: 560833 (2.14 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [45]:
from tensorflow.keras import losses, metrics, optimizers

# Binary cross-entropy loss for the sigmoid output, optimized with Adam;
# binary accuracy is reported alongside the loss.
model.compile(
    loss=losses.binary_crossentropy,
    optimizer=optimizers.Adam(learning_rate=0.001),
    metrics=['binary_accuracy'],
)

In [46]:
# Train for 10 epochs in mini-batches of 512; validation_split=0.2 sets aside
# that fraction of the training data for per-epoch validation.
# The returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [47]:
# Score the network on the held-out test set; with the compile() settings used
# here the result is [loss, binary_accuracy].
model.evaluate(X_test, y_test)



[0.3930196166038513, 0.871999979019165]



---

