# This is Koshkimbayeva Zhaniya's NLP Week-13 Homework

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import pandas as pd
import re

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# from google.colab import drive
# drive.mount('/content/drive/Data')

In [3]:
# os.getcwd()
# os.listdir()

In [4]:

with open('train_positive.txt', 'r', encoding='utf-8') as f:
    texts_true = f.read().splitlines()

with open('train_negative.txt', 'r', encoding='utf-8') as f:
    texts_false = f.read().splitlines()

texts = texts_true + texts_false
count_true = len(texts_true)
count_false = len(texts_false)
total_lines = count_true + count_false

print("Positive:", count_true)
print("Negative:", count_false)
print("Total:", total_lines)

maxWordsCount = 10000

tokenizer = Tokenizer(
    num_words=maxWordsCount,
    oov_token="<UNK>",
    filters='!–"—#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r«»',
    lower=True,
    split=' ',
    char_level=False
)

tokenizer.fit_on_texts(texts)

print("Example word counts:", list(tokenizer.word_counts.items())[:10])
print("First text sample:", texts[0])

max_text_len = 100

data = tokenizer.texts_to_sequences(texts)
data_pad = pad_sequences(data, maxlen=max_text_len)

print("Data shape:", data_pad.shape)

Y = np.array([[1, 0]] * count_true + [[0, 1]] * count_false)

# Shuffle dataset
indices = np.random.permutation(len(texts))
X = data_pad[indices]
Y = Y[indices]

print("X shape:", X.shape)
print("Y shape:", Y.shape)

model = Sequential()
model.add(Embedding(maxWordsCount, 128, input_length=max_text_len))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(2, activation='softmax'))

model.summary()

model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(0.0001),
    metrics=['accuracy']
)

history = model.fit(X, Y, batch_size=32, epochs=100, validation_split=0.1)

reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

def sequence_to_text(seq):
    return " ".join(reverse_word_map.get(i, "<UNK>") for i in seq)


Positive: 3127
Negative: 3038
Total: 6165
Example word counts: [('сегодня', 1031), ('я', 2464), ('чувствую', 1160), ('радость', 255), ('и', 3205), ('внутреннее', 209), ('спокойствие', 769), ('это', 1958), ('вызывает', 540), ('у', 753)]
First text sample: Сегодня я чувствую радость и внутреннее спокойствие. И это вызывает у меня особые чувства.
Data shape: (6165, 100)
X shape: (6165, 100)
Y shape: (6165, 2)




Epoch 1/100
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 31ms/step - accuracy: 0.6852 - loss: 0.6358 - val_accuracy: 0.9611 - val_loss: 0.1323
Epoch 2/100
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9553 - loss: 0.1080 - val_accuracy: 0.9660 - val_loss: 0.0765
Epoch 3/100
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9658 - loss: 0.0697 - val_accuracy: 0.9692 - val_loss: 0.0579
Epoch 4/100
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9727 - loss: 0.0497 - val_accuracy: 0.9708 - val_loss: 0.0557
Epoch 5/100
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9824 - loss: 0.0435 - val_accuracy: 0.9741 - val_loss: 0.0476
Epoch 6/100
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.9809 - loss: 0.0388 - val_accuracy: 0.9838 - val_loss: 0.0442
Epoch 7/100
[1

### We test on 1st couple for negatives, then positive, and mixes with bias on positive or negative(more negative sentences than positive ones, for example)

In [9]:
test_files = [
    "test_1_neg.txt",
    "test_2_neg.txt",
    "test_3_pos.txt",
    "test_4_pos.txt",
    "test_5_mix_neg.txt",
    "test_6_mix_pos.txt",
]

results = []

for file in test_files:

    with open(file, "r", encoding="utf-8") as f:
        lines = f.readlines()
        lines[0] = lines[0].replace("\ufeff", "")

    text = " ".join(line.strip() for line in lines)

    seq = tokenizer.texts_to_sequences([text])
    seq_pad = pad_sequences(seq, maxlen=max_text_len)

    pred = model.predict(seq_pad, verbose=0)[0]
    pred_class = int(np.argmax(pred))

    results.append({
        "file": file,
        "predicted_class": pred_class,
        "raw_output": pred
    })

df_results = pd.DataFrame(results)
df_results


Unnamed: 0,file,predicted_class,raw_output
0,test_1_neg.txt,1,"[0.0001213456, 0.99987864]"
1,test_2_neg.txt,1,"[9.3736355e-07, 0.99999905]"
2,test_3_pos.txt,0,"[1.0, 6.0736403e-09]"
3,test_4_pos.txt,0,"[1.0, 6.146711e-09]"
4,test_5_mix_neg.txt,1,"[0.12940292, 0.8705971]"
5,test_6_mix_pos.txt,0,"[1.0, 3.554624e-08]"


### Validating, with free-hand text(My own)

In [10]:
def predict_sentiment(text: str):

    seq = tokenizer.texts_to_sequences([text])
    seq_pad = pad_sequences(seq, maxlen=max_text_len)

    prediction = model.predict(seq_pad)[0]
    label = np.argmax(prediction)

    if label == 1:
        sentiment = "NEGATIVE"
    else:
        sentiment = "POSITIVE"

    return sentiment, prediction

sentiment, raw = predict_sentiment("Куда катится мир?")
print(sentiment)
print(raw)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
NEGATIVE
[2.0656373e-06 9.9999797e-01]
