# Importing Libraries

In [1]:
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt

# noinspection PyUnresolvedReferences
from tensorflow.keras.preprocessing.text import Tokenizer
# noinspection PyUnresolvedReferences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from helper import *





# Loading Data

In [3]:
data_path = "../data/processed_data_temp.csv"
pickle_path = "../data/data.pickle"

model_name = "like_model"

# with open(pickle_path, "rb") as f:
#     data = pickle.load(f)

df = pd.read_csv(data_path)
# drop id, source, date, time, views, forwards, containing_media
df = df.drop(columns=["id", "source", "date", "time", "views", "forwards", "containing_media", "sum"])
df = df.dropna()
df

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc7 in position 51: invalid continuation byte

In [None]:
def filter_string_rows(df):
    data = []
    # Check if row[0] is a string
    for row in df.values:
        if isinstance(row[0], str):
            data.append(row)

    print(f"Data length: {len(data)}")
    return data

def clean_data(data):
    for i in range(len(data)):
        data[i][0] = clean_text(data[i][0])
        if i % 1000 == 0:
            print(f"Cleaning: {i + 1}/{len(data)}", end="\r")
    print(f"Cleaning: {len(data)}/{len(data)}")
    return data

def process_data(data):
    for i in range(len(data)):
        data[i][0] = replace_numbers(data[i][0])
        if i % 1000 == 0:
            print(f"Processing: {i + 1}/{len(data)}", end="\r")
    print(f"Processing: {len(data)}/{len(data)}")
    return data

def articles_answers_crop(data):
    articles = [row[0] for row in data]
    answers = [[float(value) for value in row[1:]] for row in data]
    return articles, answers

def argmax_convertion(answers):
    # convert [0.8, 0.1, 0.1] to [1, 0, 0]
    one_maxtix = np.eye(len(answers[0]))
    for i in range(len(answers)):
        answers[i] = one_maxtix[np.argmax(answers[i])]
        if i % 1000 == 0:
            print(f"Argmax: {i + 1}/{len(answers)}", end="\r")
    print(f"Argmax: {len(answers)}/{len(answers)}")
    return answers


data = filter_string_rows(df)
data = clean_data(data)
data = process_data(data)
articles, answers = articles_answers_crop(data)
answers = argmax_convertion(answers)
print(f"Articles: {len(articles)}, Answers: {len(answers)}")
print(f"Example: {articles[0]}, {answers[0]}")

# Preprocessing Data

In [None]:
(train_articles, train_answers), (validation_articles, validation_answers) = split_data(articles, answers, 0.95)
print(f"Train: {len(train_articles)}, Validation: {len(validation_articles)}")

# Tokenizing, truncating and padding data

In [None]:
vocab_size = 50_000
max_length = 100
trunc_type = "post"
padding_type = "post"
oov_tok = "<OOV>"

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

print(f"Word Index: {len(word_index)}")
print(f"Example from end: {list(word_index.items())[:20]}")

In [None]:
# save tokenizer
with open(f"../models/{model_name}_tokenizer.pickle", "wb") as f:
    pickle.dump(tokenizer, f)

In [None]:
def words_per_article(articles):
    # return min, max, mean
    min_words = 10**10
    max_words = 0
    total_words = 0
    for article in articles:
        words = len(article.split(" "))
        if words < min_words:
            min_words = words
        if words > max_words:
            max_words = words
        total_words += words
    return min_words, max_words, total_words / len(articles)

min_words, max_words, mean_words = words_per_article(train_articles)
print(f"Min Words: {min_words}, Max Words: {max_words}, Mean Words: {mean_words}")

In [None]:
# plot distribution of words per article
words_per_article = [len(article.split(" ")) for article in train_articles]
plt.hist(words_per_article, bins=100)
plt.xlabel("Words per Article")
plt.ylabel("Articles per Length")
plt.show()

In [None]:
train_padded = seq_pad_and_trunc(train_articles, tokenizer, padding_type, trunc_type, max_length)
validation_padded = seq_pad_and_trunc(validation_articles, tokenizer, padding_type, trunc_type, max_length)
print(f"Train Padded: {train_padded.shape}, Validation Padded: {validation_padded.shape}")

In [None]:
train_answers = np.array(train_answers)
validation_answers = np.array(validation_answers)

In [None]:
# print some examples
for i in range(5):
    print(f"Article: {train_articles[i]}")
    print(f"Article Padded: {train_padded[i]}")
    print(f"Answer: {train_answers[i]}")
    print()

In [None]:
import keras.layers as L

def create_model(vocab_size, embedding_dim, max_length):
    input = L.Input(shape=(max_length,))
    x = L.Embedding(vocab_size, embedding_dim)(input)
    x = L.Flatten()(x)
    x = L.Dense(32, activation="relu")(x)
    x = L.Dropout(0.1)(x)
    x = L.Dense(3, activation="softmax")(x)
    model = tf.keras.Model(inputs=input, outputs=x)
    return model

model = create_model(vocab_size, 16, max_length)

# loss = tf.keras.losses.KLDivergence()
loss = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
metrics = ["accuracy"]
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

model.summary()

In [None]:
epochs = 30
batch_size = 1024
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=f"../models/{model_name}_checkpoint.h5",
    save_weights_only=False,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True
)
history = model.fit(train_padded, train_answers, epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(validation_padded, validation_answers),
                    callbacks=[checkpoint_callback])

In [None]:
def plot_history(history):
    plt.plot(history.history["loss"], label="loss")
    plt.plot(history.history["val_loss"], label="val_loss")
    plt.legend()
    plt.show()

    plt.plot(history.history["accuracy"], label="accuracy")
    plt.plot(history.history["val_accuracy"], label="val_accuracy")
    plt.legend()
    plt.show()

def plot_history_as_subplots(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    ax1.plot(history.history["loss"], label="loss")
    ax1.plot(history.history["val_loss"], label="val_loss")
    ax1.legend()
    ax2.plot(history.history["accuracy"], label="accuracy")
    ax2.plot(history.history["val_accuracy"], label="val_accuracy")
    ax2.legend()
    plt.show()

plot_history_as_subplots(history)

In [None]:
# load best model
model = tf.keras.models.load_model(f"../models/{model_name}_checkpoint.h5")

# evaluate on validation data
model.evaluate(validation_padded, validation_answers)

In [None]:
splited = {}
for i in range(len(validation_answers)):
    varticle = validation_articles[i]
    vanswer = validation_answers[i]

    length = len(varticle.split(" "))
    if length > max_length:
        continue
    if length not in splited:
        splited[length] = []

    splited[length].append((varticle, vanswer))

results = {}
# sort splited by length
splited = dict(sorted(splited.items(), key=lambda item: item[0]))
for key in splited:
    varticles = [row[0] for row in splited[key]]
    vanswers = [row[1] for row in splited[key]]
    varticles_padded = seq_pad_and_trunc(varticles, tokenizer, padding_type, trunc_type, max_length)
    vanswers = np.array(vanswers)
    # print(f"Length: {key}, Articles: {len(varticles)}, Answers: {len(vanswers)}")
    pred = model.evaluate(varticles_padded, vanswers, verbose=0)
    # get mean of loss and accuracy
    results[key] = (pred[0], pred[1])

# make df from results
results_df = pd.DataFrame(results.items(), columns=["Length", "Results"])
# use rolling mean to smooth the data
results_df["Loss"] = results_df["Results"].apply(lambda x: x[0]).rolling(10).mean()
results_df["Accuracy"] = results_df["Results"].apply(lambda x: x[1]).rolling(10).mean()
results_df = results_df.drop(columns=["Results"])

# plot results
plt.plot(results_df["Length"], results_df["Loss"], label="loss")
plt.plot(results_df["Length"], results_df["Accuracy"], label="accuracy")
plt.legend()
plt.show()

In [None]:
# save model
model.save("../models/like_model.h5")

# save tokenizer
with open("../models/like_tokenizer.pickle", "wb") as f:
    pickle.dump(tokenizer, f)

In [None]:
o_article = "Комітет з питань гуманітарної та інформаційної політики не підтримав відставку Ткаченка. Зазначають, що рішення прийматиме Верховна Рада."

article = clean_text(o_article)
article = replace_numbers(article)
article_padded = seq_pad_and_trunc([article], tokenizer, padding_type, trunc_type, max_length)
article_padded = np.array(article_padded)

predictions = model.predict(article_padded)

count = sum([1 for num in article_padded[0] if num == 1])
print(f"Article: {o_article}")
print(f"Unfamiliar Words: {count}")

plt.bar(df.columns[1:], predictions[0])
plt.gcf().set_size_inches(2, 2)
plt.xticks(rotation=45)
plt.show()

In [None]:
predictions = model.predict(article_padded)
print(f"Predictions: {predictions}")
print(f"columns: {df.columns[1:]}")

In [None]:

with open("../data/temp/data.pickle", "rb") as f:
    data = pickle.load(f)

# convert data to dataframe
test_df = pd.DataFrame(data, columns=['text', 'like', 'heart', 'smile', 'angry', 'sad', 'trigger', 'spread'])

# merge like and heart, angry and sad
for row in test_df.values:
    row[1] = float(row[1]) + float(row[2])
    row[4] = float(row[4]) + float(row[5])
test_df = test_df.drop(columns=["heart", "sad", "trigger", "spread"])
# rename like -> positive, angry -> negative, smile -> neutral
test_df = test_df.rename(columns={"like": "positive", "angry": "negative", "smile": "neutral"})
test_df = test_df.dropna()
test_df

In [None]:
# create validation data from test_df
test_articles = [row[0] for row in test_df.values]
test_answers = [[float(value) for value in row[1:]] for row in test_df.values]
print(f"Test: {len(test_articles)}")

In [None]:
for i in range(len(test_articles)):
    test_articles[i] = clean_text(test_articles[i])
    test_articles[i] = replace_numbers(test_articles[i])
    print(f"{i + 1}/{len(test_articles)}", end="\r")

test_padded = seq_pad_and_trunc(test_articles, tokenizer, padding_type, trunc_type, max_length)
print(f"Test Padded: {test_padded.shape}")

In [None]:
test_answers = np.array(test_answers)

# evaluate on test data
model.evaluate(test_padded, test_answers)

In [None]:
e = model.layers[1]
weights = e.get_weights()[0]
print(weights.shape)

In [None]:
reverse_word_index = tokenizer.index_word

In [None]:
import io

out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write("\t".join([str(x) for x in embeddings]) + "\n")

out_v.close()
out_m.close()