In [10]:
# Імпортуємо бібліотеки
import spacy
import pandas as pd
import re
from spacy.tokens import DocBin

In [11]:
# Зчитуємо датасет (завантажено з https://www.kaggle.com/datatattle/covid-19-nlp-text-classification)
trainDF = pd.read_csv("./Corona_NLP_train.csv", encoding='ISO-8859-1')
testDF = pd.read_csv("./Corona_NLP_test.csv")
trainDF.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [12]:
# Реорганізуємо дані: 
# зменшуємо датасет (для швидшого тренування)
# та робимо train-test-validation split

from sklearn.model_selection import train_test_split

allDF = pd.concat((trainDF, testDF), ignore_index=True)
allDF = allDF.sample(frac=0.5).reset_index(drop=True)

trainDF, testDF = train_test_split(allDF, test_size = 0.2)
testDF, validDF = train_test_split(testDF, test_size = 0.2)

print("Train:",len(trainDF), "Test:", len(testDF),"Valid:", len(validDF))

Train: 17982 Test: 3596 Valid: 900


In [13]:
# Препроцессинг: видаляємо посилання та кодуємо лейбли через one-hot encoding

def remove_url(text): 
    parsed_text = re.sub(r"\S*https?:\S*", "", text, flags=re.MULTILINE)
    return parsed_text

def preprocess(df, embed):
    df.OriginalTweet = df.OriginalTweet.apply(remove_url)
    data = tuple(zip(df.OriginalTweet.tolist(), df.Sentiment.tolist())) 
    
    # Перетворюємо потоки слів на вектори (embedding)
    nlp = spacy.load(embed)
    docs = []

    for doc, label in nlp.pipe(data, as_tuples=True):
        doc.cats['extremely_positive'] = 0
        doc.cats['extremely_negative'] = 0
        doc.cats['positive'] = 0
        doc.cats['negative'] = 0
        doc.cats['neutral']  = 0
        
        if label=='Extremely Positive':
            doc.cats['extremely_positive'] = 1
        elif label=='Positive':
            doc.cats['extremely_negative'] = 1
        elif label=='Neutral':
            doc.cats['neutral']  = 1
        elif label=='Negative':
            doc.cats['negative'] = 1
        else:
            doc.cats['extremely_negative'] = 1
        
        docs.append(doc)
        
    return df, docs


In [7]:
# конфігурація: Використовуємо дефолтний згенерований конфіг spaCy для категоризації тексту
# https://spacy.io/usage/training#quickstart
!python -m spacy init fill-config ./base_config.cfg ./config.cfg 

2022-12-02 09:10:30.168268: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
# препроцессимо датасет та зберігаємо в файл .spacy

train_data, train_docs = preprocess(trainDF, "en_core_web_sm")
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./textcat_train.spacy")

test_data, test_docs = preprocess(testDF, "en_core_web_sm")
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("./textcat_test.spacy")

In [13]:
# Тренуємо модель: під капотом використовується tok2vec + bag-of-words + roberta
!python -m spacy train ./config.cfg --verbose --output ./textcat_output --paths.train ./textcat_train.spacy --paths.dev ./textcat_test.spacy

2022-12-02 05:02:19.868313: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2022-12-02 05:02:21,974] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;2m✔ Created output directory: ./textcat_output[0m
[38;5;4mℹ Saving to output directory: ./textcat_output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-12-02 05:02:22,287] [INFO] Set up nlp object from config
[2022-12-02 05:02:22,299] [DEBUG] Loading corpus from path: ./textcat_test.spacy
[2022-12-02 05:02:22,300] [DEBUG] Loading corpus from path: ./textcat_train.spacy
[2022-12-02 05:02:22,300] [INFO] Pipeline: ['transformer', 'textcat']
[2022-12-02 05:02:22,303] [INFO] Created vocabulary
[2022-12-02 05:02:22,305] [INFO] Finished initializing nlp object
Downlo

In [15]:
# Валідуємо модель

valid_data, valid_docs = preprocess(validDF, "en_core_web_sm")
nlp_model = spacy.load("./textcat_output/model-best")
valid_text = valid_data.OriginalTweet.tolist()
valid_cats = valid_data.Sentiment.tolist()
doc_valid = nlp_model(valid_text[50])
print("Text: " + valid_text[50])
print("Original category: "+ valid_cats[50])
print("Predicted:") 
print(doc_valid.cats)

Text: I hate when I go to the store to see food and toilet paper gone off the shelves people need to stop panic buying because it really got out a hand @JoshuaRush #Covid_19 #CoronavirusPandemic #coronavirus
Original category: Extremely Negative
Predicted:
{'extremely_positive': 0.0007450793054886162, 'extremely_negative': 0.9142106771469116, 'positive': 7.710257705184631e-06, 'negative': 0.08471741527318954, 'neutral': 0.00031904916977509856}
