In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
!kaggle datasets download -d mrmorj/hate-speech-and-offensive-language-dataset

Downloading hate-speech-and-offensive-language-dataset.zip to /content
  0% 0.00/1.01M [00:00<?, ?B/s]
100% 1.01M/1.01M [00:00<00:00, 135MB/s]


In [None]:
! unzip hate-speech-and-offensive-language-dataset.zip

Archive:  hate-speech-and-offensive-language-dataset.zip
  inflating: labeled_data.csv        


In [None]:
!python -m spacy download ru_core_news_sm

2023-09-29 16:00:37.610458: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting ru-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.6.0/ru_core_news_sm-3.6.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-sm==3.6.0)
  Downloading pymorphy3-1.2.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-sm==3.6.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting

In [None]:
import spacy
import pandas as pd

from tqdm import tqdm
from spacy.tokens import DocBin

nlp = spacy.load("ru_core_news_sm")

In [None]:
df = pd.read_csv('/content/labeled_data.csv')
df["labels"] = df["class"].map({0: "Hate Speech",
                                    1: "Offensive Language",
                                    2: "No Hate and Offensive"})
print(df.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet                 labels  
0  !!! RT @mayasolovely: As a woman you shouldn't...  No Hate and Offensive  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...     Offensive Language  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...     Offensive Language  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...     Offensive Language  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...     Offensive Language  


In [None]:
df = df.sample(frac=0.5)

In [None]:
df = pd.concat([df['tweet'], df['class']], axis=1)
print(df)

                                                   tweet  class
0      !!! RT @mayasolovely: As a woman you shouldn't...      2
1      !!!!! RT @mleew17: boy dats cold...tyga dwn ba...      1
2      !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...      1
3      !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...      1
4      !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...      1
...                                                  ...    ...
24778  you's a muthaf***in lie &#8220;@LifeAsKing: @2...      1
24779  you've gone and broke the wrong heart baby, an...      2
24780  young buck wanna eat!!.. dat nigguh like I ain...      1
24781              youu got wild bitches tellin you lies      1
24782  ~~Ruffled | Ntac Eileen Dahlia - Beautiful col...      2

[24783 rows x 2 columns]


In [None]:
df['sentiment'].value_counts()

negative    15116
neautral    14978
positive    14906
Name: sentiment, dtype: int64

In [None]:
df = df[df['sentiment'] != 'neautral']

In [None]:
data = [tuple(df.iloc[i].values) for i in range(df.shape[0])]
print(data[0:5])

[("!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...", 2), ('!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!', 1), ('!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit', 1), ('!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny', 1), ('!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;', 1)]


In [None]:
df['sentiment'].value_counts()

negative    15116
positive    14906
Name: sentiment, dtype: int64

In [None]:
train_data = data[:20000]
valid_data = data[10000:]

In [None]:
def make_docs(data):
    """
    this will take a list of texts and labels
    and transform them in spacy documents
    data: list(tuple(text, label))
    returns: List(spacy.Doc.doc)
    """
    docs = []
    # nlp.pipe([texts]) is way faster than running
    # nlp(text) for each text
    # as_tuples allows us to pass in a tuple,
    # the first one is treated as text
    # the second one will get returned as it is.
    # a = tqdm(nlp.pipe(data, as_tuples=True), total = len(data))
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        doc.cats = {'neutral':0, 'hate':0, 'offence':0}
        if label == 2:
            doc.cats["neutral"] = 1
        elif label == 1:
            doc.cats["offence"] = 1
        elif label == 0:
            doc.cats['hate'] = 1
        # we need to set the (text)cat(egory) for each document
        #doc.cats["positive"] = label
        # put them into a nice list
        docs.append(doc)
    return docs

In [None]:
# we are so far only interested in the first 5000 reviews
# this will keep the training time short.
# In practice take as much data as you can get.
# you can always reduce it to make the script even faster.
num_texts = 9000
# first we need to transform all the training data
train_docs = make_docs(train_data[:num_texts])

100%|██████████| 9000/9000 [00:02<00:00, 3605.50it/s]


In [None]:

# then we save it in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("train.spacy")
# repeat for validation data
valid_docs = make_docs(valid_data[:num_texts])
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("valid.spacy")

100%|██████████| 9000/9000 [00:01<00:00, 4809.90it/s]


In [None]:
! touch base_config.cfg

на этом месте мы идем в https://spacy.io/usage/training#quickstart, там настраиваем под себя конфиг (textcat), копируем его руками(!) в base_config.cfg, указываем правильные пути до трейн и вэлид


In [None]:
! pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.0-py3-none-any.whl size=12019102 sha256=ed83456a3b4dd7bf56f0dc2081f9ff5697c0d828c6847aed25238e3d226c50bb
  Stored in directory: /root/.cache/pip/wheels/f9/7e/12/0c885b1d01a93f5cfff2e269634078c488729f52129c8f7bde
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3

In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./valid.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.22       30.45    0.30
  0     200         29.37       30.57    0.31
  0     400         21.89       39.71    0.40
  0     600         19.99       48.31    0.48
  0     800         17.49       55.17    0.55
  1    1000         15.81       57.35    0.57
  1    1200         11.81       58.90    0.59
  1    1400         11.26       60.97    0.61
  2    1600          8.70       62.63    0.63
  3    1800          7.47       64.06    0.64
  4    2000          6.50       65.42    0.65
  5    2200          5.21       66.40    0.66
  6    2400          4.33       66.23    0.66
  7    2600          3.67       66.40    0.66
  9    2800          2.99       66.89    0.67
 10    3000        

In [None]:
import spacy
# load thebest model from training
nlp = spacy.load("output/model-best")
text = ""
print("type : ‘quit’ to exit")
# predict the sentiment until someone writes quit
while text != "quit":
    text = input("Please enter example input: ")
    doc = nlp(text)
    print(doc.cats)


type : ‘quit’ to exit
Please enter example input: I LOVE YOU
{'neutral': 0.24863788485527039, 'hate': 0.2441014051437378, 'offence': 0.5072607398033142}
Please enter example input: it is rainy today
{'neutral': 0.41591861844062805, 'hate': 0.2257653772830963, 'offence': 0.358316034078598}
Please enter example input: i love niggers
{'neutral': 0.12157140672206879, 'hate': 0.4343661963939667, 'offence': 0.4440624415874481}
Please enter example input: i hate niggers
{'neutral': 0.08285313099622726, 'hate': 0.6234526634216309, 'offence': 0.29369422793388367}
Please enter example input: i hate racism
{'neutral': 0.3949951231479645, 'hate': 0.28886133432388306, 'offence': 0.31614360213279724}
Please enter example input: quit
{'neutral': 0.31968066096305847, 'hate': 0.20410221815109253, 'offence': 0.4762170910835266}
