In [1]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
import os
import re
import gc
from string import punctuation

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from gensim.models.word2vec import Word2Vec

!pip install -U spacy
!python -m spacy download en_core_web_sm
!python -m spacy download ru_core_news_sm

import spacy
spacy.prefer_gpu()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 640.0 kB/s eta 0:00:20
     --------------------------------------- 0.1/12.8 MB 812.7 kB/s eta 0:00:16
      --------------------------------------- 0.2/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.4/12.8 MB 3.1 MB/s eta 0:00:05
     -- ------------------------------------- 0.8/12.8 MB 4.8 MB/s eta 0:00:03
     --- ------------------------------------ 1.3/12.8 MB 5.7 MB/s eta 0:00:03
     --- ------------------------------------ 1.3/12.8 MB 5.7 MB/s eta 0:00:03
     ----- ---------------------------------- 1.6/12.8 MB 5.8 MB/s eta 0:00:02
     ------- -------------------------------- 2.5/12.8 MB 8.0 MB/s eta 0:00:02
     --------- ------------------------

Collecting ru-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.6.0/ru_core_news_sm-3.6.0-py3-none-any.whl (15.3 MB)
     ---------------------------------------- 0.0/15.3 MB ? eta -:--:--
     --------------------------------------- 0.0/15.3 MB 330.3 kB/s eta 0:00:47
     --------------------------------------- 0.0/15.3 MB 487.6 kB/s eta 0:00:32
     ---------------------------------------- 0.1/15.3 MB 1.0 MB/s eta 0:00:15
      --------------------------------------- 0.4/15.3 MB 2.5 MB/s eta 0:00:07
     - -------------------------------------- 0.6/15.3 MB 3.3 MB/s eta 0:00:05
     -- ------------------------------------- 0.8/15.3 MB 3.6 MB/s eta 0:00:04
     -- ------------------------------------- 1.1/15.3 MB 4.2 MB/s eta 0:00:04
     --- ------------------------------------ 1.3/15.3 MB 4.7 MB/s eta 0:00:03
     ---- ----------------------------------- 1.6/15.3 MB 4.9 MB/s eta 0:00:03
     ----- -------------------------

False

## Получение данных 

In [2]:
url = 'https://raw.githubusercontent.com/netology-ds-team/nlp-homeworks/main/7_Classification_in_AOT/Materials/Constraint_Train.csv'
response = requests.get(url)

if response.status_code == 200:
    with open("Constraint_Train.csv", "wb") as file:
        file.write(response.content)
    print("Файл успешно скачан.")
else:
    print("Не удалось скачать файл. Статус код:", response.status_code)

Файл успешно скачан.


In [3]:
df = pd.read_csv('Constraint_Train.csv')
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [4]:
df.tweet[1]

'States reported 1121 deaths a small rise from last Tuesday. Southern states reported 640 of those deaths. https://t.co/YASGRTT4ux'

In [5]:
sentences = [word_tokenize(text.lower()) for text in tqdm(df.tweet)]

100%|████████████████████████████████████████████████████████████████████████████| 6420/6420 [00:01<00:00, 5536.74it/s]


In [6]:
sentences[1]

['states',
 'reported',
 '1121',
 'deaths',
 'a',
 'small',
 'rise',
 'from',
 'last',
 'tuesday',
 '.',
 'southern',
 'states',
 'reported',
 '640',
 'of',
 'those',
 'deaths',
 '.',
 'https',
 ':',
 '//t.co/yasgrtt4ux']

### 1. gensim.models.Word2Vec 

In [7]:
%time model_tweets = Word2Vec(sentences, workers=6, vector_size=300, min_count=3, window=5, epochs=15)
model_tweets

CPU times: total: 5.98 s
Wall time: 1.46 s


<gensim.models.word2vec.Word2Vec at 0x185297ead10>

In [8]:
model_tweets.wv.vectors.shape

(5266, 300)

In [9]:
def get_text_embedding(text):
    result = []
    for word in word_tokenize(text.lower()):
        if word in model_tweets.wv:
            result.append(model_tweets.wv[word])

    if len(result):
        result = np.sum(result, axis=0)
    else:
        result = np.zeros(300)
    return result

In [10]:
features = [get_text_embedding(text) for text in tqdm(df.tweet)]
features[0].shape

100%|████████████████████████████████████████████████████████████████████████████| 6420/6420 [00:01<00:00, 3962.32it/s]


(300,)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(features, df.label, test_size=0.25)

In [12]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

In [13]:
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.93      0.91      0.92       754
        real       0.92      0.94      0.93       851

    accuracy                           0.92      1605
   macro avg       0.92      0.92      0.92      1605
weighted avg       0.92      0.92      0.92      1605



### 2. sklearn.feature_extraction.text.CountVectorizer 

In [14]:
vec = CountVectorizer()
bow = vec.fit_transform(df.tweet)

In [15]:
print(f'{len(vec.vocabulary_)=}')
X_train, X_test, y_train, y_test = train_test_split(bow, df.label, test_size=0.25)
model = LogisticRegression()
model.fit(X_train, y_train)

len(vec.vocabulary_)=18385


In [16]:
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.94      0.94      0.94       794
        real       0.94      0.94      0.94       811

    accuracy                           0.94      1605
   macro avg       0.94      0.94      0.94      1605
weighted avg       0.94      0.94      0.94      1605



### 3. sklearn.feature_extraction.text.CountVectorizer + regex 

In [17]:
pattern = r'https://\S+'
re_sentences = [word_tokenize(re.sub(pattern, '', text.lower())) for text in tqdm(df.tweet)]
print(sentences[1])
print(re_sentences[1])

100%|████████████████████████████████████████████████████████████████████████████| 6420/6420 [00:01<00:00, 6203.75it/s]

['states', 'reported', '1121', 'deaths', 'a', 'small', 'rise', 'from', 'last', 'tuesday', '.', 'southern', 'states', 'reported', '640', 'of', 'those', 'deaths', '.', 'https', ':', '//t.co/yasgrtt4ux']
['states', 'reported', '1121', 'deaths', 'a', 'small', 'rise', 'from', 'last', 'tuesday', '.', 'southern', 'states', 'reported', '640', 'of', 'those', 'deaths', '.']





In [18]:
vec = CountVectorizer()
bow = vec.fit_transform([' '.join(x) for x in re_sentences])

In [19]:
print(f'{len(vec.vocabulary_)=}')
X_train, X_test, y_train, y_test = train_test_split(bow, df.label, test_size=0.25)
model = LogisticRegression()
model.fit(X_train, y_train)

len(vec.vocabulary_)=14315


In [20]:
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.91      0.94      0.92       758
        real       0.94      0.92      0.93       847

    accuracy                           0.93      1605
   macro avg       0.93      0.93      0.93      1605
weighted avg       0.93      0.93      0.93      1605



### 4. sklearn.feature_extraction.text.CountVectorizer + regex + spacy

In [21]:
filtered_re_sentences = []
for lst in re_sentences:
    lst = list(filter(lambda x: x not in punctuation, lst))
    filtered_re_sentences.append(lst)
    
filtered_re_sentences[0]

['the',
 'cdc',
 'currently',
 'reports',
 '99031',
 'deaths',
 'in',
 'general',
 'the',
 'discrepancies',
 'in',
 'death',
 'counts',
 'between',
 'different',
 'sources',
 'are',
 'small',
 'and',
 'explicable',
 'the',
 'death',
 'toll',
 'stands',
 'at',
 'roughly',
 '100000',
 'people',
 'today']

In [22]:
nlp = spacy.load("en_core_web_sm")

In [23]:
if not(os.path.isfile('fakenews.txt')):
    lemmatized_sentences = []
    for s in tqdm(filtered_re_sentences):
        new_sentence = [token[0].lemma_ for token in map(nlp, s) if token[0].is_stop == False]
        lemmatized_sentences.append(new_sentence)
        
    with open("fakenews.txt", "w", encoding='utf-8') as file:
        for sublist in lemmatized_sentences:
            string = " ".join(sublist) + "\n"
            file.write(string)

In [24]:
fakenews_tweets = []
with open("fakenews.txt", "r", encoding='utf-8') as file:
    for line in file:
        fakenews_tweets.append(line.strip().split())
        
fakenews_tweets[0]

['cdc',
 'currently',
 'report',
 '99031',
 'death',
 'general',
 'discrepancy',
 'death',
 'count',
 'different',
 'source',
 'small',
 'explicable',
 'death',
 'toll',
 'stand',
 'roughly',
 '100000',
 'people',
 'today']

In [25]:
vec = CountVectorizer()
bow = vec.fit_transform(list(map(lambda x: ' '.join(x), fakenews_tweets)))

In [26]:
print(f'{len(vec.vocabulary_)=}')
X_train, X_test, y_train, y_test = train_test_split(bow, df.label, test_size=0.25)
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

len(vec.vocabulary_)=11460


In [27]:
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.92      0.92      0.92       750
        real       0.93      0.93      0.93       855

    accuracy                           0.92      1605
   macro avg       0.92      0.92      0.92      1605
weighted avg       0.92      0.92      0.92      1605



### 5. RNN

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [29]:
labels = (df.label == 'real').astype(int).to_list()
labels[0:5]

[1, 1, 0, 1, 1]

In [30]:
minlen = len(min(fakenews_tweets, key=len))
avglen = np.mean(list(map(len, fakenews_tweets)))
maxlen = len(max(fakenews_tweets, key=len))
print(minlen)
print(avglen)
print(maxlen)

max_len = 30

1
15.833489096573208
840


In [31]:
emb_size = 300
total_examples = len(labels)
seq_len = 30

In [32]:
features = []
for text in tqdm(fakenews_tweets):
    embeddings = []
    
    for i in range(max_len):
        if i < len(text):
            word = text[i]
            if word in model_tweets.wv.key_to_index:
                embeddings.append(model_tweets.wv[word])
            else:
                embeddings.append(np.zeros(300, dtype=np.float32))
        else:
            embeddings.append(np.zeros(300, dtype=np.float32))
    
    features.append(embeddings)

features[0][0].shape

100%|███████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 30666.30it/s]


(300,)

In [33]:
class RNN_net(nn.Module):

    def __init__(self):
        super(RNN_net, self).__init__()
        self.rnn = nn.RNN(300, 100, batch_first=True)
        self.linear = nn.Linear(100, 1)

    def forward(self, x):
        
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        
        out, hidden = self.rnn(x, hidden)
        output = torch.sigmoid(self.linear(hidden)).squeeze()
        return output
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, 100).to(device)
        return hidden

In [34]:
XXX = torch.tensor(np.array(features[0:4]))
print('embedding batch:', XXX.shape)
rnn = nn.RNN(300, 100, batch_first=True)
_, hidden = rnn(XXX)
print('batch of hidden:', hidden.shape)
linear = nn.Linear(100, 1)
lin = torch.sigmoid(linear(hidden))
print('linear output:', lin.squeeze())

embedding batch: torch.Size([4, 30, 300])
batch of hidden: torch.Size([1, 4, 100])
linear output: tensor([0.4915, 0.4916, 0.4916, 0.4917], grad_fn=<SqueezeBackward0>)


In [35]:
model_RNN = RNN_net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_RNN.parameters(), lr=0.001)
model_RNN.to(device)

RNN_net(
  (rnn): RNN(300, 100, batch_first=True)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)

In [36]:
X_train, X_test, y_train, y_test = map(np.array, train_test_split(features, labels, test_size=0.25))

In [37]:
def train_one_epoch(model, in_data, targets, batch_size=4):
    for i in tqdm(range(0, len(X_train), batch_size)):
        batch_x = torch.tensor(X_train[i:i + batch_size]).to(device)
        batch_y = torch.tensor(y_train[i:i + batch_size]).float().to(device)
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()
        gc.collect()
    print(loss)

In [38]:
for i in range(5):
  train_one_epoch(model_RNN, X_train, y_train)

100%|██████████████████████████████████████████████████████████████████████████████| 1204/1204 [02:06<00:00,  9.49it/s]


tensor(3.5843, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████████████████████████████████████████████████████████████████████████| 1204/1204 [02:09<00:00,  9.30it/s]


tensor(3.5852, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████████████████████████████████████████████████████████████████████████| 1204/1204 [02:10<00:00,  9.20it/s]


tensor(3.6447, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████████████████████████████████████████████████████████████████████████| 1204/1204 [02:09<00:00,  9.33it/s]


tensor(3.5824, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████████████████████████████████████████████████████████████████████████| 1204/1204 [02:07<00:00,  9.47it/s]

tensor(3.6536, device='cuda:0', grad_fn=<DivBackward1>)





In [39]:
with torch.no_grad():
    output = model_RNN(torch.tensor(X_test).to(device))

output

tensor([4.8577e-04, 4.8577e-04, 4.8577e-04,  ..., 4.8577e-04, 9.9950e-01,
        9.9950e-01], device='cuda:0')

In [40]:
torch.tensor(y_test)

tensor([0, 1, 1,  ..., 0, 1, 1], dtype=torch.int32)

In [41]:
result = (output.cpu() > 0.5) == torch.tensor(y_test)
result

tensor([ True, False, False,  ...,  True,  True,  True])

In [42]:
result.sum().item() / len(result)

0.7514018691588785

### 6. LSTM 

In [43]:
class LSTM_net(nn.Module):

    def __init__(self):
        super(LSTM_net, self).__init__()
        self.lstm = nn.LSTM(300, 100, batch_first=True)
        self.linear = nn.Linear(100, 1)

    def forward(self, x):
        
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        
        out, hidden = self.lstm(x, hidden)
        output = torch.sigmoid(self.linear(hidden[1])).squeeze()
        return output
    
    def init_hidden(self, batch_size):
        hidden = (torch.zeros(1, batch_size, 100).to(device), 
                  torch.zeros(1, batch_size, 100).to(device))
        return hidden

In [44]:
model_LSTM = LSTM_net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_LSTM.parameters(), lr=0.001)
model_LSTM.to(device)

LSTM_net(
  (lstm): LSTM(300, 100, batch_first=True)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)

In [45]:
X_train, X_test, y_train, y_test = map(np.array, train_test_split(features, labels, test_size=0.25))

In [46]:
for i in range(5):
  train_one_epoch(model_LSTM, X_train, y_train)

100%|██████████████████████████████████████████████████████████████████████████████| 1204/1204 [02:07<00:00,  9.41it/s]


tensor(1.8997, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████████████████████████████████████████████████████████████████████████| 1204/1204 [02:11<00:00,  9.18it/s]


tensor(2.1017, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████████████████████████████████████████████████████████████████████████| 1204/1204 [02:11<00:00,  9.15it/s]


tensor(1.7957, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████████████████████████████████████████████████████████████████████████| 1204/1204 [02:09<00:00,  9.33it/s]


tensor(1.7925, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████████████████████████████████████████████████████████████████████████| 1204/1204 [02:09<00:00,  9.28it/s]

tensor(1.8253, device='cuda:0', grad_fn=<DivBackward1>)





In [47]:
with torch.no_grad():
    output = model_LSTM(torch.tensor(X_test).to(device))

output

tensor([1.2208e-03, 1.0000e+00, 9.9982e-01,  ..., 2.0860e-05, 9.9999e-01,
        6.7255e-01], device='cuda:0')

In [48]:
torch.tensor(y_test)

tensor([1, 1, 1,  ..., 1, 1, 0], dtype=torch.int32)

In [49]:
result = (output.cpu() > 0.5) == torch.tensor(y_test)
result

tensor([False,  True,  True,  ..., False,  True, False])

In [50]:
result.sum().item() / len(result)

0.881619937694704