In [42]:
import pandas as pd
import numpy as np
import re
import string
import nltk

from twython import Twython

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score

from IPython.display import clear_output

import plotly.express as px

import matplotlib.pyplot as plt

from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vladimirkulichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vladimirkulichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data

In [3]:
df = pd.read_csv('tweets_labeled_final.csv')

In [5]:
df = df[df.columns[2:]]

In [6]:
df

Unnamed: 0,id,id_str,created_at,full_text,name,screen_name,location,followers_count,geo,place,retweet_count,favorite_count,operation_attitude
0,1519654976116727808,1519654976116727808,Thu Apr 28 12:29:14 +0000 2022,RT @holodmedia: Российские войска вывезли из м...,badkid 💙💛,sergekills,born in Ukraine,671,,NoData,21,0,
1,1519655537973071873,1519655537973071873,Thu Apr 28 12:31:28 +0000 2022,ДНР.\n\n⚡️ Число погибших в результате обстрел...,Алексей Толстый,AlexeyNemo,ЛНР. Антрацит,4218,,NoData,27,14,-1.0
2,1519657096010162177,1519657096010162177,Thu Apr 28 12:37:40 +0000 2022,"RT @gABmQNyYKwFpuBX: А этот сраный ""карманный""...",Соня Травникова Z,XXX68XXX,Луганск ЛНР,892,,NoData,43,0,
3,1519657291502522374,1519657291502522374,Thu Apr 28 12:38:26 +0000 2022,@GazetaRu Орки ДНР бомбят Донецк. Ничего нового.,𝐶𝑂𝐷𝐸 𝐷𝐴 𝑊𝐼𝑁𝐶𝐼,VinciGatto,Фашистская Россия,1064,,NoData,0,0,
4,1519657390823641093,1519657390823641093,Thu Apr 28 12:38:50 +0000 2022,@taraban_alex @GazetaRu Профессия врача не поз...,Юрий ВОЛКОВ,YuV1960,,10,,NoData,0,0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14205,1523090259894013952,1523090259894013952,Sat May 07 23:59:50 +0000 2022,"@taygainfo Это спецоперация такая, всё идёт по...",Татьяна Пастернак,PasternakTat,,4,,NoData,0,0,-1.0
14206,1523090266827096064,1523090266827096064,Sat May 07 23:59:51 +0000 2022,"RT @KeepPeaceNoWar: ""Россия своих не бросает""....",РВК ПНХ!,Svitlana861,,51,,NoData,31,0,
14207,1523090268135645185,1523090268135645185,Sat May 07 23:59:52 +0000 2022,RT @SergejTalk: А почему Грета Тунберг не звон...,Ghavana,ghavana1,,140,,NoData,502,0,
14208,1523090280827785220,1523090280827785220,Sat May 07 23:59:55 +0000 2022,RT @_sashayasha: «Оккупация Украины не входит ...,Адвокат Рaиса Литвиненко,litvinenko33,"Химки, Россия",968,,NoData,312,0,


# Preparing tweets

## We clean from unnecessary information

In [7]:
df = df[df['full_text'].notna()]

Remove all mentions of users (@***)

In [8]:
df['text_clean'] = [re.sub(r'@\w+', r'', x) for x in df['full_text']]

Remove line breaks

In [9]:
df['text_clean'] = [x.replace('\n', '') for x in df['text_clean']]

Converting everything to lowercase

In [10]:
df['text_clean'] = [x.lower() for x in df['text_clean']]

Remove mentions of rt

In [11]:
df['text_clean'] = [re.sub(r'rt :', r'', x) for x in df['text_clean']]
df['text_clean'] = [re.sub(r'rt', r'', x) for x in df['text_clean']]

Remove all links

In [12]:
df['text_clean'] = [re.sub(r'http\S+', r'', x) for x in df['text_clean']]

Remove emoji

In [13]:
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

In [14]:
df['text_clean'] = [emoji_pattern.sub(r'', x) for x in df['text_clean']]

In [15]:
df['text_clean'] = [re.sub(r'❗️', r'', x) for x in df['text_clean']]
df['text_clean'] = [re.sub(r'⚔️', r'', x) for x in df['text_clean']]
df['text_clean'] = [re.sub(r'⚡', r'', x) for x in df['text_clean']]

Remove extra spaces

In [16]:
df['text_clean'] = [x.strip() for x in df['text_clean']]

Remove partial punctuation

In [17]:
df['text_clean'] = [re.sub(r'"', r'', x) for x in df['text_clean']]
df['text_clean'] = [re.sub(r'«', r'', x) for x in df['text_clean']]
df['text_clean'] = [re.sub(r'»', r'', x) for x in df['text_clean']]
df['text_clean'] = [re.sub(r'!', r'', x) for x in df['text_clean']]
#df['text_clean'] = [re.sub(r'...', r'', x) for x in df['text_clean']]

Remove the numbers

In [18]:
df['text_clean'] = [re.sub('\d+', '', x) for x in df['text_clean']]

## Tokenization

In [19]:
russian_stop_words = stopwords.words("russian")
snowball = SnowballStemmer(language='russian')


def tokenize_text(x):
    tokens = word_tokenize(x, language='russian')
    tokens_no_punkt = [i for i in tokens if i not in string.punctuation]
    tokens_no_stopwords = [i for i in tokens_no_punkt if i not in russian_stop_words]
    stemmed_tokens = [snowball.stem(i) for i in tokens_no_stopwords]
    return stemmed_tokens

In [20]:
df['text_tokenized'] = [tokenize_text(x) for x in df['text_clean']]

# Attitude prediction model in a special military operation

In [32]:
def print_metrics(y_test, y_pred):
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'ROC-AUC: {roc_auc_score(y_test, y_pred)}')

In [33]:
X = df.loc[df['operation_attitude'].isin([1,-1]), 'text_clean']
y = df.loc[df['operation_attitude'].isin([1,-1]), 'operation_attitude']
y = y.map({-1:0,1:1})

Let's take different partitions several times and look at the metrics on each partition

In [46]:
accuracy_list = []
precision_list = []
recall_list = []
rocauc_list = []

for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    shuffle=True,
                                                    test_size=0.2,
                                                    stratify=y)
    
    print(f'Iteration {i}')
    print('-------------------')
    
    pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer = lambda x: tokenize_text(x))),
    ("model", GradientBoostingClassifier(n_estimators=5000, learning_rate=0.05))
    ])
    
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    
    accuracy_list.append(accuracy_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred))
    recall_list.append(recall_score(y_test, y_pred))
    rocauc_list.append(roc_auc_score(y_test, y_pred))
    
    print_metrics(y_test, y_pred)
    print('')
    
print('')
print(f'Mean accuracy: {round(np.mean(accuracy_list), 2)}')
print(f'Mean precision: {round(np.mean(precision_list), 2)}')
print(f'Mean recall: {round(np.mean(recall_list), 2)}')
print(f'Mean ROC-AUC: {round(np.mean(rocauc_list), 2)}')

Iteration 0
-------------------
Accuracy: 0.608433734939759
Precision: 0.3953488372093023
Recall: 0.30357142857142855
ROC-AUC: 0.5336038961038961

Iteration 1
-------------------
Accuracy: 0.5783132530120482
Precision: 0.36538461538461536
Recall: 0.3392857142857143
ROC-AUC: 0.5196428571428572

Iteration 2
-------------------
Accuracy: 0.6204819277108434
Precision: 0.41025641025641024
Recall: 0.2857142857142857
ROC-AUC: 0.5383116883116882

Iteration 3
-------------------
Accuracy: 0.6024096385542169
Precision: 0.375
Recall: 0.26785714285714285
ROC-AUC: 0.5202922077922078

Iteration 4
-------------------
Accuracy: 0.5843373493975904
Precision: 0.3142857142857143
Recall: 0.19642857142857142
ROC-AUC: 0.4891233766233766


Mean accuracy: 0.6
Mean precision: 0.37
Mean recall: 0.28
Mean ROC-AUC: 0.52


In [41]:
accuracy_list = []
precision_list = []
recall_list = []
rocauc_list = []

for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    shuffle=True,
                                                    test_size=0.2,
                                                    stratify=y)
    
    print(f'Iteration {i}')
    print('-------------------')
    
    pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer = lambda x: tokenize_text(x))),
    ("model", RandomForestClassifier(n_estimators=5000, 
                                     n_jobs=-1))
    ])
    
    pipeline.fit(X_train, y_train)
    
    y_pred_proba = pipeline.predict_proba(X_test)
    y_pred = pipeline.predict(X_test)
    
    accuracy_list.append(accuracy_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred))
    recall_list.append(recall_score(y_test, y_pred))
    rocauc_list.append(roc_auc_score(y_test, y_pred))
    
    print_metrics(y_test, y_pred)
    print('')
    
print('')
print(f'Mean accuracy: {round(np.mean(accuracy_list), 2)}')
print(f'Mean precision: {round(np.mean(precision_list), 2)}')
print(f'Mean recall: {round(np.mean(recall_list), 2)}')
print(f'Mean ROC-AUC: {round(np.mean(rocauc_list), 2)}')

Iteration 0
-------------------
Accuracy: 0.6566265060240963
Precision: 0.47368421052631576
Recall: 0.16071428571428573
ROC-AUC: 0.5349025974025974

Iteration 1
-------------------
Accuracy: 0.6686746987951807
Precision: 0.5333333333333333
Recall: 0.14285714285714285
ROC-AUC: 0.5396103896103895

Iteration 2
-------------------
Accuracy: 0.6686746987951807
Precision: 0.5454545454545454
Recall: 0.10714285714285714
ROC-AUC: 0.5308441558441559

Iteration 3
-------------------
Accuracy: 0.6144578313253012
Precision: 0.2777777777777778
Recall: 0.08928571428571429
ROC-AUC: 0.485551948051948

Iteration 4
-------------------
Accuracy: 0.6686746987951807
Precision: 0.5454545454545454
Recall: 0.10714285714285714
ROC-AUC: 0.5308441558441559


Mean accuracy: 0.66
Mean precision: 0.48
Mean recall: 0.12
Mean ROC-AUC: 0.52


Train the model on all data and apply to our table

In [47]:
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer = lambda x: tokenize_text(x))),
    ("model", RandomForestClassifier(n_estimators=5000, 
                                     n_jobs=-1))
    ])

In [97]:
X = df.loc[df['operation_attitude'].isin([1,-1]), 'text_clean']
y = df.loc[df['operation_attitude'].isin([1,-1]), 'operation_attitude']
y = y.map({-1:0,1:1})

In [98]:
pipeline.fit(X, y)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x7f82f4e70940>)),
                ('model', GradientBoostingClassifier(n_estimators=5000))])

Predicting values based on probability

In [99]:
y_pred_proba = pipeline.predict_proba(df['text_clean'])

In [100]:
y_pred = pipeline.predict(df['text_clean'])

In [101]:
pd.Series(y_pred).value_counts()

0    10413
1     3797
dtype: int64

In [104]:
df['opinion_prediction'] = np.NaN
df.loc[y_pred_proba[:, 1] > 0.5, 'opinion_prediction'] = 1
df.loc[y_pred_proba[:, 0] > 0.8, 'opinion_prediction'] = 0
df['opinion_prediction'] = df['opinion_prediction'].fillna(999)

In [105]:
df['opinion_prediction'].value_counts(dropna=False)

0.0      8985
1.0      3797
999.0    1428
Name: opinion_prediction, dtype: int64

In [112]:
df.to_csv('tweets_ml_predicted.csv')