In [1]:
import sys
from pathlib import Path

import pandas as pd


sys.path.append(str(Path().resolve().parent))
from src.definitions import RAW_DATA_FOLDER

In [24]:
df = pd.read_parquet(RAW_DATA_FOLDER / "techniques-classification.parquet")

df.head(5)

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,–ù–æ–≤–∏–π –æ–≥–ª—è–¥ –º–∞–ø–∏ DeepState –≤—ñ–¥ —Ä–æ—Å—ñ–π—Å—å–∫–æ–≥–æ –≤—ñ–π...,uk,True,"[euphoria, loaded_language]",
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,–ù–µ–¥–∞–≤–Ω–æ 95 –∫–≤–∞—Ä—Ç–∞–ª –∂—ë—Å—Ç–∫–æ –ø–æ–≥–ª—É–º–∏–ª—Å—è –Ω–∞–¥ —Ä—É—Å—Å–∫...,ru,True,"[loaded_language, cherry_picking]",
2,e6a427f1-211f-405f-bd8b-70798458d656,ü§©\n–¢–∏–º —á–∞—Å–æ–º –π–¥–µ –µ–≤–∞–∫—É–∞—Ü—ñ—è –ë—î–ª–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ –∞–≤—Ç–æ...,uk,True,"[loaded_language, euphoria]",
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,–í –£–∫—Ä–∞—ó–Ω—ñ –Ω–∞–π–±–ª–∏–∂—á–∏–º —á–∞—Å–æ–º –º–∞—é—Ç—å –Ω–∞–º—ñ—Ä –ø–æ—Å–∏–ª–∏—Ç...,uk,False,,
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"–†–∞—Å—á—ë—Ç—ã 122-–º–º –°–ê–£ 2–°1 ""–ì–≤–æ–∑–¥–∏–∫–∞"" 132-–π –±—Ä–∏–≥–∞–¥...",ru,True,[loaded_language],


In [25]:
df_uk = df[df["lang"] == "uk"]
df_ru = df[df["lang"] == "ru"]


In [None]:
df_uk = df_uk[["id", "content"]].head(20)
df_uk

## MarianMT

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# –ó–∞–≤–∞–Ω—Ç–∞–∂—É—î–º–æ –º–æ–¥–µ–ª—å —Ç–∞ —Ç–æ–∫–µ–Ω—ñ–∑–∞—Ç–æ—Ä
model_name = "Helsinki-NLP/opus-mt-uk-ru"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# –§—É–Ω–∫—Ü—ñ—è –¥–ª—è –ø–µ—Ä–µ–∫–ª–∞–¥—É
def translate_text(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)


# –ü–µ—Ä–µ–∫–ª–∞–¥–∞—î–º–æ —Ç–µ–∫—Å—Ç–∏
df_uk["translated_content"] = df_uk["content"].apply(lambda x: translate_text(x, model, tokenizer))

df_uk.to_excel("df-uk-MarianMT.xlsx", index=False)


## Google API

In [30]:
from deep_translator import GoogleTranslator

# –Ü–Ω—ñ—Ü—ñ–∞–ª—ñ–∑–∞—Ü—ñ—è –ø–µ—Ä–µ–∫–ª–∞–¥–∞—á–∞
translator = GoogleTranslator(source="uk", target="ru")

# –§—É–Ω–∫—Ü—ñ—è –¥–ª—è –ø–µ—Ä–µ–∫–ª–∞–¥—É
def translate_text(text):
    return translator.translate(text)

df_uk["translated_content"] = df_uk["content"].apply(translate_text)

df_uk.to_excel("df-uk-Google.xlsx", index=False)

## NLLB-200 from META

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# –í–∏–∑–Ω–∞—á–∞—î–º–æ –ø—Ä–∏—Å—Ç—Ä—ñ–π
device = "cuda" if torch.cuda.is_available() else "cpu"

# –ó–∞–≤–∞–Ω—Ç–∞–∂—É—î–º–æ –º–æ–¥–µ–ª—å —Ç–∞ —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# –§—É–Ω–∫—Ü—ñ—è –¥–ª—è –ø–µ—Ä–µ–∫–ª–∞–¥—É –∑ —É–∫—Ä —Ä–æ—Å
def translate_text_nllb(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id["rus_Cyrl"])
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# –ü–µ—Ä–µ–∫–ª–∞–¥ —É–∫—Ä–∞—ó–Ω—Å—å–∫–æ–≥–æ —Ç–µ–∫—Å—Ç—É –Ω–∞ —Ä–æ—Å—ñ–π—Å—å–∫—É
df_uk["translated_content"] = df_uk["content"].apply(translate_text_nllb)

# –ó–±–µ—Ä–µ–∂–µ–Ω–Ω—è –≤ Excel
df_uk.to_excel("df-uk-NLLB-200.xlsx", index=False)



## Yandex Translate API

In [38]:
import requests

YANDEX_API_KEY = "y0__xDlzqj6BxjB3RMgkeXJqRJHVWVXWpDoscZgDfTZirB4c33k4Q"

def translate_text_yandex(text):
    url = "https://translate.api.cloud.yandex.net/translate/v2/translate"
    headers = {"Authorization": f"Api-Key {YANDEX_API_KEY}"}
    data = {
        "folder_id": "your_folder_id",
        "texts": [text],
        "sourceLanguageCode": "uk",
        "targetLanguageCode": "ru"
    }
    response = requests.post(url, json=data, headers=headers)
    return response.json()["translations"][0]["text"]

df_uk["translated_content"] = df_uk["content"].apply(translate_text_yandex)


KeyError: 'translations'

In [39]:
import requests
import json

# –í–∞—à OAuth —Ç–æ–∫–µ–Ω
oauth_token = "y0__xDlzqj6BxjB3RMgkeXJqRJHVWVXWpDoscZgDfTZirB4c33k4Q"

# URL –¥–ª—è –∑–∞–ø–∏—Ç—É
url = "https://iam.api.cloud.yandex.net/iam/v1/tokens"

# –î–∞–Ω—ñ –¥–ª—è –∑–∞–ø–∏—Ç—É
data = {
    "yandexPassportOauthToken": oauth_token
}

# –í–∏–∫–æ–Ω–∞–Ω–Ω—è POST-–∑–∞–ø–∏—Ç—É
response = requests.post(url, json=data)

# –ü–µ—Ä–µ–≤—ñ—Ä–∫–∞ —Å—Ç–∞—Ç—É—Å—É –≤—ñ–¥–ø–æ–≤—ñ–¥—ñ
if response.status_code == 200:
    print("–¢–æ–∫–µ–Ω –ø—Ä–∞–≤–∏–ª—å–Ω–∏–π!")
    # –ú–æ–∂–Ω–∞ –æ—Ç—Ä–∏–º–∞—Ç–∏ –¥–æ—Å—Ç—É–ø –¥–æ –¥–∞–Ω–∏—Ö –≤—ñ–¥–ø–æ–≤—ñ–¥—ñ
    token_info = response.json()
    print("–Ü–Ω—Ñ–æ—Ä–º–∞—Ü—ñ—è –ø—Ä–æ —Ç–æ–∫–µ–Ω:", token_info)
else:
    print("–¢–æ–∫–µ–Ω –Ω–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∏–π –∞–±–æ —Å—Ç–∞–ª–∞—Å—è –ø–æ–º–∏–ª–∫–∞.")
    print("–°—Ç–∞—Ç—É—Å –∫–æ–¥:", response.status_code)
    print("–í—ñ–¥–ø–æ–≤—ñ–¥—å:", response.text)


–¢–æ–∫–µ–Ω –Ω–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∏–π –∞–±–æ —Å—Ç–∞–ª–∞—Å—è –ø–æ–º–∏–ª–∫–∞.
–°—Ç–∞—Ç—É—Å –∫–æ–¥: 401
–í—ñ–¥–ø–æ–≤—ñ–¥—å: {
 "code": 16,
 "message": "User does not registered in Yandex.cloud",
 "details": [
  {
   "@type": "type.googleapis.com/google.rpc.RequestInfo",
   "requestId": "12fa304c-40c1-459e-9272-a8fd314888a2"
  }
 ]
}

