### Translating and Comparing News Headlines: step by step  

#### Comparing headlines across languages is cumbersome; doing an automated sentiment analysis on a sampling used to be very complex before the advent of ChatGPT

In [18]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [19]:
import openai
import os
import requests
import bs4

In [20]:
os.environ['API_KEY'] = "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"

In [21]:
openai.api_key = os.getenv("API_KEY")

In [22]:
news_sites = {
    "chinese" : ("https://cn.chinadaily.com.cn", "div.Home_content_Item_Text h1 a"),
    "arabic": ("https://aljazeera.net", "h3.article-card__title")
}
print(news_sites["arabic"])

('https://aljazeera.net', 'h3.article-card__title')


In [23]:
def fetch_headlines():
    headlines_by_language = {}
    for language, (url, tag) in news_sites.items():
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        headlines = [h.getText() for h in soup.select(tag)[:10]]
        headlines_by_language[language] = headlines
    return headlines_by_language
fetch_headlines()


{'chinese': ['世行：2024年新兴经济体应加大投资， 中国需要转变增长模式，以消费促增长',
  '新型高强轻质镁锂合金材料工程化应用成功',
  '中国航空学会成立六十周年纪念活动在廊坊临空区举行',
  '和评理｜忽视中美科技合作互利本质：美国“偏执恐华”心态该休矣',
  '国防部：中国军队坚决反制南海侵权挑衅',
  '国防部：中方协助查找美军二战期间在华失踪人员遗骸',
  '我国设立上海东方枢纽国际商务合作区',
  '商务部：继续推动外资24条各项举措加快落地见效',
  '一边是高溢价 一边是高报损——面包门店浪费现象调查',
  '中方坚决反对日本启动第四批核污染水排海'],
 'arabic': ['الحرب على غزة.. أكثر من 100 شهيد في “مجزرة الطحين” وفلسطيني ينفذ عملية بالضفة المحتلة',
  '112 شهيدا ومئات الجرحى بمجزرة الطحين بغزة وبايدن يتوقع تأثيرها على ...',
  '112 شهيدا ومئات الجرحى بمجزرة الطحين بغزة وبايدن يتوقع تأثيرها على ...',
  '"غدا سنخبركم مصيرهم".. القسام تعيد بث مشاهد لأسرى أعلنت فقدان الاتصال ...',
  'حزب الله يستهدف تجمعات للاحتلال وحديث أميركي عن عملية برية إسرائيلية ...',
  'مقتل مستوطنين إسرائيليين في إطلاق نار قرب مستوطنة عيلي جنوب نابلس',
  'البرهان يعرض على السيسي رؤيته لإنهاء الحرب في السودان',
  'غارات أميركية وبريطانية على اليمن والحوثي يتوعد بمفاجآت',
  'عمليات إنزال إغاثية من 5 دول عربية على غزة',
  'نقطة عمياء.. هل يصلح الفلاسفة ما أفسده "

In [24]:
def translate_headlines(headlines_by_language):
    translated_headlines = {}
    for language, headlines in headlines_by_language.items():
        translated_headlines[language] = []
        for headline in headlines:
            response = openai.Completion.create(
                model="gpt-3.5-turbo-instruct",
                prompt=f"Translate the following {language} headline into English: \"{headline}\"",
                temperature=0.3,
                max_tokens=60
            )
            translated_headline = response.choices[0].text.strip()
            translated_headlines[language].append(translated_headline)
    return translated_headlines
headlines_by_language = fetch_headlines()
translated_headlines = translate_headlines(headlines_by_language)

In [11]:
print(translated_headlines)

{'chinese': ['"Kenyan Overseas Community Celebrates Lantern Festival"', '"Available for Check Starting Tomorrow! Starting from the 26th, Preliminary Exam Results for Graduate Entrance Exams Will Be Released Across the Country"', '"Online Chinese Festival · Lantern Festival | Hello China Lantern Festival Highlights"', '"How to attract foreign investment in the new year? Implementation of a series of measures to stabilize foreign investment across the country"', '"Gazing at the Back of a Dynasty in the Yin Ruins"', '"Chinese Athletes Win Dota Championship at "Future Games" in Russia"', '"Republican Primaries: Trump Wins Again"', '"Paralyzed Patients Control Cursor through Brain-Computer Interface"', '"Fujian Coast Guard Organizes Fleet to Conduct Law Enforcement Patrols in Waters Near Kinmen"', '"Multiple regions fully focus on stabilizing employment and retaining workers"'], 'arabic': ['"War on Gaza... Ongoing Massacres and Disagreement over the Prisoner Exchange Deal"', '"UN Resolution

In [26]:
# from transformers import pipeline

# def analyze_sentiment(translated_headlines):
#     sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
#     sentiment_results = {}
    
#     for language, headlines in translated_headlines.items():
#         language_results = []
#         for headline in headlines:
#             sentiment = sentiment_pipeline(headline)
#             sentiment_result = {
#                 'text': headline,
#                 'label': sentiment[0]['label'],
#                 'score': sentiment[0]['score']
#             }
#             language_results.append(sentiment_result)
#         sentiment_results[language] = language_results
    
#     return sentiment_results

# Example usage:
# translated_headlines = translate_headlines(headlines_by_language)
# sentiment_analysis_results = analyze_sentiment(translated_headlines)


In [16]:
from transformers import pipeline

def analyze_sentiment_and_print(translated_headlines):
    sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
    
    for language, headlines in translated_headlines.items():
        print(f"Language: {language}")
        for headline in headlines:
            sentiment = sentiment_pipeline(headline)
            print(f"Headline: '{headline}'")
            print(f"Label: {sentiment[0]['label']}, Score: {sentiment[0]['score']:.2f}")
            print("---")  # Separator for readability

# Assuming translated_headlines is your dictionary of translated headlines
analyze_sentiment_and_print(translated_headlines)


Language: chinese
Headline: '"Kenyan Overseas Community Celebrates Lantern Festival"'
Label: neutral, Score: 0.62
---
Headline: '"Available for Check Starting Tomorrow! Starting from the 26th, Preliminary Exam Results for Graduate Entrance Exams Will Be Released Across the Country"'
Label: neutral, Score: 0.84
---
Headline: '"Online Chinese Festival · Lantern Festival | Hello China Lantern Festival Highlights"'
Label: neutral, Score: 0.60
---
Headline: '"How to attract foreign investment in the new year? Implementation of a series of measures to stabilize foreign investment across the country"'
Label: positive, Score: 0.50
---
Headline: '"Gazing at the Back of a Dynasty in the Yin Ruins"'
Label: negative, Score: 0.90
---
Headline: '"Chinese Athletes Win Dota Championship at "Future Games" in Russia"'
Label: positive, Score: 0.51
---
Headline: '"Republican Primaries: Trump Wins Again"'
Label: positive, Score: 0.48
---
Headline: '"Paralyzed Patients Control Cursor through Brain-Computer 

In [17]:
#turn each set of translated headlines into a dataframe that can be saved as CSV  
#consider AUTOMATING the scraping and translating every hour (GitHub Actions)
#visualize 

In [17]:
# What I'm going to work next: 
# Find free websites in German, French, Spanigh, Russian
# Create openai prompt to find tags
# Test the prompts 

In [18]:
# What I'm going to work next: 
# Find free websites in German, French, Spanish, Russian
# Translate all the main headlines 
# Do a sentiment analysis on the headlines using a LLM
# calculate the avg and median scores for each language
#COMPARE the total score for each language's website 

In [None]:
#FIND PYTHON library to visualize DIRECTLY on jupyter notebook