### Translating and Comparing News Headlines: step by step  

#### Comparing headlines across languages is cumbersome; doing an automated sentiment analysis on a sampling used to be very complex before the advent of ChatGPT

In [1]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [2]:
import openai
import os
import requests
import bs4

In [3]:
os.environ['API_KEY'] = "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"

In [4]:
openai.api_key = os.getenv("API_KEY")

In [5]:
news_sites = {
    "chinese" : ("https://cn.chinadaily.com.cn", "div.Home_content_Item_Text h1 a"),
    "arabic": ("https://aljazeera.net", "h3.article-card__title")
}
print(news_sites["arabic"])

('https://aljazeera.net', 'h3.article-card__title')


In [6]:
def fetch_headlines():
    headlines_by_language = {}
    for language, (url, tag) in news_sites.items():
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        headlines = [h.getText() for h in soup.select(tag)[:10]]
        headlines_by_language[language] = headlines
    return headlines_by_language
fetch_headlines()


{'chinese': ['欧洲专家：新年新目标，让“美丽中国”近在眼前',
  '驻华使节点赞《只此青绿》：仿佛置身画中',
  '架好听民声汇民智的“连心桥”——全国人大常委会一年来代表工作综述',
  '中国生态文明理念蕴含大智慧——访美国国家人文科学院院士小约翰·柯布',
  '近50团“登陆”观光  台旅游业者盼当局解除禁令',
  '“一国两制”实践开启新篇章——横琴封关运作侧记',
  '俄罗斯外长：西方国家已经向乌克兰派遣现役军官',
  '三个“非常完美”、1个首次、多个成果！点赞，神舟十七号乘组',
  '海南自贸港“零关税”政策下享惠货值超200亿元',
  '十四届全国人大二次会议将于3月4日12时举行新闻发布会'],
 'arabic': ['الحرب على غزة.. الاحتلال يستهدف نازحين برفح وأميركا تنفذ أول إنزال للمساعدات',
  'سرايا القدس ردا على نتنياهو: مسألة اليوم التالي بغزة تحددها المقاومة',
  'سرايا القدس ردا على نتنياهو: مسألة اليوم التالي بغزة تحددها المقاومة',
  'القسام تبث مشاهد استهداف دبابات إسرائيلية وتفخيخ كمائن بقوات خاصة',
  'غرق السفينة "روبيمار" بخليج عدن واليمن يحذر من كارثة بيئية بالبحر الأحمر',
  'اتهمت حماس بارتكاب العنف الجنسي.. فضيحة جديدة تهز "نيويورك تايمز"',
  'حزب الله يعلن مقتل 3 من عناصره ويهاجم مواقع عسكرية إسرائيلية',
  'إصابة مستوطن إسرائيلي في عملية طعن جنوبي الخليل',
  'واشنطن تبدأ أول إنزال جوي للمساعدات على غزة',
  'من السياسة إلى العمالة وصولا للمرتزقة.. لما

In [7]:
def translate_headlines(headlines_by_language):
    translated_headlines = {}
    for language, headlines in headlines_by_language.items():
        translated_headlines[language] = []
        for headline in headlines:
            response = openai.Completion.create(
                model="gpt-3.5-turbo-instruct",
                prompt=f"Translate the following {language} headline into English: \"{headline}\"",
                temperature=0.3,
                max_tokens=60
            )
            translated_headline = response.choices[0].text.strip()
            translated_headlines[language].append(translated_headline)
    return translated_headlines
headlines_by_language = fetch_headlines()
translated_headlines = translate_headlines(headlines_by_language)

In [8]:
print(translated_headlines)

{'chinese': ['"European Experts: New Year, New Goals, Bringing "Beautiful China" Within Reach"', '"Resident Ambassador Praises "Only This Green": As If Being in a Painting"', '"Building a \'Bridge of Hearts\' that Listens to the Voices of the People and Gathers their Wisdom - A Summary of the National People\'s Congress Standing Committee\'s Representative Work in the Past Year"', '"China\'s Ecological Civilization Concept Embodies Great Wisdom - Interview with American Academy of Humanities and Sciences Academician John Coble"', '"Nearly 50 Tour Groups "Land" for Sightseeing - Taiwanese Tourism Industry Hopes Authorities Will Lift Ban"', '"New Chapter Begins in the Practice of "One Country, Two Systems" - A Side Note on the Operation of the Hengqin Port"', '"Russian Foreign Minister: Western countries have dispatched active duty officers to Ukraine"', '"Three "Perfect" Achievements, 1 First, Multiple Accomplishments! Thumbs Up for the Crew of Shenzhou 17"', '"Hainan Free Trade Port Be

In [9]:
# from transformers import pipeline

# def analyze_sentiment(translated_headlines):
#     sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
#     sentiment_results = {}
    
#     for language, headlines in translated_headlines.items():
#         language_results = []
#         for headline in headlines:
#             sentiment = sentiment_pipeline(headline)
#             sentiment_result = {
#                 'text': headline,
#                 'label': sentiment[0]['label'],
#                 'score': sentiment[0]['score']
#             }
#             language_results.append(sentiment_result)
#         sentiment_results[language] = language_results
    
#     return sentiment_results

# Example usage:
# translated_headlines = translate_headlines(headlines_by_language)
# sentiment_analysis_results = analyze_sentiment(translated_headlines)


In [23]:
from transformers import pipeline

def analyze_sentiment_and_print(translated_headlines):
    sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
    
    for language, headlines in translated_headlines.items():
        # print(f"Language: {language}")
        for headline in headlines:
            sentiment = sentiment_pipeline(headline)
            print(f"Language: {language}")
            print(f"Headline: '{headline}'")
            print(f"Label: {sentiment[0]['label']}, Score: {sentiment[0]['score']:.2f}")
            print("---")  # Separator for readability

# Assuming translated_headlines is your dictionary of translated headlines
analyze_sentiment_and_print(translated_headlines)


Language: chinese
Language: chinese
Headline: '"European Experts: New Year, New Goals, Bringing "Beautiful China" Within Reach"'
Label: positive, Score: 0.81
---
Language: chinese
Headline: '"Resident Ambassador Praises "Only This Green": As If Being in a Painting"'
Label: neutral, Score: 0.56
---
Language: chinese
Headline: '"Building a 'Bridge of Hearts' that Listens to the Voices of the People and Gathers their Wisdom - A Summary of the National People's Congress Standing Committee's Representative Work in the Past Year"'
Label: neutral, Score: 0.50
---
Language: chinese
Headline: '"China's Ecological Civilization Concept Embodies Great Wisdom - Interview with American Academy of Humanities and Sciences Academician John Coble"'
Label: neutral, Score: 0.46
---
Language: chinese
Headline: '"Nearly 50 Tour Groups "Land" for Sightseeing - Taiwanese Tourism Industry Hopes Authorities Will Lift Ban"'
Label: neutral, Score: 0.59
---
Language: chinese
Headline: '"New Chapter Begins in the P

In [11]:
#turn each set of translated headlines into a dataframe that can be saved as CSV  
#consider AUTOMATING the scraping and translating every hour (GitHub Actions)
#visualize 

In [17]:
results = []
# for sentences in df:
for index, row in edf.iterrows():
    dict = {}
    sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
    data = index 
    rating = sentiment_pipeline(data)
    print(index)
    print(rating)
    #df['score'] = rating
    # print(df['score'])

    dict['text'] = index
    dict['label'] = rating[0]['label']
    dict['score'] = rating[0]['score']
    results.append(dict)
print("translated: ", results) 

NameError: name 'edf' is not defined

In [28]:
from transformers import pipeline

def analyze_sentiment_and_get_results(translated_headlines):
  """
  Analyzes sentiment for each headline in the translated_headlines dictionary and returns a list of dictionaries.

  Args:
      translated_headlines: A dictionary containing translated headlines, where keys are languages and values are lists of headlines.

  Returns:
      A list of dictionaries, where each dictionary holds the text, label, and score for a headline.
  """
  sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
  results = []

  for language, headlines in translated_headlines.items():
    for headline in headlines:
      sentiment = sentiment_pipeline(headline)
      # Extract label and score

      label = sentiment[0]['label']
      score = sentiment[0]['score']

      # Create a dictionary for each headline and its analysis
      result = {
        'language': language,
          "text": headline,
          "label": label,
          "score": score
      }
      results.append(result)

  return results

# Assuming translated_headlines is your dictionary of translated headlines
results = analyze_sentiment_and_get_results(translated_headlines)
print(results)

[{'language': 'chinese', 'text': '"European Experts: New Year, New Goals, Bringing "Beautiful China" Within Reach"', 'label': 'positive', 'score': 0.8094611763954163}, {'language': 'chinese', 'text': '"Resident Ambassador Praises "Only This Green": As If Being in a Painting"', 'label': 'neutral', 'score': 0.5630508661270142}, {'language': 'chinese', 'text': '"Building a \'Bridge of Hearts\' that Listens to the Voices of the People and Gathers their Wisdom - A Summary of the National People\'s Congress Standing Committee\'s Representative Work in the Past Year"', 'label': 'neutral', 'score': 0.5004069209098816}, {'language': 'chinese', 'text': '"China\'s Ecological Civilization Concept Embodies Great Wisdom - Interview with American Academy of Humanities and Sciences Academician John Coble"', 'label': 'neutral', 'score': 0.4616212844848633}, {'language': 'chinese', 'text': '"Nearly 50 Tour Groups "Land" for Sightseeing - Taiwanese Tourism Industry Hopes Authorities Will Lift Ban"', 'lab

In [20]:
type(results)


list

In [29]:
import pandas as pd
resultsdf = pd.DataFrame(results)
resultsdf 

Unnamed: 0,language,text,label,score
0,chinese,"""European Experts: New Year, New Goals, Bringi...",positive,0.809461
1,chinese,"""Resident Ambassador Praises ""Only This Green""...",neutral,0.563051
2,chinese,"""Building a 'Bridge of Hearts' that Listens to...",neutral,0.500407
3,chinese,"""China's Ecological Civilization Concept Embod...",neutral,0.461621
4,chinese,"""Nearly 50 Tour Groups ""Land"" for Sightseeing ...",neutral,0.586214
5,chinese,"""New Chapter Begins in the Practice of ""One Co...",neutral,0.795426
6,chinese,"""Russian Foreign Minister: Western countries h...",neutral,0.517621
7,chinese,"""Three ""Perfect"" Achievements, 1 First, Multip...",positive,0.908627
8,chinese,"""Hainan Free Trade Port Benefits Over 20 Billi...",neutral,0.61454
9,chinese,The 14th National People's Congress will hold ...,neutral,0.816515


In [30]:
print(resultsdf)

   language                                               text     label  \
0   chinese  "European Experts: New Year, New Goals, Bringi...  positive   
1   chinese  "Resident Ambassador Praises "Only This Green"...   neutral   
2   chinese  "Building a 'Bridge of Hearts' that Listens to...   neutral   
3   chinese  "China's Ecological Civilization Concept Embod...   neutral   
4   chinese  "Nearly 50 Tour Groups "Land" for Sightseeing ...   neutral   
5   chinese  "New Chapter Begins in the Practice of "One Co...   neutral   
6   chinese  "Russian Foreign Minister: Western countries h...   neutral   
7   chinese  "Three "Perfect" Achievements, 1 First, Multip...  positive   
8   chinese  "Hainan Free Trade Port Benefits Over 20 Billi...   neutral   
9   chinese  The 14th National People's Congress will hold ...   neutral   
10   arabic  "War on Gaza.. Occupation Targets Displaced Pe...  negative   
11   arabic  "Al-Quds Brigades in Response to Netanyahu: Th...   neutral   
12   arabic 

In [12]:
# What I'm going to work next: 
# Find free websites in German, French, Spanigh, Russian
# Create openai prompt to find tags
# Test the prompts 

In [13]:
# What I'm going to work next: 
# Find free websites in German, French, Spanish, Russian
# Translate all the main headlines 
# Do a sentiment analysis on the headlines using a LLM
# calculate the avg and median scores for each language
#COMPARE the total score for each language's website 

In [14]:
#FIND PYTHON library to visualize DIRECTLY on jupyter notebook