### Translating and Comparing News Headlines: step by step  

#### Comparing headlines across languages is cumbersome; doing an automated sentiment analysis on a sampling used to be very complex before the advent of ChatGPT

In [1]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown
from dotenv import load_dotenv

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [2]:
import openai
import os
import requests
import bs4

In [3]:
load_dotenv()
api_key = os.environ.get("API_KEY")

In [4]:
openai.api_key = os.getenv("API_KEY")

In [5]:
news_sites = {
    "chinese" : ("https://cn.chinadaily.com.cn", "div.Home_content_Item_Text h1 a"),
    "arabic": ("https://aljazeera.net", "h3.article-card__title")
}
print(news_sites["arabic"])

('https://aljazeera.net', 'h3.article-card__title')


In [6]:
def fetch_headlines():
    headlines_by_language = {}
    for language, (url, tag) in news_sites.items():
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        headlines = [h.getText() for h in soup.select(tag)[:10]]
        headlines_by_language[language] = headlines
    return headlines_by_language
fetch_headlines()


{'chinese': ['专题：全国政协十四届二次会议新闻发布会',
  '十四届全国人大二次会议将于3月4日12时举行新闻发布会',
  '多国人士关注中国两会 期待中国为全球发展贡献更多动力',
  '住陕全国政协委员肩负三秦儿女重托 满怀履职热情赴京参会',
  '把握时代机遇 共绘同心圆 全国政协委员积极履职尽责、建言献策',
  '刘结一：全国政协将继续聚焦中国式现代化目标任务献计出力',
  '今年两会首场发布会，要点来了→',
  '全国爱耳日 ：6问6答带你了解如何“爱耳”',
  '刘结一：中国经济长期向好态势将持续巩固和增强',
  '全国政协十四届二次会议会期6天 将举办三场“委员通道”'],
 'arabic': ['الحرب على غزة.. مجازر طحين جديدة وحماس تتهم الاحتلال بالتخبط والارتباك',
  'شاهد.. ساعات الجمر الإسرائيلية يوم 7 أكتوبر',
  'شاهد.. ساعات الجمر الإسرائيلية يوم 7 أكتوبر',
  'حماس تؤكد جديتها بالتفاوض وأنباء عن مقاطعة إسرائيل لاجتماع القاهرة',
  'آخرها ما حدث بجنين.. لماذا يصر أمن السلطة على ملاحقة المقاومة بالضفة؟',
  'تلغراف: بن غفير ممنوع من اجتماعات استخباراتية بعد عدة تسريبات',
  'شاهد.. سرايا القدس تفجر دبابة وتستدرج قوات إسرائيلية لكمين في خان يونس',
  'مجزرتان في غزة بحق الباحثين عن الطحين والاحتلال يتكبد خسائر جديدة',
  '%10 من قتلى الاحتلال بغزة أميركيون.. معلومات تشعل مواقع التواصل',
  'منير العكش: مصير الهنود الحمر و”المعنى الإسرائيلي لأميركا”']}

In [7]:
def translate_headlines(headlines_by_language):
    translated_headlines = {}
    for language, headlines in headlines_by_language.items():
        translated_headlines[language] = []
        for headline in headlines:
            response = openai.Completion.create(
                model="gpt-3.5-turbo-instruct",
                prompt=f"Translate the following {language} headline into English: \"{headline}\"",
                temperature=0.3,
                max_tokens=60
            )
            translated_headline = response.choices[0].text.strip()
            translated_headlines[language].append(translated_headline)
    return translated_headlines
headlines_by_language = fetch_headlines()
translated_headlines = translate_headlines(headlines_by_language)

In [8]:
print(translated_headlines)

{'chinese': ['"Special Topic: Press Conference for the Second Session of the 14th National Committee of the Chinese People\'s Political Consultative Conference"', "The 14th National People's Congress will hold a press conference at 12:00 on March 4th.", '"International Community Pays Attention to China\'s Two Sessions, Awaits China to Contribute More Momentum to Global Development"', '"Shaanxi National Committee Members Shoulder the Responsibility of Representing the People of Shanxi and Head to Beijing with Enthusiasm to Attend the Conference"', '"Grasping the Opportunities of the Times, Drawing a Unified Circle Together - National Committee Members of the Chinese People\'s Political Consultative Conference Actively Fulfill their Duties and Offer Suggestions"', "Liu Jieyi: The National Committee of the Chinese People's Political Consultative Conference will continue to focus on the goal and task of China's modernization and provide suggestions and efforts.", '"Highlights of the First 

In [9]:
from transformers import pipeline

def analyze_sentiment_and_print(translated_headlines):
    sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
    
    for language, headlines in translated_headlines.items():
        # print(f"Language: {language}")
        for headline in headlines:
            sentiment = sentiment_pipeline(headline)
            print(f"Language: {language}")
            print(f"Headline: '{headline}'")
            print(f"Label: {sentiment[0]['label']}, Score: {sentiment[0]['score']:.2f}")
            print("---")  # Separator for readability
    

# Assuming translated_headlines is your dictionary of translated headlines
analyze_sentiment_and_print(translated_headlines)


Language: chinese
Headline: '"Special Topic: Press Conference for the Second Session of the 14th National Committee of the Chinese People's Political Consultative Conference"'
Label: neutral, Score: 0.91
---
Language: chinese
Headline: 'The 14th National People's Congress will hold a press conference at 12:00 on March 4th.'
Label: neutral, Score: 0.82
---
Language: chinese
Headline: '"International Community Pays Attention to China's Two Sessions, Awaits China to Contribute More Momentum to Global Development"'
Label: positive, Score: 0.75
---
Language: chinese
Headline: '"Shaanxi National Committee Members Shoulder the Responsibility of Representing the People of Shanxi and Head to Beijing with Enthusiasm to Attend the Conference"'
Label: positive, Score: 0.62
---
Language: chinese
Headline: '"Grasping the Opportunities of the Times, Drawing a Unified Circle Together - National Committee Members of the Chinese People's Political Consultative Conference Actively Fulfill their Duties an

In [21]:
from transformers import pipeline

def analyze_sentiment_and_get_results(translated_headlines):
  """
  Analyzes sentiment for each headline in the translated_headlines dictionary and returns a list of dictionaries.

  Args:
      translated_headlines: A dictionary containing translated headlines, where keys are languages and values are lists of headlines.

  Returns:
      A list of dictionaries, where each dictionary holds the text, label, and score for a headline.
  """
  sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
  results = []

  for language, headlines in translated_headlines.items():
    for headline in headlines:
      sentiment = sentiment_pipeline(headline)
      # Extract label and score

      label = sentiment[0]['label']
      score = sentiment[0]['score']

    # return analyzed_headlines
        
      # Create a dictionary for each headline and its analysis
      result = {
        'language': language,
          "text": headline,
          "label": label,
          "score": score,
      }
      results.append(result)
  return results

# Assuming translated_headlines is your dictionary of translated headlines
results = analyze_sentiment_and_get_results(translated_headlines)
print(results)

[{'language': 'chinese', 'text': '"Special Topic: Press Conference for the Second Session of the 14th National Committee of the Chinese People\'s Political Consultative Conference"', 'label': 'neutral', 'score': 0.9126659035682678}, {'language': 'chinese', 'text': "The 14th National People's Congress will hold a press conference at 12:00 on March 4th.", 'label': 'neutral', 'score': 0.8165149688720703}, {'language': 'chinese', 'text': '"International Community Pays Attention to China\'s Two Sessions, Awaits China to Contribute More Momentum to Global Development"', 'label': 'positive', 'score': 0.7549508810043335}, {'language': 'chinese', 'text': '"Shaanxi National Committee Members Shoulder the Responsibility of Representing the People of Shanxi and Head to Beijing with Enthusiasm to Attend the Conference"', 'label': 'positive', 'score': 0.6190558671951294}, {'language': 'chinese', 'text': '"Grasping the Opportunities of the Times, Drawing a Unified Circle Together - National Committee

In [22]:
for row in results: 
    if row['label'] == 'negative':
        if row['score'] > 0.7:
            row['categ_score'] = -2
        else: 
            row['categ_score'] = -1
    elif row['label'] == 'positive':
        if row['score'] > 0.7:
            row['categ_score'] = 3
        else:
            row['categ_score'] = 2
    else:
        row['categ_score'] = 1
print(results[2]) 

{'language': 'chinese', 'text': '"International Community Pays Attention to China\'s Two Sessions, Awaits China to Contribute More Momentum to Global Development"', 'label': 'positive', 'score': 0.7549508810043335, 'categ_score': 3}


In [23]:
import pandas as pd 

chinese_data = []
arabic_data = []

# Iterate through the results and add elements to respective lists
for item in results:
  if item["language"] == "chinese":
    chinese_data.append(item)
  elif item["language"] == "arabic":
    arabic_data.append(item)

# Create DataFrames from lists
chinese_df = pd.DataFrame(chinese_data)
arabic_df = pd.DataFrame(arabic_data)



In [24]:
# Print the DataFrames
print("Chinese headlines:")
print(chinese_df)
print("\nArabic headlines:")
print(arabic_df)


Chinese headlines:
  language                                               text     label  \
0  chinese  "Special Topic: Press Conference for the Secon...   neutral   
1  chinese  The 14th National People's Congress will hold ...   neutral   
2  chinese  "International Community Pays Attention to Chi...  positive   
3  chinese  "Shaanxi National Committee Members Shoulder t...  positive   
4  chinese  "Grasping the Opportunities of the Times, Draw...   neutral   
5  chinese  Liu Jieyi: The National Committee of the Chine...   neutral   
6  chinese  "Highlights of the First Press Conference of T...  positive   
7  chinese  "National Ear Care Day: 6 Questions and Answer...   neutral   
8  chinese  Liu Jieyi: China's Long-Term Economic Improvem...  positive   
9  chinese  "National Committee of the Chinese People's Po...   neutral   

      score  categ_score  
0  0.912666            1  
1  0.816515            1  
2  0.754951            3  
3  0.619056            2  
4  0.417496         

In [27]:
chinese_df.to_csv("chinese.csv")

In [28]:
arabic_df.to_csv("arabic.csv")

In [29]:
ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
GPTdf-bylanguage.ipynb    GPTtrans-sentiment.qmd    arabic.csv
GPTdf-gemini.ipynb        [1m[36mGPTtrans-sentiment_files[m[m/ chinese.csv
GPTdf-manual.ipynb        GPTtranslate.ipynb        clean_GPTscrape.ipynb
GPTdf-russian.ipynb       GPTtranslate.qmd          results.csv
GPTscrape.ipynb           GPTviz-gemini.ipynb       wider_GPTscrape.ipynb
GPTtrans-sentiment.html   GPTviz-mistral.ipynb
GPTtrans-sentiment.ipynb  README.md


In [25]:
# from transformers import pipeline

# def analyze_sentiment_and_print(translated_headlines):
#     sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
    
#     for language, headlines in translated_headlines.items():
#         # print(f"Language: {language}")
#         for headline in headlines:
#             sentiment = sentiment_pipeline(headline)
#             print(f"Language: {language}")
#             print(f"Headline: '{headline}'")
#             print(f"Label: {sentiment[0]['label']}, Score: {sentiment[0]['score']:.2f}")
#             print("---")  # Separator for readability
#     # return analyzed_headlines
# # Assuming translated_headlines is your dictionary of translated headlines
# analyze_sentiment_and_print(translated_headlines)


In [10]:
for row in results: 
    if row['label'] == 'negative':
        if row['score'] > 0.7:
            row['categ_score'] = -2
        else: 
            row['categ_score'] = -1
    elif row['label'] == 'positive':
        if row['score'] > 0.7:
            row['categ_score'] = 3
        else:
            row['categ_score'] = 2
    else:
        row['categ_score'] = 1
print(results[2]) 

NameError: name 'results' is not defined

In [None]:
# import pandas as pd
# resultsdf = pd.DataFrame(results)
# resultsdf 

In [None]:
# What I'm going to work next: 
# Find free websites in German, French, Spanish, Russian
# Translate all the main headlines 
# Do a sentiment analysis on the headlines using a LLM
# calculate the avg and median scores for each language
#COMPARE the total score for each language's website 

In [None]:
#FIND PYTHON library to visualize DIRECTLY on jupyter notebook