### Translating and Comparing News Headlines: step by step  

#### Comparing headlines across languages is cumbersome; doing an automated sentiment analysis on a sampling used to be very complex before the advent of ChatGPT

In [1]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [2]:
import openai
import os
import requests
import bs4

In [3]:
os.environ['API_KEY'] = "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"

In [4]:
openai.api_key = os.getenv("API_KEY")

In [5]:
news_sites = {
    "chinese" : ("https://cn.chinadaily.com.cn", "div.Home_content_Item_Text h1 a"),
    "arabic": ("https://aljazeera.net", "h3.article-card__title")
}
print(news_sites["arabic"])

('https://aljazeera.net', 'h3.article-card__title')


In [6]:
def fetch_headlines():
    headlines_by_language = {}
    for language, (url, tag) in news_sites.items():
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        headlines = [h.getText() for h in soup.select(tag)[:10]]
        headlines_by_language[language] = headlines
    return headlines_by_language
fetch_headlines()


{'chinese': ['直播：全国政协十四届二次会议新闻发布会',
  '十四届全国人大二次会议将于3月4日12时举行新闻发布会',
  '全国政协委员张兴赢：中国多举措推动构建公平合理、合作共赢的全球气候治理体系',
  '三江源地区科学研究将猛兽重新带回公众视野',
  '贵州凯里：苗族同胞欢庆“甘囊香”芦笙节',
  '农业农村部：珠江、淮河等流域进入为期四个月春季禁渔期',
  '减税降费红利“落袋” 支持企业创新发展',
  '“两会”受瞩目 国际人士认为中国式现代化将给世界带来更多利好',
  '有力度、有速度、有温度 多领域释放稳中向好发展积极信号',
  '“新中式”服装为啥受追捧'],
 'arabic': ['الحرب على غزة.. قصف عنيف جنوب القطاع ومحادثات مرتقبة بالقاهرة حول الهدنة',
  'رغم غضب نتنياهو هاريس تلتقي غانتس غدا لبحث حرب غزة',
  'رغم غضب نتنياهو هاريس تلتقي غانتس غدا لبحث حرب غزة',
  'الاحتلال يعترف بمقتل 3 جنود بغزة ويقصف مجددا حشودا تنتظر مساعدات',
  'ترامب يفوز بترشيح الجمهوريين في ولايتي ميزوري وميشيغان',
  'واشنطن: كمية المساعدات لغزة لا تكفي وإيصالها جوا ليس بديلا عن البر',
  'مباحثات بين أمير قطر والرئيس الإيراني في الجزائر',
  'اشتباكات بين أجهزة الأمن الفلسطينية ومقاومين في مخيم جنين',
  'مجلس الأمن يعبر عن قلقه إزاء مجزرة دوار النابلسي بغزة',
  'ألمانيا تستقطب أطباء دول الجنوب لمواجهة عجز قطاعها الصحي']}

In [7]:
def translate_headlines(headlines_by_language):
    translated_headlines = {}
    for language, headlines in headlines_by_language.items():
        translated_headlines[language] = []
        for headline in headlines:
            response = openai.Completion.create(
                model="gpt-3.5-turbo-instruct",
                prompt=f"Translate the following {language} headline into English: \"{headline}\"",
                temperature=0.3,
                max_tokens=60
            )
            translated_headline = response.choices[0].text.strip()
            translated_headlines[language].append(translated_headline)
    return translated_headlines
headlines_by_language = fetch_headlines()
translated_headlines = translate_headlines(headlines_by_language)

In [8]:
print(translated_headlines)

{'chinese': ['"Live Broadcast: Press Conference of the Second Session of the 14th National Committee of the Chinese People\'s Political Consultative Conference"', "The 14th National People's Congress will hold a press conference at 12:00 on March 4th.", '"CPPCC Member Zhang Xingying: China Takes Multiple Measures to Promote the Construction of a Fair, Reasonable, and Win-Win Global Climate Governance System"', '"Scientific research in the Three-River-Source Region will bring wild animals back into the public eye"', '"Kaili, Guizhou: Miao People Celebrate "Gannang Xiang" Reed Pipe Festival"', 'Ministry of Agriculture and Rural Affairs: Pearl River, Huai River and other basins enter a four-month spring fishing ban period', '"Tax Cuts and Fee Reduction Benefits "in the Bag" to Support Enterprise Innovation and Development"', '"Two Sessions" Attracts Attention, International Experts Believe Chinese-style Modernization Will Bring More Benefits to the World', '"Strong, Fast, and Warm: Multip

In [9]:
# from transformers import pipeline

# def analyze_sentiment(translated_headlines):
#     sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
#     sentiment_results = {}
    
#     for language, headlines in translated_headlines.items():
#         language_results = []
#         for headline in headlines:
#             sentiment = sentiment_pipeline(headline)
#             sentiment_result = {
#                 'text': headline,
#                 'label': sentiment[0]['label'],
#                 'score': sentiment[0]['score']
#             }
#             language_results.append(sentiment_result)
#         sentiment_results[language] = language_results
    
#     return sentiment_results

# Example usage:
# translated_headlines = translate_headlines(headlines_by_language)
# sentiment_analysis_results = analyze_sentiment(translated_headlines)


In [10]:
from transformers import pipeline

def analyze_sentiment_and_print(translated_headlines):
    sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
    
    for language, headlines in translated_headlines.items():
        # print(f"Language: {language}")
        for headline in headlines:
            sentiment = sentiment_pipeline(headline)
            print(f"Language: {language}")
            print(f"Headline: '{headline}'")
            print(f"Label: {sentiment[0]['label']}, Score: {sentiment[0]['score']:.2f}")
            print("---")  # Separator for readability

# Assuming translated_headlines is your dictionary of translated headlines
analyze_sentiment_and_print(translated_headlines)


Language: chinese
Headline: '"Live Broadcast: Press Conference of the Second Session of the 14th National Committee of the Chinese People's Political Consultative Conference"'
Label: neutral, Score: 0.91
---
Language: chinese
Headline: 'The 14th National People's Congress will hold a press conference at 12:00 on March 4th.'
Label: neutral, Score: 0.82
---
Language: chinese
Headline: '"CPPCC Member Zhang Xingying: China Takes Multiple Measures to Promote the Construction of a Fair, Reasonable, and Win-Win Global Climate Governance System"'
Label: positive, Score: 0.57
---
Language: chinese
Headline: '"Scientific research in the Three-River-Source Region will bring wild animals back into the public eye"'
Label: positive, Score: 0.57
---
Language: chinese
Headline: '"Kaili, Guizhou: Miao People Celebrate "Gannang Xiang" Reed Pipe Festival"'
Label: positive, Score: 0.56
---
Language: chinese
Headline: 'Ministry of Agriculture and Rural Affairs: Pearl River, Huai River and other basins ente

In [11]:
#turn each set of translated headlines into a dataframe that can be saved as CSV  
#consider AUTOMATING the scraping and translating every hour (GitHub Actions)
#visualize 

In [12]:
# results = []
# # for sentences in df:
# for index, row in edf.iterrows():
#     dict = {}
#     sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
#     data = index 
#     rating = sentiment_pipeline(data)
#     print(index)
#     print(rating)
#     #df['score'] = rating
#     # print(df['score'])

#     dict['text'] = index
#     dict['label'] = rating[0]['label']
#     dict['score'] = rating[0]['score']
#     results.append(dict)
# print("translated: ", results) 

In [13]:
from transformers import pipeline

def analyze_sentiment_and_get_results(translated_headlines):
  """
  Analyzes sentiment for each headline in the translated_headlines dictionary and returns a list of dictionaries.

  Args:
      translated_headlines: A dictionary containing translated headlines, where keys are languages and values are lists of headlines.

  Returns:
      A list of dictionaries, where each dictionary holds the text, label, and score for a headline.
  """
  sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
  results = []

  for language, headlines in translated_headlines.items():
    for headline in headlines:
      sentiment = sentiment_pipeline(headline)
      # Extract label and score

      label = sentiment[0]['label']
      score = sentiment[0]['score']

      # Create a dictionary for each headline and its analysis
      result = {
        'language': language,
          "text": headline,
          "label": label,
          "score": score
      }
      results.append(result)

  return results

# Assuming translated_headlines is your dictionary of translated headlines
results = analyze_sentiment_and_get_results(translated_headlines)
print(results)

[{'language': 'chinese', 'text': '"Live Broadcast: Press Conference of the Second Session of the 14th National Committee of the Chinese People\'s Political Consultative Conference"', 'label': 'neutral', 'score': 0.9065374732017517}, {'language': 'chinese', 'text': "The 14th National People's Congress will hold a press conference at 12:00 on March 4th.", 'label': 'neutral', 'score': 0.8165149688720703}, {'language': 'chinese', 'text': '"CPPCC Member Zhang Xingying: China Takes Multiple Measures to Promote the Construction of a Fair, Reasonable, and Win-Win Global Climate Governance System"', 'label': 'positive', 'score': 0.5733625888824463}, {'language': 'chinese', 'text': '"Scientific research in the Three-River-Source Region will bring wild animals back into the public eye"', 'label': 'positive', 'score': 0.5725695490837097}, {'language': 'chinese', 'text': '"Kaili, Guizhou: Miao People Celebrate "Gannang Xiang" Reed Pipe Festival"', 'label': 'positive', 'score': 0.5600460767745972}, 

In [14]:
type(results)


list

In [15]:
import pandas as pd
resultsdf = pd.DataFrame(results)
resultsdf 

Unnamed: 0,language,text,label,score
0,chinese,"""Live Broadcast: Press Conference of the Secon...",neutral,0.906537
1,chinese,The 14th National People's Congress will hold ...,neutral,0.816515
2,chinese,"""CPPCC Member Zhang Xingying: China Takes Mult...",positive,0.573363
3,chinese,"""Scientific research in the Three-River-Source...",positive,0.57257
4,chinese,"""Kaili, Guizhou: Miao People Celebrate ""Gannan...",positive,0.560046
5,chinese,Ministry of Agriculture and Rural Affairs: Pea...,negative,0.548483
6,chinese,"""Tax Cuts and Fee Reduction Benefits ""in the B...",neutral,0.653988
7,chinese,"""Two Sessions"" Attracts Attention, Internation...",positive,0.684258
8,chinese,"""Strong, Fast, and Warm: Multiple Fields Relea...",positive,0.664857
9,chinese,"""Why is ""New Chinese Style"" Clothing in High D...",negative,0.661514


In [16]:
#print(resultsdf)

In [17]:
!pip install jupyter-ai-magics

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[33mDEPRECATION: dexplot 0.1.4 has a non-standard dependency specifier scipy>=1.0matplotlib>=3.1. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of dexplot or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [18]:
%env PROVIDER_API_KEY='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'

env: PROVIDER_API_KEY='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'


In [19]:
%load_ext jupyter_ai_magics


In [25]:
# %%ai chatgpt
# please store resultsdf in a csv

In [26]:
resultsdf.to_csv("results.csv")

In [21]:
# What I'm going to work next: 
# Find free websites in German, French, Spanigh, Russian
# Create openai prompt to find tags
# Test the prompts 

In [22]:
# What I'm going to work next: 
# Find free websites in German, French, Spanish, Russian
# Translate all the main headlines 
# Do a sentiment analysis on the headlines using a LLM
# calculate the avg and median scores for each language
#COMPARE the total score for each language's website 

In [23]:
#FIND PYTHON library to visualize DIRECTLY on jupyter notebook