### Translating and Comparing News Headlines: step by step  

#### Comparing headlines across languages is cumbersome; doing an automated sentiment analysis on a sampling used to be very complex before the advent of ChatGPT

In [39]:
import pathlib
import textwrap
import pandas as pd 
import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown
from dotenv import load_dotenv


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [17]:
import openai
import os
import requests
import bs4

In [19]:
load_dotenv()
api_key = os.environ.get("API_KEY")

In [20]:
openai.api_key = os.getenv("API_KEY")

In [None]:
#<span class="news-preview__title" data-main_top_low_article_title="">
#<span class="tass_pkg_title-xVUT1 tass_pkg_title--inline-9iqZC tass_pkg_title--font_weight_medium-5SS-v tass_pkg_title--variant_h5_default-51cPB tass_pkg_title--color_tass-IRRCy">
".tass_pkg_title-xVUT1 tass_pkg_title--inline-9iqZC tass_pkg_title--font_weight_medium-5SS-v tass_pkg_title--variant_h5_default-51cPB tass_pkg_title--color_tass-IRRCy"

In [21]:
news_sites = {
    "russian" : ("https://tass.com/", ".news-preview__title"),
}
print(news_sites["russian"])

('https://tass.com/', '.news-preview__title')


In [22]:
def fetch_headlines():
    headlines_by_language = {}
    for language, (url, tag) in news_sites.items():
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        headlines = [h.getText() for h in soup.select(tag)[:10]]
        headlines_by_language[language] = headlines
    return headlines_by_language
fetch_headlines()


{'russian': ['Assad says Syria’s preferences at presidential election in Russia are obvious',
  'Shehbaz Sharif elected as new prime minister of Pakistan — PTV News',
  'Hamas says truce agreement with Israel possible within 24-48 hours — agency',
  'Russian forces eliminate over 250 Ukrainian troops in south Donetsk area over past day',
  'Six militants killed in special counter-terrorist operation in Russia’s Ingushetia Region',
  'Germany hopes for swift results of investigation into leaked military talk',
  'Air defense systems wipe out 38 drones over Crimea overnight',
  'UNSC approves statement on Israeli strike on civilians in Gaza Strip',
  'Putin has clear plan of country’s development, 85% of Russians say',
  'Press review: Putin sets out future vision for Russia and Kiev spurns possibility of peace']}

In [10]:
print(headlines_by_language)

{'russian': ['Assad says Syria’s preferences at presidential election in Russia are obvious', 'Shehbaz Sharif elected as new prime minister of Pakistan — PTV News', 'Hamas says truce agreement with Israel possible within 24-48 hours — agency', 'Russian forces eliminate over 250 Ukrainian troops in south Donetsk area over past day', 'Six militants killed in special counter-terrorist operation in Russia’s Ingushetia Region', 'Germany hopes for swift results of investigation into leaked military talk', 'Air defense systems wipe out 38 drones over Crimea overnight', 'UNSC approves statement on Israeli strike on civilians in Gaza Strip', 'Putin has clear plan of country’s development, 85% of Russians say', 'Press review: Putin sets out future vision for Russia and Kiev spurns possibility of peace']}


In [9]:
def translate_headlines(headlines_by_language):
    translated_headlines = {}
    for language, headlines in headlines_by_language.items():
        translated_headlines[language] = []
        for headline in headlines:
            response = openai.Completion.create(
                model="gpt-3.5-turbo-instruct",
                prompt=f"Translate the following {language} headline into English: \"{headline}\"",
                temperature=0.3,
                max_tokens=60
            )
            translated_headline = response.choices[0].text.strip()
            translated_headlines[language].append(translated_headline)
    return translated_headlines
headlines_by_language = fetch_headlines()
translated_headlines = translate_headlines(headlines_by_language)

In [8]:
print(translated_headlines)

{'chinese': ['"Special Topic: Press Conference for the Second Session of the 14th National Committee of the Chinese People\'s Political Consultative Conference"', '"The 14th National People\'s Congress will hold a press conference at 12:00 on March 4th"', '"International Figures Pay Attention to China\'s Two Sessions, Hoping for China to Contribute More Momentum to Global Development"', '"National Committee Member of Shaanxi Province Carries the Heavy Responsibility of Representing the People of Shaanxi, Heads to Beijing with Enthusiasm to Attend Conference"', '"Grasping the Opportunities of the Times, Drawing a Unified Circle Together: National Committee Members of the Chinese People\'s Political Consultative Conference Actively Fulfill their Duties and Offer Suggestions"', "Liu Jieyi: The National Committee of the Chinese People's Political Consultative Conference will continue to focus on the goal and task of China's modernization and offer suggestions and efforts.", '"Highlights of

In [34]:
#NOT THE RIGHT IMPLEMENTATION:
# from transformers import pipeline

# def analyze_sentiment_and_print(headlines_by_language):
#     sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
    
#     for language, headlines in headlines_by_language.items():
#         # print(f"Language: {language}")
#         for headline in headlines:
#             sentiment = sentiment_pipeline(headline)
#             print(f"Language: {language}")
#             print(f"Headline: '{headline}'")
#             print(f"Label: {sentiment[0]['label']}, Score: {sentiment[0]['score']:.2f}")
#             print("---")  # Separator for readability

# # Assuming translated_headlines is your dictionary of translated headlines
# analyze_sentiment_and_print(headlines_by_language)


In [35]:
from transformers import pipeline

def analyze_sentiment_and_get_results(translated_headlines):
  """
  Analyzes sentiment for each headline in the translated_headlines dictionary and returns a list of dictionaries.
  Args:
      translated_headlines: A dictionary containing translated headlines, where keys are languages and values are lists of headlines.
  Returns:
      A list of dictionaries, where each dictionary holds the text, label, and score for a headline.
  """
  sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
  results = []

  for language, headlines in translated_headlines.items():
    for headline in headlines:
      sentiment = sentiment_pipeline(headline)
      # Extract label and score
      label = sentiment[0]['label']
      score = sentiment[0]['score']

    # return analyzed_headlines      
      # Create a dictionary for each headline and its analysis
      result = {
        'language': language,
          "text": headline,
          "label": label,
          "score": score,
      }
      results.append(result)
  return results

# Assuming translated_headlines is your dictionary of translated headlines
results = analyze_sentiment_and_get_results(translated_headlines)
#print(results)
# print("\n".join(results))
for item in results:
    print(f"{item}\n----")


{'language': 'russian', 'text': '"Assad declares that Syria\'s preferences in the Russian presidential election are evident"', 'label': 'neutral', 'score': 0.5216777324676514}
----
{'language': 'russian', 'text': '"Shehbaz Sharif chosen as Pakistan\'s new prime minister — PTV News"', 'label': 'neutral', 'score': 0.5745922923088074}
----
{'language': 'russian', 'text': '"Hamas claims possibility of truce agreement with Israel in 24-48 hours, according to agency"', 'label': 'neutral', 'score': 0.7492167353630066}
----
{'language': 'russian', 'text': '"Russian forces have eliminated more than 250 Ukrainian troops in the southern Donetsk area in the past day"', 'label': 'negative', 'score': 0.6883344650268555}
----
{'language': 'russian', 'text': '"Six Militants Killed in Special Counter-Terrorist Operation in Russia\'s Ingushetia Region"', 'label': 'negative', 'score': 0.7176731824874878}
----
{'language': 'russian', 'text': '"Germany hopes for quick results of investigation into leaked m

In [37]:
for row in results: 
    if row['label'] == 'negative':
        if row['score'] > 0.7:
            row['categ_score'] = -2
        else: 
            row['categ_score'] = -1
    elif row['label'] == 'positive':
        if row['score'] > 0.7:
            row['categ_score'] = 3
        else:
            row['categ_score'] = 2
    else:
        row['categ_score'] = 1
print(results[0]) 

{'language': 'russian', 'text': '"Assad declares that Syria\'s preferences in the Russian presidential election are evident"', 'label': 'neutral', 'score': 0.5216777324676514, 'categ_score': 1}


In [38]:
print(results[2]) 

{'language': 'russian', 'text': '"Hamas claims possibility of truce agreement with Israel in 24-48 hours, according to agency"', 'label': 'neutral', 'score': 0.7492167353630066, 'categ_score': 1}


In [40]:
russian_df = pd.DataFrame(results)

In [41]:
russian_df.to_csv("russian.csv")

In [42]:
ls 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
GPTdf-bylanguage.ipynb    GPTtrans-sentiment.qmd    arabic.csv
GPTdf-gemini.ipynb        [1m[36mGPTtrans-sentiment_files[m[m/ chinese.csv
GPTdf-manual.ipynb        GPTtranslate.ipynb        clean_GPTscrape.ipynb
GPTdf-russian.ipynb       GPTviz-gemini.ipynb       results.csv
GPTscrape.ipynb           GPTviz-mistral.ipynb      russian.csv
GPTtrans-sentiment.ipynb  README.md                 wider_GPTscrape.ipynb


In [27]:
import pandas as pd 

chinese_data = []
arabic_data = []

# Iterate through the results and add elements to respective lists
for item in results:
  if item["language"] == "chinese":
    chinese_data.append(item)
  elif item["language"] == "arabic":
    arabic_data.append(item)

# Create DataFrames from lists
chinese_df = pd.DataFrame(chinese_data)
arabic_df = pd.DataFrame(arabic_data)

# Print the DataFrames
print("Chinese headlines:")
print(chinese_df)
print("\nArabic headlines:")
print(arabic_df)


Chinese headlines:
  language                                               text     label  \
0  chinese  "Special Topic: Press Conference for the Secon...   neutral   
1  chinese  "The 14th National People's Congress will hold...   neutral   
2  chinese  "International Figures Pay Attention to China'...  positive   
3  chinese  "National Committee Member of Shaanxi Province...  positive   
4  chinese  "Grasping the Opportunities of the Times, Draw...   neutral   
5  chinese  Liu Jieyi: The National Committee of the Chine...  positive   
6  chinese  "Highlights of the First Press Conference of T...  positive   
7  chinese  "National Ear Care Day: 6 Questions and Answer...   neutral   
8  chinese  Liu Jieyi: China's Long-Term Economic Outlook ...  positive   
9  chinese  "National Committee of the Chinese People's Po...   neutral   

      score  categ_score  
0  0.912666            1  
1  0.827835            1  
2  0.764347            3  
3  0.742200            3  
4  0.411955         

In [None]:
#count instances ?? so that even if the occurrences of positive is 0, it is countable 

In [None]:
#%env PROVIDER_API_KEY='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'

In [None]:
#%load_ext jupyter_ai_magics


In [None]:
# %%ai chatgpt
# please store resultsdf in a csv

In [None]:
#resultsdf.to_csv("results.csv")

In [None]:
# What I'm going to work next: 
# Find free websites in German, French, Spanigh, Russian
# Create openai prompt to find tags
# Test the prompts 

In [None]:
# What I'm going to work next: 
# Find free websites in German, French, Spanish, Russian
# Translate all the main headlines 
# Do a sentiment analysis on the headlines using a LLM
# calculate the avg and median scores for each language
#COMPARE the total score for each language's website 

In [None]:
#FIND PYTHON library to visualize DIRECTLY on jupyter notebook