### Translating and Comparing News Headlines: step by step  

#### Comparing headlines across languages is cumbersome; doing an automated sentiment analysis on a sampling used to be very complex before the advent of ChatGPT

In [1]:
import pathlib
import textwrap
import openai
import os
import requests
import bs4
import google.generativeai as genai
from transformers import pipeline
import pandas as pd

from IPython.display import display
from IPython.display import Markdown
from dotenv import load_dotenv

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [2]:
# import openai
# import os
# import requests
# import bs4

In [3]:
load_dotenv()
api_key = os.environ.get("API_KEY")

In [4]:
openai.api_key = os.getenv("API_KEY")

In [5]:
news_sites = {
    "chinese" : ("https://cn.chinadaily.com.cn", "div.Home_content_Item_Text h1 a"),
    "arabic": ("https://aljazeera.net", "h3.article-card__title")
}
print(news_sites["arabic"])

('https://aljazeera.net', 'h3.article-card__title')


In [6]:
def fetch_headlines():
    headlines_by_language = {}
    for language, (url, tag) in news_sites.items():
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        headlines = [h.getText() for h in soup.select(tag)[:10]]
        headlines_by_language[language] = headlines
    return headlines_by_language
fetch_headlines()


{'chinese': ['专题：全国政协十四届二次会议开幕会',
  '专题：十四届全国人大二次会议新闻发布会',
  '十四届全国人大二次会议3月5日上午开幕 会期7天',
  '两会世界眼｜观察中国全过程人民民主的重要窗口',
  '外交部：欢迎各国工商界继续加强对华合作',
  '全国政协十四届二次会议首场“委员通道”集体采访活动举行',
  '【100秒“瞧”见两会】阿拉伯记者秀中文 为报道两会做足功课',
  '【世界看两会】外媒：中国两会是国家善治、经济社会繁荣的生动展现',
  '法企高管：中国绿色经济及低碳领域合作已成为外资关注的“聚光点”',
  '中国两会汇聚世界关注与期待 感受中国经济活力与韧性'],
 'arabic': ['الحرب على غزة.. احتدام القتال غرب خان يونس وتصاعد الخلاف بمجلس الحرب الإسرائيلي',
  'شاهد.. القسام تستهدف مدرعات الاحتلال وجنوده في تل الهوى',
  'شاهد.. القسام تستهدف مدرعات الاحتلال وجنوده في تل الهوى',
  'أكثر من 22 ألف شهيد من الأطفال والنساء في غزة منذ بدء العدوان',
  'الاحتلال ينسف منازل شرق خان يونس والقسام تعلن قنص جنديين',
  'حريق بسفينة شحن إسرائيلية بعد تعرضها لانفجارين قبالة سواحل عدن',
  'أردوغان: تركيا تفعل ما بوسعها من أجل غزة',
  'استقالات واسعة في طاقم متحدث الجيش الإسرائيلي',
  'المحكمة العليا الأميركية ترفض حكم كولورادو بعدم أهلية ترامب للترشح ...',
  'المدربون العسكريون.. نقطة ارتكاز النفوذ الإسرائيلي في أفريقيا']}

In [7]:
def translate_headlines(headlines_by_language):
    translated_headlines = {}
    for language, headlines in headlines_by_language.items():
        translated_headlines[language] = []
        for headline in headlines:
            response = openai.Completion.create(
                model="gpt-3.5-turbo-instruct",
                prompt=f"Translate the following {language} headline into English: \"{headline}\"",
                temperature=0.3,
                max_tokens=60
            )
            translated_headline = response.choices[0].text.strip()
            translated_headlines[language].append(translated_headline)
    return translated_headlines
headlines_by_language = fetch_headlines()
translated_headlines = translate_headlines(headlines_by_language)

In [8]:
print(translated_headlines)

{'chinese': ['"Special Topic: Opening Ceremony of the Second Session of the 14th National Committee of the Chinese People\'s Political Consultative Conference"', "Special Topic: Press Conference of the Second Session of the 14th National People's Congress", '"The 14th National People\'s Congress Second Session Opens on March 5th, Lasting 7 Days"', '"Two Sessions World Eye | Observing the Important Window of People\'s Democracy in China\'s Entire Process"', 'Ministry of Foreign Affairs: Welcome the business communities of all countries to continue strengthening cooperation with China.', 'The first "Committee Channel" collective interview activity of the 14th National Committee of the Chinese People\'s Political Consultative Conference was held.', '"【100 Seconds of "Seeing" the Two Sessions】Arab Journalist Shows Off Chinese, Does Homework for Reporting on the Two Sessions"', '"World Watches Two Sessions: Foreign Media Views China\'s Two Sessions as a Vivid Display of Good Governance and 

In [9]:
# from transformers import pipeline

# def analyze_sentiment_and_print(translated_headlines):
#     sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
    
#     for language, headlines in translated_headlines.items():
#         # print(f"Language: {language}")
#         for headline in headlines:
#             sentiment = sentiment_pipeline(headline)
#             print(f"Language: {language}")
#             print(f"Headline: '{headline}'")
#             print(f"Label: {sentiment[0]['label']}, Score: {sentiment[0]['score']:.2f}")
#             print("---")  # Separator for readability
    

# # Assuming translated_headlines is your dictionary of translated headlines
# analyze_sentiment_and_print(translated_headlines)


In [10]:

# def analyze_sentiment_and_score_to_dataframe(translated_headlines):
#   sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
#   data = []

#   for language, headlines in translated_headlines.items():
#     # Count sentiment categories
#     positive_count = 0
#     negative_count = 0
#     neutral_count = 0

#     # Analyze each headline
#     for headline in headlines:
#       sentiment = sentiment_pipeline(headline)
#       label = sentiment[0]['label']
#       score = sentiment[0]['score']

#       # Assign custom score based on label and score

#       if label == 'negative':
#         if score > 0.7:
#           categ_score = -2
#         else:
#           categ_score = -1
#         negative_count += 1
#       elif label == 'positive':
#         if score > 0.7:
#           categ_score = 3
#         else:
#           categ_score = 2
#         positive_count += 1
#       else:
#         categ_score = 1
#         neutral_count += 1

#       # Create a dictionary for each headline and its analysis
#       result = {
#           'language': language,
#           "text": headline,
#           "label": label,
#           "score": score,
#           "categ_score": categ_score
#       }
#       data.append(result)

#     # Create DataFrame with sentiment counts as additional columns
#     df = pd.DataFrame(data)
#     df['positive_count'] = positive_count
#     df['negative_count'] = negative_count
#     df['neutral_count'] = neutral_count

#     # Print sentiment category counts (optional)
#     print(f"{language} headlines: {positive_count} positive, {negative_count} negative, {neutral_count} neutral")

#   return df

# # Assuming translated_headlines is your dictionary of translated headlines
# df = analyze_sentiment_and_score_to_dataframe(translated_headlines)
# print(df)

In [11]:
# def analyze_sentiment_and_score_to_dataframes(translated_headlines):
#   sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
#   chinese_data = []
#   arabic_data = []

#   for language, headlines in translated_headlines.items():
#     # Count sentiment categories
#     positive_count = 0
#     negative_count = 0
#     neutral_count = 0

#     # Analyze each headline
#     for headline in headlines:
#       sentiment = sentiment_pipeline(headline)
#       label = sentiment[0]['label']
#       score = sentiment[0]['score']

#       # Assign custom score based on label and score

#       if label == 'negative':
#         if score > 0.7:
#           categ_score = -2
#         else:
#           categ_score = -1
#         negative_count += 1
#       elif label == 'positive':
#         if score > 0.7:
#           categ_score = 3
#         else:
#           categ_score = 2
#         positive_count += 1
#       else:
#         categ_score = 1
#         neutral_count += 1

#       # Create a dictionary for each headline and its analysis
#       result = {
#           'language': language,
#           "text": headline,
#           "label": label,
#           "score": score,
#           "categ_score": categ_score
#       }

#       # Append data based on language
#       if language == 'chinese':
#         chinese_data.append(result)
#       else:
#         arabic_data.append(result)

#     # Create DataFrames and add sentiment counts
#     chinese_df = pd.DataFrame(chinese_data)
#     chinese_df['positive_count'] = positive_count
#     chinese_df['negative_count'] = negative_count
#     chinese_df['neutral_count'] = neutral_count

#     arabic_df = pd.DataFrame(arabic_data)
#     arabic_df['positive_count'] = positive_count
#     arabic_df['negative_count'] = negative_count
#     arabic_df['neutral_count'] = neutral_count

#     # Print sentiment category counts as formatted strings
#     print(f"{language} headlines: {positive_count} positive, {negative_count} negative, {neutral_count} neutral")

#   return chinese_df, arabic_df


# # Assuming translated_headlines is your dictionary of translated headlines
# chinese_df, arabic_df = analyze_sentiment_and_score_to_dataframes(translated_headlines)
# print(chinese_df)
# print(arabic_df)

In [12]:
print(chinese_df)

NameError: name 'chinese_df' is not defined

In [28]:
def analyze_sentiment_and_score(translated_headlines):
  sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
  data = []

  for language, headlines in translated_headlines.items():
    # Count sentiment categories
    positive = 0
    negative  = 0
    neutral = 0

    # Analyze each headline
    for headline in headlines:
      sentiment = sentiment_pipeline(headline)
      label = sentiment[0]['label']
      score = sentiment[0]['score']

      # Assign custom score based on label and score

      if label == 'negative':
        if score > 0.7:
          categ_score = -2
        else:
          categ_score = -1
        negative += 1
      elif label == 'positive':
        if score > 0.7:
          categ_score = 3
        else:
          categ_score = 2
        positive += 1
      else:
        categ_score = 1
        neutral += 1

      # Create a dictionary for each headline and its analysis
      result = {
          'language': language,
          "text": headline,
          "label": label,
          "score": score,
          "categ_score": categ_score
      }
      data.append(result)
    # print(data)
    return data

data = analyze_sentiment_and_score(translated_headlines)

In [29]:
type(data)

list

In [30]:
print(data[0])

{'language': 'chinese', 'text': '"Special Topic: Opening Ceremony of the Second Session of the 14th National Committee of the Chinese People\'s Political Consultative Conference"', 'label': 'neutral', 'score': 0.9023157954216003, 'categ_score': 1}


### Scoping issue with "print" command below: 

In [35]:
# def create_df_by_language(data):
#     # Separate data by language
#     chinese_data = [row for row in data if row['language'] == 'chinese']
#     arabic_data = [row for row in data if row['language'] == 'arabic']

#     # Create DataFrames and add sentiment counts
#     chinese_df = pd.DataFrame(chinese_data)
#     chinese_df['positive'] = len([row for row in chinese_data if row['label'] == 'positive'])
#     chinese_df['negative'] = len([row for row in chinese_data if row['label'] == 'negative'])
#     chinese_df['neutral'] = len([row for row in chinese_data if row['label'] == 'neutral'])

#     arabic_df = pd.DataFrame(arabic_data)
#     arabic_df['positive'] = len([row for row in arabic_data if row['label'] == 'positive'])
#     arabic_df['negative'] = len([row for row in arabic_data if row['label'] == 'negative'])
#     arabic_df['neutral'] = len([row for row in arabic_data if row['label'] == 'neutral'])

#     # Print sentiment category counts as formatted strings
#     print(f"Chinese headlines: {chinese_df['positive'].iloc[0]} positive, {chinese_df['negative'].iloc[0]} negative, {chinese_df['neutral'].iloc[0]} neutral")
#     print(f"Arabic headlines: {arabic_df['positive'].iloc[0]} positive, {arabic_df['negative'].iloc[0]} negative, {arabic_df['neutral'].iloc[0]} neutral")

#     return chinese_df, arabic_df

# chinese_df, arabic_df = create_df_by_language(data)
# print("DATAFRAME")
# # print(chinese_df)

In [36]:
def create_df_by_language(data):
    # Separate data by language
    chinese_data = [row for row in data if row['language'] == 'chinese']
    arabic_data = [row for row in data if row['language'] == 'arabic']

    # Create DataFrames and add sentiment counts
    chinese_df = pd.DataFrame(chinese_data)
    chinese_df['positive'] = len([row for row in chinese_data if row['label'] == 'positive'])
    chinese_df['negative'] = len([row for row in chinese_data if row['label'] == 'negative'])
    chinese_df['neutral'] = len([row for row in chinese_data if row['label'] == 'neutral'])

    arabic_df = pd.DataFrame(arabic_data)
    arabic_df['positive'] = len([row for row in arabic_data if row['label'] == 'positive'])
    arabic_df['negative'] = len([row for row in arabic_data if row['label'] == 'negative'])
    arabic_df['neutral'] = len([row for row in arabic_data if row['label'] == 'neutral'])

    # Return the DataFrames
    return chinese_df, arabic_df

# Call the function and assign the returned DataFrames
chinese_df, arabic_df = create_df_by_language(data)

# Print the DataFrames after they are created
print("CHINESE DATAFRAME:")
print(chinese_df)
print("ARABIC DATAFRAME:")
print(arabic_df)

CHINESE DATAFRAME:
  language                                               text     label  \
0  chinese  "Special Topic: Opening Ceremony of the Second...   neutral   
1  chinese  Special Topic: Press Conference of the Second ...   neutral   
2  chinese  "The 14th National People's Congress Second Se...   neutral   
3  chinese  "Two Sessions World Eye | Observing the Import...   neutral   
4  chinese  Ministry of Foreign Affairs: Welcome the busin...  positive   
5  chinese  The first "Committee Channel" collective inter...   neutral   
6  chinese  "【100 Seconds of "Seeing" the Two Sessions】Ara...   neutral   
7  chinese  "World Watches Two Sessions: Foreign Media Vie...  positive   
8  chinese  "Executive of Chinese Companies: Cooperation i...   neutral   
9  chinese  "China's Two Sessions Gather Global Attention ...  positive   

      score  categ_score  positive  negative  neutral  
0  0.902316            1         3         0        7  
1  0.883254            1         3         

In [32]:
print(chinese_df)

NameError: name 'chinese_df' is not defined

In [None]:
# from transformers import pipeline

# def analyze_sentiment_and_get_results(translated_headlines):
#   sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
#   results = []

#   for language, headlines in translated_headlines.items():
#     for headline in headlines:
#       sentiment = sentiment_pipeline(headline)
#       # Extract label and score

#       label = sentiment[0]['label']
#       score = sentiment[0]['score']

#     # return analyzed_headlines
        
#       # Create a dictionary for each headline and its analysis
#       result = {
#         'language': language,
#           "text": headline,
#           "label": label,
#           "score": score,
#       }
#       results.append(result)
#   return results

# # Assuming translated_headlines is your dictionary of translated headlines
# results = analyze_sentiment_and_get_results(translated_headlines)
# print(results)

In [None]:
# for row in results: 
#     if row['label'] == 'negative':
#         if row['score'] > 0.7:
#             row['categ_score'] = -2
#         else: 
#             row['categ_score'] = -1
#     elif row['label'] == 'positive':
#         if row['score'] > 0.7:
#             row['categ_score'] = 3
#         else:
#             row['categ_score'] = 2
#     else:
#         row['categ_score'] = 1
# print(results[2]) 

In [None]:
# import pandas as pd 

# chinese_data = []
# arabic_data = []

# # Iterate through the results and add elements to respective lists
# for item in results:
#   if item["language"] == "chinese":
#     chinese_data.append(item)
#   elif item["language"] == "arabic":
#     arabic_data.append(item)

# # Create DataFrames from lists
# chinese_df = pd.DataFrame(chinese_data)
# arabic_df = pd.DataFrame(arabic_data)



In [None]:
# # Print the DataFrames
# print("Chinese headlines:")
# print(chinese_df)
# print("\nArabic headlines:")
# print(arabic_df)


In [None]:
# chinese_df.to_csv("chinese.csv")

In [None]:
# arabic_df.to_csv("arabic.csv")

In [None]:
ls

In [None]:
arabic_df['label'].value_counts()

In [None]:
# from transformers import pipeline

# def analyze_sentiment_and_print(translated_headlines):
#     sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
    
#     for language, headlines in translated_headlines.items():
#         # print(f"Language: {language}")
#         for headline in headlines:
#             sentiment = sentiment_pipeline(headline)
#             print(f"Language: {language}")
#             print(f"Headline: '{headline}'")
#             print(f"Label: {sentiment[0]['label']}, Score: {sentiment[0]['score']:.2f}")
#             print("---")  # Separator for readability
#     # return analyzed_headlines
# # Assuming translated_headlines is your dictionary of translated headlines
# analyze_sentiment_and_print(translated_headlines)


In [None]:
# for row in results: 
#     if row['label'] == 'negative':
#         if row['score'] > 0.7:
#             row['categ_score'] = -2
#         else: 
#             row['categ_score'] = -1
#     elif row['label'] == 'positive':
#         if row['score'] > 0.7:
#             row['categ_score'] = 3
#         else:
#             row['categ_score'] = 2
#     else:
#         row['categ_score'] = 1
# print(results[2]) 

In [None]:
# import pandas as pd
# resultsdf = pd.DataFrame(results)
# resultsdf 

In [None]:
# What I'm going to work next: 
# Find free websites in German, French, Spanish, Russian
# Translate all the main headlines 
# Do a sentiment analysis on the headlines using a LLM
# calculate the avg and median scores for each language
#COMPARE the total score for each language's website 

In [None]:
#FIND PYTHON library to visualize DIRECTLY on jupyter notebook