### Translating and Comparing News Headlines: step by step  

#### Comparing headlines across languages is cumbersome; doing an automated sentiment analysis on a sampling used to be very complex before the advent of ChatGPT

In [1]:
import pathlib
# import textwrap
import openai
import os
import requests
import bs4
import google.generativeai as genai
from transformers import pipeline
import pandas as pd

from IPython.display import display
from IPython.display import Markdown
from dotenv import load_dotenv

# def to_markdown(text):
#   text = text.replace('•', '  *')
#   return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [2]:
# import openai
# import os
# import requests
# import bs4

In [3]:
load_dotenv()
api_key = os.environ.get("API_KEY")

In [4]:
openai.api_key = os.getenv("API_KEY")

In [5]:
news_sites = {
    "chinese" : ("https://cn.chinadaily.com.cn", "div.Home_content_Item_Text h1 a"),
    "arabic": ("https://aljazeera.net", "h3.article-card__title")
}
print(news_sites["arabic"])

('https://aljazeera.net', 'h3.article-card__title')


In [6]:
def fetch_headlines():
    headlines_by_language = {}
    for language, (url, tag) in news_sites.items():
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        headlines = [h.getText() for h in soup.select(tag)[:10]]
        headlines_by_language[language] = headlines
    return headlines_by_language
fetch_headlines()


{'chinese': ['《中国日报》专访最高人民法院院长张军：促进提升治理效能 以公权力为网暴受害者撑腰',
  '《中国日报》专访最高人民检察院检察长应勇：坚持电信网络诈骗犯罪"打防管控"一体推进 深化国际执法司法合作',
  '甘肃省委书记、省人大常委会主任胡昌升：以一流营商环境支撑经济社会高质量发展',
  '刘结一：应推动谈判解决地区冲突，不要火上浇油',
  '涉及教育、就业、社会保障等民生问题 这场记者会事关你我',
  '刘结一：美国应兑现在中美关系上的承诺',
  '中科院院士刘忠范：企业必须是创新的主体',
  '全国政协委员贺晗：加快推进大模型技术与传统产业深度融合',
  '最高检：公益诉讼检察办案效果凸显 清理生产类固体废物528.9万吨',
  '民生主题记者会丨人社部：社保卡将逐步实现“全国一卡通”'],
 'arabic': ['الحرب على غزة.. قصف مكثف وسط القطاع والقسام تغنم طائرتي “كواد كابتر”',
  'الاحتلال يقر بمقتل ضابط بمعارك غزة والقسام تعلن السيطرة على مسيرتين',
  'الاحتلال يقر بمقتل ضابط بمعارك غزة والقسام تعلن السيطرة على مسيرتين',
  'الشرطة تعتقل 10 بمظاهرات إسرائيلية حاشدة تطالب بصفقة تبادل ورحيل نتنياهو',
  'الموساد و"سي آي إيه" يبحثان جهود التوصل لصفقة مع حماس',
  'تفاصيل عن تصفية 50 مدنيا.. الجزيرة تكشف هوية مسن أعزل أعدمه الاحتلال ...',
  'شابة ورضيعة تلتحقان بشهداء الجوع في قطاع غزة',
  'هنية يناشد العرب والمسلمين سرعة التحرك لإغاثة غزة وعزل إسرائيل',
  'مبارزة انتخابية بين بايدن وترامب في ولاية الحسم',
  'السلطة 

In [7]:
def translate_headlines(headlines_by_language):
    translated_headlines = {}
    for language, headlines in headlines_by_language.items():
        translated_headlines[language] = []
        for headline in headlines:
            response = openai.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=f"Translate the following {language} headline into English: \"{headline}\"",
                temperature=0.3,
                max_tokens=60
            )
            translated_headline = response.choices[0].text.strip()
            translated_headlines[language].append(translated_headline)
    return translated_headlines
headlines_by_language = fetch_headlines()
translated_headlines = translate_headlines(headlines_by_language)

In [8]:
print(translated_headlines)

{'chinese': ['"China Daily Interviews Supreme People\'s Court President Zhang Jun: Promoting Improved Governance Efficiency and Supporting Victims of Online Bullying with State Power"', '"China Daily Interviews Procurator-General of Supreme People\'s Procuratorate, Ying Yong: Insisting on Combating and Controlling Telecommunications and Internet Fraud, Deepening International Law Enforcement and Judicial Cooperation"', '"Gansu Provincial Party Secretary and Chairman of the Provincial People\'s Congress Standing Committee, Hu Changsheng: Supporting High-Quality Economic and Social Development with First-Class Business Environment"', 'Liu Jieyi: Negotiations should be promoted to resolve regional conflicts, not add fuel to the fire.', '"Press Conference Concerning Livelihood Issues Involving Education, Employment, and Social Security Affects You and Me"', 'Liu Jieyi: The United States should fulfill its commitments in US-China relations', '"Academician Liu Zhongfan of Chinese Academy of 

In [9]:
# def analyze_sentiment_and_score_to_dataframes(translated_headlines):
#   sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
#   chinese_data = []
#   arabic_data = []

#   for language, headlines in translated_headlines.items():
#     # Count sentiment categories
#     positive_count = 0
#     negative_count = 0
#     neutral_count = 0

#     # Analyze each headline
#     for headline in headlines:
#       sentiment = sentiment_pipeline(headline)
#       label = sentiment[0]['label']
#       score = sentiment[0]['score']

#       # Assign custom score based on label and score

#       if label == 'negative':
#         if score > 0.7:
#           categ_score = -2
#         else:
#           categ_score = -1
#         negative_count += 1
#       elif label == 'positive':
#         if score > 0.7:
#           categ_score = 3
#         else:
#           categ_score = 2
#         positive_count += 1
#       else:
#         categ_score = 1
#         neutral_count += 1

#       # Create a dictionary for each headline and its analysis
#       result = {
#           'language': language,
#           "text": headline,
#           "label": label,
#           "score": score,
#           "categ_score": categ_score
#       }

#       # Append data based on language
#       if language == 'chinese':
#         chinese_data.append(result)
#       else:
#         arabic_data.append(result)

#     # Create DataFrames and add sentiment counts
#     chinese_df = pd.DataFrame(chinese_data)
#     chinese_df['positive_count'] = positive_count
#     chinese_df['negative_count'] = negative_count
#     chinese_df['neutral_count'] = neutral_count

#     arabic_df = pd.DataFrame(arabic_data)
#     arabic_df['positive_count'] = positive_count
#     arabic_df['negative_count'] = negative_count
#     arabic_df['neutral_count'] = neutral_count

#     # Print sentiment category counts as formatted strings
#     print(f"{language} headlines: {positive_count} positive, {negative_count} negative, {neutral_count} neutral")

#   return chinese_df, arabic_df


# # Assuming translated_headlines is your dictionary of translated headlines
# chinese_df, arabic_df = analyze_sentiment_and_score_to_dataframes(translated_headlines)
# print(chinese_df)
# print(arabic_df)

In [10]:
# print(chinese_df)

In [11]:
def analyze_sentiment_and_score(translated_headlines):
  sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
  data = []

  for language, headlines in translated_headlines.items():
    # Count sentiment categories
    positive = 0
    negative  = 0
    neutral = 0

    # Analyze each headline
    for headline in headlines:
      sentiment = sentiment_pipeline(headline)
      label = sentiment[0]['label']
      score = sentiment[0]['score']

      # Assign custom score based on label and score

      if label == 'negative':
        if score > 0.7:
          categ_score = -2
        else:
          categ_score = -1
        negative += 1
      elif label == 'positive':
        if score > 0.7:
          categ_score = 3
        else:
          categ_score = 2
        positive += 1
      else:
        categ_score = 1
        neutral += 1

      # Create a dictionary for each headline and its analysis
      result = {
          'language': language,
          "text": headline,
          "label": label,
          "score": score,
          "categ_score": categ_score
      }
      data.append(result)
    # print(data)
    return data

data = analyze_sentiment_and_score(translated_headlines)

In [12]:
type(data)

list

In [13]:
print(data[0])

{'language': 'chinese', 'text': '"China Daily Interviews Supreme People\'s Court President Zhang Jun: Promoting Improved Governance Efficiency and Supporting Victims of Online Bullying with State Power"', 'label': 'neutral', 'score': 0.5576910376548767, 'categ_score': 1}


### Scoping issue with "print" command below: 

In [14]:
# def create_df_by_language(data):
#     # Separate data by language
#     chinese_data = [row for row in data if row['language'] == 'chinese']
#     arabic_data = [row for row in data if row['language'] == 'arabic']

#     # Create DataFrames and add sentiment counts
#     chinese_df = pd.DataFrame(chinese_data)
#     chinese_df['positive'] = len([row for row in chinese_data if row['label'] == 'positive'])
#     chinese_df['negative'] = len([row for row in chinese_data if row['label'] == 'negative'])
#     chinese_df['neutral'] = len([row for row in chinese_data if row['label'] == 'neutral'])

#     arabic_df = pd.DataFrame(arabic_data)
#     arabic_df['positive'] = len([row for row in arabic_data if row['label'] == 'positive'])
#     arabic_df['negative'] = len([row for row in arabic_data if row['label'] == 'negative'])
#     arabic_df['neutral'] = len([row for row in arabic_data if row['label'] == 'neutral'])

#     # Print sentiment category counts as formatted strings
#     print(f"Chinese headlines: {chinese_df['positive'].iloc[0]} positive, {chinese_df['negative'].iloc[0]} negative, {chinese_df['neutral'].iloc[0]} neutral")
#     print(f"Arabic headlines: {arabic_df['positive'].iloc[0]} positive, {arabic_df['negative'].iloc[0]} negative, {arabic_df['neutral'].iloc[0]} neutral")

#     return chinese_df, arabic_df

# chinese_df, arabic_df = create_df_by_language(data)
# print("DATAFRAME")
# # print(chinese_df)

In [15]:
def create_df_by_language(data):
    # Separate data by language
    chinese_data = [row for row in data if row['language'] == 'chinese']
    arabic_data = [row for row in data if row['language'] == 'arabic']

    # Create DataFrames and add sentiment counts
    chinese_df = pd.DataFrame(chinese_data)
    chinese_df['positive'] = len([row for row in chinese_data if row['label'] == 'positive'])
    chinese_df['negative'] = len([row for row in chinese_data if row['label'] == 'negative'])
    chinese_df['neutral'] = len([row for row in chinese_data if row['label'] == 'neutral'])

    arabic_df = pd.DataFrame(arabic_data)
    arabic_df['positive'] = len([row for row in arabic_data if row['label'] == 'positive'])
    arabic_df['negative'] = len([row for row in arabic_data if row['label'] == 'negative'])
    arabic_df['neutral'] = len([row for row in arabic_data if row['label'] == 'neutral'])

    # Return the DataFrames
    return chinese_df, arabic_df

# Call the function and assign the returned DataFrames
chinese_df, arabic_df = create_df_by_language(data)

# Print the DataFrames after they are created
print("CHINESE DATAFRAME:")
print(chinese_df)
print("ARABIC DATAFRAME:")
print(arabic_df)

CHINESE DATAFRAME:
  language                                               text     label  \
0  chinese  "China Daily Interviews Supreme People's Court...   neutral   
1  chinese  "China Daily Interviews Procurator-General of ...   neutral   
2  chinese  "Gansu Provincial Party Secretary and Chairman...  positive   
3  chinese  Liu Jieyi: Negotiations should be promoted to ...   neutral   
4  chinese  "Press Conference Concerning Livelihood Issues...   neutral   
5  chinese  Liu Jieyi: The United States should fulfill it...   neutral   
6  chinese  "Academician Liu Zhongfan of Chinese Academy o...   neutral   
7  chinese  "National Committee Member of the Chinese Peop...   neutral   
8  chinese  "Supreme Procuratorate: Outstanding Results in...  positive   
9  chinese  "Ministry of Human Resources and Social Securi...   neutral   

      score  categ_score  positive  negative  neutral  
0  0.557691            1         2         0        8  
1  0.595754            1         2         

In [16]:
print(chinese_df)

  language                                               text     label  \
0  chinese  "China Daily Interviews Supreme People's Court...   neutral   
1  chinese  "China Daily Interviews Procurator-General of ...   neutral   
2  chinese  "Gansu Provincial Party Secretary and Chairman...  positive   
3  chinese  Liu Jieyi: Negotiations should be promoted to ...   neutral   
4  chinese  "Press Conference Concerning Livelihood Issues...   neutral   
5  chinese  Liu Jieyi: The United States should fulfill it...   neutral   
6  chinese  "Academician Liu Zhongfan of Chinese Academy o...   neutral   
7  chinese  "National Committee Member of the Chinese Peop...   neutral   
8  chinese  "Supreme Procuratorate: Outstanding Results in...  positive   
9  chinese  "Ministry of Human Resources and Social Securi...   neutral   

      score  categ_score  positive  negative  neutral  
0  0.557691            1         2         0        8  
1  0.595754            1         2         0        8  
2  0.5

In [17]:
chinese_df.to_csv("chinese.csv")

In [18]:
arabic_df.to_csv("arabic.csv")

In [19]:
ls


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
GPTdf-bylang_count_totals.ipynb  README.md
GPTdf-bylanguage.ipynb           action.yml
GPTdf-gemini.ipynb               arabic.csv
GPTdf-manual.ipynb               chinese.csv
GPTdf-russian.ipynb              clean_GPTscrape.ipynb
GPTscrape.ipynb                  home.html
GPTtrans-sentiment.ipynb         index.html
GPTtrans-sentiment.qmd           old_index.html
[1m[36mGPTtrans-sentiment_files[m[m/        results.csv
GPTtranslate.ipynb               russian.csv
GPTviz-gemini.ipynb              wider_GPTscrape.ipynb
GPTviz-mistral.ipynb


In [20]:
# from transformers import pipeline

# def analyze_sentiment_and_get_results(translated_headlines):
#   sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
#   results = []

#   for language, headlines in translated_headlines.items():
#     for headline in headlines:
#       sentiment = sentiment_pipeline(headline)
#       # Extract label and score

#       label = sentiment[0]['label']
#       score = sentiment[0]['score']

#     # return analyzed_headlines
        
#       # Create a dictionary for each headline and its analysis
#       result = {
#         'language': language,
#           "text": headline,
#           "label": label,
#           "score": score,
#       }
#       results.append(result)
#   return results

# # Assuming translated_headlines is your dictionary of translated headlines
# results = analyze_sentiment_and_get_results(translated_headlines)
# print(results)

In [21]:
# for row in results: 
#     if row['label'] == 'negative':
#         if row['score'] > 0.7:
#             row['categ_score'] = -2
#         else: 
#             row['categ_score'] = -1
#     elif row['label'] == 'positive':
#         if row['score'] > 0.7:
#             row['categ_score'] = 3
#         else:
#             row['categ_score'] = 2
#     else:
#         row['categ_score'] = 1
# print(results[2]) 

In [22]:
# import pandas as pd 

# chinese_data = []
# arabic_data = []

# # Iterate through the results and add elements to respective lists
# for item in results:
#   if item["language"] == "chinese":
#     chinese_data.append(item)
#   elif item["language"] == "arabic":
#     arabic_data.append(item)

# # Create DataFrames from lists
# chinese_df = pd.DataFrame(chinese_data)
# arabic_df = pd.DataFrame(arabic_data)



In [23]:
# # Print the DataFrames
# print("Chinese headlines:")
# print(chinese_df)
# print("\nArabic headlines:")
# print(arabic_df)


In [24]:
# chinese_df.to_csv("chinese.csv")

In [25]:
# arabic_df.to_csv("arabic.csv")

In [26]:
ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
GPTdf-bylang_count_totals.ipynb  README.md
GPTdf-bylanguage.ipynb           action.yml
GPTdf-gemini.ipynb               arabic.csv
GPTdf-manual.ipynb               chinese.csv
GPTdf-russian.ipynb              clean_GPTscrape.ipynb
GPTscrape.ipynb                  home.html
GPTtrans-sentiment.ipynb         index.html
GPTtrans-sentiment.qmd           old_index.html
[1m[36mGPTtrans-sentiment_files[m[m/        results.csv
GPTtranslate.ipynb               russian.csv
GPTviz-gemini.ipynb              wider_GPTscrape.ipynb
GPTviz-mistral.ipynb


In [27]:
# arabic_df['label'].value_counts()

KeyError: 'label'

In [None]:
# from transformers import pipeline

# def analyze_sentiment_and_print(translated_headlines):
#     sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
    
#     for language, headlines in translated_headlines.items():
#         # print(f"Language: {language}")
#         for headline in headlines:
#             sentiment = sentiment_pipeline(headline)
#             print(f"Language: {language}")
#             print(f"Headline: '{headline}'")
#             print(f"Label: {sentiment[0]['label']}, Score: {sentiment[0]['score']:.2f}")
#             print("---")  # Separator for readability
#     # return analyzed_headlines
# # Assuming translated_headlines is your dictionary of translated headlines
# analyze_sentiment_and_print(translated_headlines)


In [None]:
# for row in results: 
#     if row['label'] == 'negative':
#         if row['score'] > 0.7:
#             row['categ_score'] = -2
#         else: 
#             row['categ_score'] = -1
#     elif row['label'] == 'positive':
#         if row['score'] > 0.7:
#             row['categ_score'] = 3
#         else:
#             row['categ_score'] = 2
#     else:
#         row['categ_score'] = 1
# print(results[2]) 

In [None]:
# import pandas as pd
# resultsdf = pd.DataFrame(results)
# resultsdf 

In [None]:
# What I'm going to work next: 
# Find free websites in German, French, Spanish, Russian
# Translate all the main headlines 
# Do a sentiment analysis on the headlines using a LLM
# calculate the avg and median scores for each language
#COMPARE the total score for each language's website 

In [None]:
#FIND PYTHON library to visualize DIRECTLY on jupyter notebook