In [1]:
import pandas as pd
import numpy as np

from transformers import pipeline

In [2]:
df = pd.read_csv('./shortened_news.csv')

display(df.head())

print("Columns in the DataFrame:", df.columns.tolist())

Unnamed: 0,source,article_url,title,date,shortened_full_text
0,BBC,https://www.bbc.com/news/articles/c9vkxvk91gjo,Owner and architect of Turkey quake collapse h...,3 days ago,A court in Turkey has sentenced the owner and ...
1,BBC,https://www.bbc.com/news/articles/clyvzgjpewgo,Twelve dead in Turkey ammunition factory blast,4 days ago,An explosion at an ammunition and explosives f...
2,BBC,https://www.bbc.com/news/videos/ckgnzg5e9y3o,Watch: Cargo ship tips on its side in Turkey,4 days ago,A ship docked at a port in Istanbul rolled on ...
3,BBC,https://www.bbc.com/news/articles/cvgx3r4nd2mo,'Danger of IS resurgence has doubled' - Syria...,8 days ago,"The risk of an IS resurgence is heightened, he..."
4,BBC,https://www.bbc.com/news/articles/c4gp8kx6r8yo,Trio tackle bike ride for Turkey lift shaft vi...,13 Dec 2024,The brother and friends of a man who was found...


Columns in the DataFrame: ['source', 'article_url', 'title', 'date', 'shortened_full_text']


In [3]:
# We create a new column called 'combined_text' by concatenating the
# 'title' and 'shortened_full_text' columns.
df['combined_text'] = df['title'].astype(str) + " " + df['shortened_full_text'].astype(str)

In [6]:
# We use the "sentiment-analysis" pipeline provided by Hugging Face Transformers.
# It uses a pretrained model (e.g., "distilbert-base-uncased-finetuned-sst-2-english")
# under the hood, which is well-suited for simple sentiment classification tasks.
# We specify 'truncation=True' and 'max_length=512' to ensure that any text 
# longer than 512 tokens is truncated to avoid runtime errors.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    tokenizer="distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True,
    max_length=512
)



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [7]:
# We create a new column called 'sentiment_result' to store the sentiment
# (i.e., 'POSITIVE' or 'NEGATIVE') and the confidence score.
# The pipeline returns a list of dictionaries with keys like 'label' and 'score'.
sentiments = []
for text in df['combined_text']:
    result = sentiment_pipeline(text)[0]  # The pipeline returns a list of predictions
    sentiments.append(result)

# Convert the list of dictionaries into separate columns in the DataFrame
df['sentiment_label'] = [res['label'] for res in sentiments]
df['sentiment_score'] = [res['score'] for res in sentiments]

In [8]:
# Let's see how many entries are classified as POSITIVE vs. NEGATIVE.
sentiment_counts = df['sentiment_label'].value_counts()
print("Sentiment distribution:\n", sentiment_counts)

# You may also want to compute some basic statistics of the confidence scores.
print("\nAverage sentiment score:", df['sentiment_score'].mean())
print("Max sentiment score:", df['sentiment_score'].max())
print("Min sentiment score:", df['sentiment_score'].min())

Sentiment distribution:
 sentiment_label
NEGATIVE    1608
POSITIVE     644
Name: count, dtype: int64

Average sentiment score: 0.9413357643843968
Max sentiment score: 0.9997830986976624
Min sentiment score: 0.5006784796714783


In [9]:
# We'll display a sample of the DataFrame to observe the newly added columns.
display(df[['source', 'article_url', 'title', 'date', 'shortened_full_text', 
            'combined_text', 'sentiment_label', 'sentiment_score']].head(10))

Unnamed: 0,source,article_url,title,date,shortened_full_text,combined_text,sentiment_label,sentiment_score
0,BBC,https://www.bbc.com/news/articles/c9vkxvk91gjo,Owner and architect of Turkey quake collapse h...,3 days ago,A court in Turkey has sentenced the owner and ...,Owner and architect of Turkey quake collapse h...,NEGATIVE,0.997099
1,BBC,https://www.bbc.com/news/articles/clyvzgjpewgo,Twelve dead in Turkey ammunition factory blast,4 days ago,An explosion at an ammunition and explosives f...,Twelve dead in Turkey ammunition factory blast...,NEGATIVE,0.985122
2,BBC,https://www.bbc.com/news/videos/ckgnzg5e9y3o,Watch: Cargo ship tips on its side in Turkey,4 days ago,A ship docked at a port in Istanbul rolled on ...,Watch: Cargo ship tips on its side in Turkey A...,NEGATIVE,0.983483
3,BBC,https://www.bbc.com/news/articles/cvgx3r4nd2mo,'Danger of IS resurgence has doubled' - Syria...,8 days ago,"The risk of an IS resurgence is heightened, he...",'Danger of IS resurgence has doubled' - Syria...,NEGATIVE,0.584935
4,BBC,https://www.bbc.com/news/articles/c4gp8kx6r8yo,Trio tackle bike ride for Turkey lift shaft vi...,13 Dec 2024,The brother and friends of a man who was found...,Trio tackle bike ride for Turkey lift shaft vi...,NEGATIVE,0.972911
5,BBC,https://www.bbc.com/news/articles/cvgr7v1evvgo,Ethiopia and Somalia agree to end bitter Somal...,12 Dec 2024,Ethiopia and Somalia have agreed to end their ...,Ethiopia and Somalia agree to end bitter Somal...,POSITIVE,0.997438
6,BBC,https://www.bbc.com/news/articles/cd75e8gdy9jo,The global players in Syria before and after A...,10 Dec 2024,Those with a vested interest in the conflict a...,The global players in Syria before and after A...,NEGATIVE,0.975248
7,BBC,https://www.bbc.com/news/articles/cvg6eeg87lqo,Turkey's 3m Syrian refugees face big decision ...,10 Dec 2024,Syrian refugees have been celebrating the fall...,Turkey's 3m Syrian refugees face big decision ...,NEGATIVE,0.959378
8,BBC,https://www.bbc.com/news/articles/cvg6kd41196o,'We're having to investigate our grandson's de...,7 Dec 2024,But less than 36 hours after landing in Turkey...,'We're having to investigate our grandson's de...,NEGATIVE,0.997697
9,BBC,https://www.bbc.com/news/articles/cdr0dvm7rpgo,Man was drinking before lift shaft fall - police,6 Dec 2024,A British tourist found in a lift shaft in a T...,Man was drinking before lift shaft fall - poli...,NEGATIVE,0.998349


In [10]:
df.to_csv('sentiment_analysis_results.csv', index=False)

In [11]:
# ---------------------------------------------------------
# Balancing the Data and Saving to a New CSV
#
# Below is an example of how you can take your DataFrame (df) 
# which contains imbalanced classes (NEGATIVE: 1608, POSITIVE: 644),
# and create a balanced sample of exactly 1000 rows in total:
# 500 NEGATIVE and 500 POSITIVE.
#
# Note: 
# - We assume that you have enough NEGATIVE and POSITIVE rows 
#   to achieve the desired split (500 of each).
# - If POSITIVE class does not have at least 500 rows, you 
#   will need to adjust the sample size accordingly.
# ---------------------------------------------------------

import pandas as pd

# Filter the DataFrame by each sentiment class
df_negative = df[df['sentiment_label'] == 'NEGATIVE']
df_positive = df[df['sentiment_label'] == 'POSITIVE']

# Randomly sample 500 rows from each class
# Adjust the random_state to any integer for reproducibility
df_negative_500 = df_negative.sample(500, random_state=42)
df_positive_500 = df_positive.sample(500, random_state=42)

# Concatenate the two subsets into a single balanced DataFrame
df_balanced = pd.concat([df_negative_500, df_positive_500])

# Shuffle the resulting DataFrame to interleave classes
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced DataFrame to a new CSV
df_balanced.to_csv('balanced_data_1000.csv', index=False)

# 6. Quick verification
print("NEGATIVE count in df_balanced:", sum(df_balanced['sentiment_label'] == 'NEGATIVE'))
print("POSITIVE count in df_balanced:", sum(df_balanced['sentiment_label'] == 'POSITIVE'))
print("Total rows:", len(df_balanced))

NEGATIVE count in df_balanced: 500
POSITIVE count in df_balanced: 500
Total rows: 1000
