In [1]:
from app.models import Session, Headline, Article, Agency
import pandas as pd
from datetime import datetime as dt, timedelta as td

with Session() as s:
    data = s.query(Headline.processed, Headline.first_accessed, Article.url, Agency.name, Agency._bias)\
        .join(Headline.article).join(Article.agency)\
        .filter(Headline.first_accessed > dt.now() - td(hours=12)).all()
df = pd.DataFrame(data, columns=['title', 'date', 'url', 'agency', 'bias'])
df.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\malan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\malan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\malan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\malan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\malan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,date,url,agency,bias
0,Video captures moment of Israeli bombing in no...,2024-04-23 05:00:02.868398,https://www.aljazeera.com/news/liveblog/2024/4...,Al Jazeera,-1
1,Chaotic Scene As NYPD Descends Upon NYU in Rio...,2024-04-23 05:00:02.868398,https://redstate.com/bobhoge/2024/04/23/nypd-s...,Red State,3
2,Starlink highlights economic security challeng...,2024-04-23 05:00:02.868398,https://www.japantimes.co.jp/commentary/2024/0...,The Japan Times,0
3,Mark Hamill has eyebrow-raising opinion on 'th...,2024-04-23 05:00:02.868398,https://www.theblaze.com/news/mark-hamill-best...,The Blaze,3
4,"Pierre Hermé, French chocolatier: 'Learning to...",2024-04-23 05:00:02.868398,https://www.lemonde.fr/en/gastronomie/article/...,Le Monde,-1


In [2]:
# create separate dataframes for each individual day
dfs = [group for _, group in df.groupby(df['date'].dt.date)]
dfs[0].head()

Unnamed: 0,title,date,url,agency,bias
0,Video captures moment of Israeli bombing in no...,2024-04-23 05:00:02.868398,https://www.aljazeera.com/news/liveblog/2024/4...,Al Jazeera,-1
1,Chaotic Scene As NYPD Descends Upon NYU in Rio...,2024-04-23 05:00:02.868398,https://redstate.com/bobhoge/2024/04/23/nypd-s...,Red State,3
2,Starlink highlights economic security challeng...,2024-04-23 05:00:02.868398,https://www.japantimes.co.jp/commentary/2024/0...,The Japan Times,0
3,Mark Hamill has eyebrow-raising opinion on 'th...,2024-04-23 05:00:02.868398,https://www.theblaze.com/news/mark-hamill-best...,The Blaze,3
4,"Pierre Hermé, French chocolatier: 'Learning to...",2024-04-23 05:00:02.868398,https://www.lemonde.fr/en/gastronomie/article/...,Le Monde,-1


from app.analysis.clustering import label_clusters, form_clusters, prepare_cosine

for df in dfs:
    print("Processing", df['date'].iloc[0])
    df = label_clusters(df, form_clusters(prepare_cosine(df['title']), threshold=0.5, min_samples=5))
    # merge all headlines in the same cluster
    date = df['date'].iloc[0]
    df = df[['title', 'cluster']]
    df = df.groupby('cluster').agg({'title': ' '.join}).reset_index()
    df.to_csv(date.strftime('%Y-%m-%d.csv'), index=False)
dfs[0].head()

from transformers import pipeline
summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
summaries = []
for df in dfs:
    for key, group in df.groupby('cluster'):
        text = '\n'.join(group['title'])[:1024]
        summaries.append([key, text, summarizer(text, min_length=10, max_length=50, do_sample=True)[0]['summary_text']])
df = pd.DataFrame(summaries, columns=['cluster', 'text', 'summary'])
df.to_csv('summaries.csv', index=False)

In [3]:
df = pd.read_csv('summaries.csv')
df.head()

Unnamed: 0,cluster,text,summary
0,0,European stocks head for positive open despite...,European stocks head for positive open despite...
1,1,Mookie Betts and the bullpen lead Dodgers past...,Mookie Betts tied his career high with five hi...
2,2,Former Senator and Florida Governor Bob Graham...,Former U.S. Sen. and two-term Florida Gov. Bob...
3,3,4 big takeaways from Day 2 of Trump's hush mon...,First 7 jurors for Trump's hush money trial ha...
4,4,Australia's Great Barrier Reef experiencing wo...,Australia's Great Barrier Reef experiencing wo...


In [4]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)

train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
valid_dataset = train_test_split['test']

In [5]:
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

def preprocess_function(examples):
    inputs = examples["text"]
    targets = examples["summary"]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Prepare labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = [label if label != tokenizer.pad_token_id else -100 for label in labels["input_ids"]]
    return model_inputs

# Apply the preprocessing function
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=['cluster', 'text', 'summary'])
valid_dataset = valid_dataset.map(preprocess_function, batched=True, remove_columns=['cluster', 'text', 'summary'])



Map:   0%|          | 0/69 [00:00<?, ? examples/s]



Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [6]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 69
})

In [7]:
from transformers import BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

In [8]:
import torch

# Check if CUDA is available
print("CUDA available:", torch.cuda.is_available())

# List available CUDA devices
if torch.cuda.is_available():
    print("List of CUDA devices:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")


CUDA available: True
List of CUDA devices: 1
Device 0: NVIDIA GeForce RTX 3080


In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=10,
    save_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset           # evaluation dataset
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [10]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss
10,9.7991,9.027433
20,8.8564,8.129132
30,8.1052,7.168039
40,6.5974,5.579607
50,4.997,3.423472
60,3.0547,1.866109
70,1.8515,1.139357
80,1.174,0.717298
90,0.7857,0.501656
100,0.5652,0.374226


TrainOutput(global_step=105, training_loss=4.381633009229388, metrics={'train_runtime': 166.647, 'train_samples_per_second': 1.242, 'train_steps_per_second': 0.63, 'total_flos': 448590652637184.0, 'train_loss': 4.381633009229388, 'epoch': 3.0})

In [11]:
model.save_pretrained('./fine_tuned_bart')
tokenizer.save_pretrained('./fine_tuned_bart')

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./fine_tuned_bart\\tokenizer_config.json',
 './fine_tuned_bart\\special_tokens_map.json',
 './fine_tuned_bart\\vocab.json',
 './fine_tuned_bart\\merges.txt',
 './fine_tuned_bart\\added_tokens.json')

In [12]:
# evaluate
trainer.evaluate()

In [13]:
results = trainer.evaluate()

In [14]:
results

{'eval_loss': 0.3347630798816681,
 'eval_runtime': 3.246,
 'eval_samples_per_second': 2.465,
 'eval_steps_per_second': 1.232,
 'epoch': 3.0}