In [1]:
from app.models import Session, Headline, Article, Agency
import pandas as pd
from datetime import datetime as dt, timedelta as td

# Fetch every headline first accessed within the last 12 hours, along with
# the article URL and the agency's name and bias, then tabulate the rows.
window = td(hours=12)
with Session() as session:
    recent_headlines = (
        session.query(
            Headline.processed,
            Headline.first_accessed,
            Article.url,
            Agency.name,
            Agency._bias,
        )
        .join(Headline.article)
        .join(Article.agency)
        .filter(Headline.first_accessed > dt.now() - window)
    )
    data = recent_headlines.all()
df = pd.DataFrame(data, columns=['title', 'date', 'url', 'agency', 'bias'])
df.head()

In [2]:
# Partition the headlines by the calendar date of their 'date' column,
# collecting one DataFrame per day.
dfs = []
for _, day_frame in df.groupby(df['date'].dt.date):
    dfs.append(day_frame)
dfs[0].head()

from app.analysis.clustering import label_clusters, form_clusters, prepare_cosine

# Cluster each day's headlines and write one CSV per day containing the
# headlines of every cluster merged into a single string.
#
# The labelled frame is stored back into `dfs`: rebinding the loop variable
# alone does not update the list, and the summarisation step further down
# groups each frame in `dfs` by its 'cluster' column. Distinct local names
# are used so the notebook-level `df` is not overwritten by the loop.
for position, day_df in enumerate(dfs):
    print("Processing", day_df['date'].iloc[0])
    clusters = form_clusters(prepare_cosine(day_df['title']), threshold=0.5, min_samples=5)
    # NOTE(review): assumes label_clusters returns a frame that keeps the
    # 'title' and 'date' columns and adds 'cluster' -- confirm against
    # app.analysis.clustering.
    labelled = label_clusters(day_df, clusters)
    dfs[position] = labelled
    # merge all headlines in the same cluster
    date = labelled['date'].iloc[0]
    merged = (
        labelled[['title', 'cluster']]
        .groupby('cluster')
        .agg({'title': ' '.join})
        .reset_index()
    )
    merged.to_csv(date.strftime('%Y-%m-%d.csv'), index=False)
dfs[0].head()

from transformers import pipeline
# Summarise every cluster of every day with the pretrained BART summariser
# and save (cluster, text, summary) rows to 'summaries.csv'.
summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
summaries = []
for day_df in dfs:
    for cluster_id, members in day_df.groupby('cluster'):
        # One headline per line; only the first 1024 characters are kept.
        text = '\n'.join(members['title'])[:1024]
        generated = summarizer(text, min_length=10, max_length=50, do_sample=True)
        summaries.append([cluster_id, text, generated[0]['summary_text']])
df = pd.DataFrame(summaries, columns=['cluster', 'text', 'summary'])
df.to_csv('summaries.csv', index=False)

In [3]:
# Reload the summaries from 'summaries.csv' (the file written by the
# summarisation step) and preview the first rows.
df = pd.read_csv('summaries.csv')
df.head()

In [4]:
from datasets import Dataset

# Wrap the summaries frame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Hold out 10% of the rows for validation. The split shuffles the rows, so a
# fixed seed keeps the train/validation partition identical across reruns.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
valid_dataset = train_test_split['test']

In [5]:
from transformers import BartTokenizer

# Tokenizer for 'facebook/bart-large-cnn', the same checkpoint name that is
# loaded as the model later in this notebook.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

def preprocess_function(examples):
    """Tokenise (text, summary) pairs into model inputs and labels.

    Args:
        examples: mapping with a "text" and a "summary" entry. When called
            through ``Dataset.map(..., batched=True)`` each entry is a list
            of strings; a single string per entry is also accepted.

    Returns:
        The tokenizer output for the texts (truncated/padded to 1024 tokens)
        with an added "labels" entry holding the summary token ids
        (truncated/padded to 128 tokens), in which every padding token id is
        replaced by -100 so those positions are ignored by the loss.
    """
    inputs = examples["text"]
    targets = examples["summary"]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Prepare labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def mask_padding(sequence):
        # Swap each padding token for -100, leaving real tokens untouched.
        return [token_id if token_id != pad_id else -100 for token_id in sequence]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example, so mask token by token.
        # Comparing a whole sequence to pad_id would never match and would
        # leave all padding in the labels.
        model_inputs["labels"] = [mask_padding(sequence) for sequence in label_ids]
    else:
        # Unbatched call: a single flat sequence of ids.
        model_inputs["labels"] = mask_padding(label_ids)
    return model_inputs

# Tokenise both splits in batches and drop the raw columns afterwards.
raw_columns = ['cluster', 'text', 'summary']
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)
valid_dataset = valid_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)



In [6]:
# Inspect the training split after preprocessing.
train_dataset

In [7]:
from transformers import BartForConditionalGeneration

# Load the pretrained 'facebook/bart-large-cnn' checkpoint; this is the model
# handed to the Trainer for fine-tuning.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

In [8]:
import torch

# Report whether a CUDA device is usable and, if so, how many devices there
# are and what each one is called.
cuda_ready = torch.cuda.is_available()
print("CUDA available:", cuda_ready)

if cuda_ready:
    device_total = torch.cuda.device_count()
    print("List of CUDA devices:", device_total)
    for idx in range(device_total):
        print(f"Device {idx}: {torch.cuda.get_device_name(idx)}")


In [9]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration, grouped by purpose.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    # Log and evaluate every 10 steps; checkpoint every 500 steps.
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=10,
    save_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
)

# Bind the model, the configuration and the two tokenised splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [10]:
# Run the fine-tuning loop configured in `training_args`.
trainer.train()

In [11]:
# Save the model and its tokenizer side by side in './fine_tuned_bart'.
model.save_pretrained('./fine_tuned_bart')
tokenizer.save_pretrained('./fine_tuned_bart')

In [12]:
# Evaluate on the Trainer's eval_dataset (the `valid_dataset` split).
trainer.evaluate()

In [13]:
# Run the evaluation again, this time keeping the return value in `results`.
results = trainer.evaluate()

In [14]:
# Display the stored evaluation output.
results