In [None]:
import os
import boto3
import pandas as pd

from io import BytesIO
from dotenv import load_dotenv

# Load settings via python-dotenv before reading the environment.
load_dotenv()

# Credentials later handed to the S3 client; each is None when the
# corresponding environment variable ('AK' / 'SK') is not set.
ACCESS_KEY, SECRET_KEY = (os.environ.get(var_name) for var_name in ('AK', 'SK'))

In [None]:
# S3 client authenticated with the credentials read from the environment.
s3_client = boto3.client(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
    region_name='us-east-1',
)

# A single list_objects_v2 call returns at most 1000 keys, so walk every page
# of the listing. Pages for an empty bucket carry no 'Contents' entry, hence
# the .get() default.
listing_pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket="unlabeled-news")
bucket_contents = [
    obj
    for page in listing_pages
    for obj in page.get('Contents', [])
]

# Keep the response-like shape ({'Contents': [...]}) that later cells index into.
bucket_objs = {'Contents': bucket_contents}

# Object keys of everything found in the bucket.
keys = [elem['Key'] for elem in bucket_contents]

In [None]:
# Inspect the object keys collected from the bucket listing.
keys

In [None]:
# Download each listed object and parse its bytes as a parquet file,
# keeping one DataFrame per object in listing order.
dfs = []
for listed_obj in bucket_objs['Contents']:
    s3_response = s3_client.get_object(Bucket="unlabeled-news", Key=listed_obj['Key'])
    parquet_bytes = s3_response['Body'].read()
    dfs.append(pd.read_parquet(BytesIO(parquet_bytes)))

# Stack all per-object frames into one frame with a fresh 0..n-1 index.
stacked = pd.concat(dfs)
full_data = stacked.reset_index(drop=True).copy()

In [None]:
# Row count of each downloaded frame, in listing order.
list(map(len, dfs))

In [None]:
# For every downloaded frame, the distinct article_id values as a plain list.
article_ids = [
    frame['article_id'].unique().tolist()
    for frame in dfs
]

In [None]:
# Article ids that occur in every downloaded frame: start from the first
# frame's ids and intersect with each of the remaining id lists in turn.
common_article_ids = set(article_ids[0]).intersection(*article_ids[1:])
print(common_article_ids)

In [None]:
# Number of article ids shared by all downloaded frames.
len(common_article_ids)

In [None]:
# Keep only rows whose article_id is shared by every source frame, drop rows
# that repeat an English title (first occurrence wins), and write the result
# to CSV without the index column.
in_every_frame = full_data['article_id'].isin(common_article_ids)
shared_rows = full_data.loc[in_every_frame].reset_index(drop=True)
shared_unique_titles = shared_rows.drop_duplicates(subset=['title_eng'])
shared_unique_titles.to_csv('common_news_chunk.csv', index=False)

### Estimate OpenAI cost

In [None]:
# Rows whose article_id is shared by every source frame; filtered once and
# reused for all three text columns.
# NOTE(review): unlike the CSV export, this selection is not de-duplicated by
# title, so repeated articles are counted more than once — confirm intended.
common_rows = full_data[full_data.article_id.isin(common_article_ids)].reset_index(drop=True)

# One space-joined string per text column. Missing values are dropped so that
# a NaN/None cell cannot make str.join raise TypeError.
titles, descriptions, contents = (
    ' '.join(common_rows[column].dropna().astype(str))
    for column in ('title_eng', 'desc_eng', 'content_eng')
)

# Join the three parts WITH a separator; plain concatenation would fuse the
# last word of one part with the first word of the next.
full_data_text = ' '.join([titles, descriptions, contents])

# split() with no argument splits on any whitespace run and never yields empty
# strings, so repeated spaces and newlines do not distort the count.
# This is a whitespace word count, not a model token count.
print(len(full_data_text.split()))

In [None]:
# Unique English titles (first occurrence kept) among the rows whose
# article_id is shared by every source frame.
rows_in_every_frame = full_data.loc[full_data['article_id'].isin(common_article_ids)].reset_index(drop=True)
rows_unique_by_title = rows_in_every_frame.drop_duplicates(subset=['title_eng'])
title_texts = rows_unique_by_title['title_eng'].tolist()

In [None]:
# Write the unique titles to a single-column ('title') CSV, without the index.
titles_frame = pd.DataFrame({'title': title_texts})
titles_frame.to_csv('common_news_title.csv', index=False)

In [None]:
# Show the combined frame built by concatenating all downloaded parquet files.
full_data