In [1]:
import os
import boto3
import pandas as pd

from io import BytesIO
from dotenv import load_dotenv

# Run python-dotenv first so the lookups below can see any values it provides.
load_dotenv()

# Credentials handed to the S3 client; each is None when its variable is not set.
ACCESS_KEY, SECRET_KEY = os.environ.get('AK'), os.environ.get('SK')

In [2]:
# Bucket whose objects are listed here.
BUCKET_NAME = "unlabeled-news"

s3_client = boto3.client(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
    region_name='us-east-1',
)

# A single list_objects_v2 call returns at most 1,000 keys and leaves out the
# 'Contents' entry entirely when the bucket is empty. Walk every result page
# and tolerate a missing entry so no object is silently dropped and an empty
# bucket yields an empty list instead of a KeyError.
listing_pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket=BUCKET_NAME)

# Keep the {'Contents': [...]} shape so code that reads bucket_objs['Contents']
# keeps working; only the per-object entries are retained, not page metadata.
bucket_objs = {
    'Contents': [obj for page in listing_pages for obj in page.get('Contents', [])]
}
keys = [elem['Key'] for elem in bucket_objs['Contents']]

In [3]:
# Show the object keys collected from the bucket listing.
keys

['aevangelides/unlabeled_dataset.parquet.gzip',
 'amarinnunezdearce/unlabeled_dataset.parquet.gzip',
 'amontoya/unlabeled_dataset.parquet.gzip',
 'apillai/unlabeled_dataset.parquet.gzip',
 'ctoruno/unlabeled_dataset.parquet.gzip',
 'dbarba/unlabeled_dataset.parquet.gzip',
 'dhabiby/unlabeled_dataset.parquet.gzip',
 'ecampbell/unlabeled_dataset.parquet.gzip',
 'gdurbano/unlabeled_dataset.parquet.gzip',
 'ghulseman/unlabeled_dataset.parquet.gzip',
 'gnunezperalta/unlabeled_dataset.parquet.gzip',
 'hortiz/unlabeled_dataset.parquet.gzip',
 'hrigazzi/unlabeled_dataset.parquet.gzip',
 'hsoukireyes/unlabeled_dataset.parquet.gzip',
 'jcullen/unlabeled_dataset.parquet.gzip',
 'jdavis/unlabeled_dataset.parquet.gzip',
 'kwanner/unlabeled_dataset.parquet.gzip',
 'lcleary/unlabeled_dataset.parquet.gzip',
 'llittlejohn/unlabeled_dataset.parquet.gzip',
 'lsolis/unlabeled_dataset.parquet.gzip',
 'mbasystiuk/unlabeled_dataset.parquet.gzip',
 'mrodriguez/unlabeled_dataset.parquet.gzip',
 'mwoodbury/unla

In [4]:
# Fetch each listed object from S3 and parse its bytes as a parquet file,
# keeping one DataFrame per object in listing order.
dfs = []
for obj_meta in bucket_objs['Contents']:
    response = s3_client.get_object(Bucket="unlabeled-news", Key=obj_meta['Key'])
    payload = BytesIO(response['Body'].read())
    dfs.append(pd.read_parquet(payload))

# Stack every frame into a single table with a fresh 0..n-1 index.
stacked = pd.concat(dfs)
full_data = stacked.reset_index(drop=True).copy()

In [5]:
# Row count of each downloaded frame, in the same order as `dfs`.
list(map(len, dfs))

[203,
 203,
 203,
 204,
 203,
 203,
 204,
 203,
 204,
 203,
 203,
 203,
 203,
 204,
 204,
 203,
 204,
 204,
 204,
 203,
 204,
 203,
 203,
 203,
 203,
 203]

In [6]:
# For every downloaded frame, collect its distinct article ids as a plain list.
article_ids = []
for frame in dfs:
    article_ids.append(frame['article_id'].unique().tolist())

In [7]:
# Start from the first frame's ids and keep only those that also appear in
# every other frame, narrowing the same set in place.
shared_ids = set(article_ids[0])
for position in range(1, len(article_ids)):
    shared_ids.intersection_update(article_ids[position])

common_article_ids = shared_ids
print(common_article_ids)

{'94cdcf3920a871dd8ab8c2574e7bab97', 'e351e199119846c040d8aa29c464247d', '79d1f83cba0c3018a10bcd56630639f7', 'c0ecb227f645932ff8c7756a29efcde6', 'cd8bb7485063a1b057cfb1f59305de07', '545c1bf4c41991f67ae3e0ea3394b360', '0e7248e15d70813de7e14b28ebd8ca3d', 'cca17a78b6697eb9d93a17030638595d', '2c886c2fc437d97bcf6efc783c6135e8', '4be96168a1d9a163a2546cf3ff65eabc', 'ff0aa25f8182aa9c73decdf3f8bfb05f', '6fa6a90d9c2f9895992f98bf01f36b50', '5742083e52f0b28cb5fcc823aac05b89', '1e2e763ff24b2185dda89dcbe5927066', '115f439badbba1bea9e72d4b0071be5d', 'e81c3c1704096a3ba01aa2ed6af70ee8', '05cc6e5ba0b291fd717413aa221093e3', '6c2add1a1d9ac120a82f97dd015ff9fd', '78835ef811e4ff0dee77359c2b4770f5', 'b1c82fed689a6c613be35af335d7d76e', '95c51fa4335ef7d0b1161c8802f64892', 'd7d438d45e6114305722148d98a00157', 'b33857368fcb0256e028d4eec8fd7ef2', 'fa07a7d9886fbb61bf4ed851f7a537aa', 'ec389b64dab16604c6463505e31b6966', 'ca82e813effc9a0e1c8bdd0f47a7ed53', '0cefab77ea5d8a106f502aa0d8ee03fd', '33a5693e616ef42586e3d61734

In [8]:
# Number of article ids present in every downloaded frame.
len(common_article_ids)

40

In [11]:
# Keep the rows whose article id is shared by every frame, collapse repeated
# titles to their first occurrence, and write the result without the index.
is_common = full_data.article_id.isin(common_article_ids)
common_subset = full_data[is_common].reset_index(drop=True)
common_subset_unique = common_subset.drop_duplicates(subset=['title_eng'])
common_subset_unique.to_csv('common_news_chunk.csv', index=False)

### Estimate OpenAI cost

In [None]:
# Rows for the articles shared by every frame. Filter once and reuse the result
# for all three text columns instead of recomputing the same mask each time.
# NOTE(review): unlike the CSV export, this subset is not de-duplicated, so an
# article that appears in several frames is counted once per appearance.
# Confirm that is what the cost estimate should measure.
common_rows = full_data[full_data.article_id.isin(common_article_ids)].reset_index(drop=True)

titles = ' '.join(common_rows.title_eng.tolist())
descriptions = ' '.join(common_rows.desc_eng.tolist())
contents = ' '.join(common_rows.content_eng.tolist())

# Join the three pieces with a space: plain `+` glued the last word of one
# piece to the first word of the next, under-counting by up to two words.
full_data_text = ' '.join([titles, descriptions, contents])

# split() with no argument splits on any whitespace run, so repeated spaces do
# not produce empty "words" and newlines/tabs separate words as well.
print(len(full_data_text.split()))

In [None]:
# Titles of the shared articles, one entry per distinct title (first occurrence kept).
shared_mask = full_data.article_id.isin(common_article_ids)
shared_rows = full_data[shared_mask].reset_index(drop=True)
distinct_title_rows = shared_rows.drop_duplicates(subset=['title_eng'])
title_texts = distinct_title_rows.title_eng.tolist()

In [None]:
# Write the collected titles as a one-column CSV (header `title`, no index column).
title_frame = pd.DataFrame(title_texts, columns=['title'])
title_frame.to_csv('common_news_title.csv', index=False)

In [None]:
# Display the combined frame built by concatenating every downloaded parquet file.
full_data