In [11]:
%%time
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from tqdm import trange


def load_and_merge_csv(file_pattern, num_files):
    """Read a numbered series of CSV files and stack them into one DataFrame.

    Args:
        file_pattern: A ``str.format`` template with one positional
            placeholder, filled with the 1-based file number.
        num_files: How many files to read; numbers run from 1 through
            ``num_files`` inclusive.

    Returns:
        A single DataFrame holding the rows of every file in order, with the
        index renumbered from 0 (``ignore_index=True``).
    """
    frames = []
    for file_number in range(1, num_files + 1):
        path = file_pattern.format(file_number)
        frames.append(pd.read_csv(path))
    return pd.concat(frames, ignore_index=True)

# Load the four numbered cluster-label CSV files into one frame.
df = load_and_merge_csv('data_upload/cluster_labels{}.csv', 4)
# Keep only the first 200 rows for this run. head() is used instead of
# .loc[range(200)] because the label-based lookup raises KeyError when the
# merged frame holds fewer than 200 rows; head() simply returns what exists.
df = df.head(200)

CPU times: user 206 ms, sys: 21.7 ms, total: 228 ms
Wall time: 234 ms


In [2]:
# Preview the first rows of the sampled frame to check the loaded columns.
df.head()

Unnamed: 0,id,Text,Title,embeddings,Cluster,combined,Common_Theme
0,nos7tzp7jprxlqxe,GENEVA – The remains of a climber discovered i...,Remains found in Swiss Alps are those of Briti...,"[0.063923, 0.065677, -0.001089, 0.065425, -0.0...",17,Title: Remains found in Swiss Alps are those o...,Violence and Injustice
1,zvv4ue0w64vfqoz1,Ms Greta Thunburg became a household name when...,Involve youth in shaping ethical use of AI,"[0.063668, 0.098002, -0.022514, -0.033031, -0....",3,Title: Involve youth in shaping ethical use of...,"Technology, Sustainability, and Social Impact"
2,aph1tgua3xxoq2sg,NEW YORK - Defending women's champion Iga...,"Swiatek, Djokovic headline third round action ...","[-0.019315, 0.066645, 0.009547, 0.029555, -0.0...",10,"Title: Swiatek, Djokovic headline third round ...",Sports and Competition
3,rlh53czyst054zfn,JAKARTA – Hopes of a return to democracy in ju...,‘Systematic repression’ crushing Myanmar’s dem...,"[0.067328, -0.004407, 0.010127, -0.004268, -0....",4,Title: ‘Systematic repression’ crushing Myanma...,Political Crises and Human Rights Concerns
4,aksixz7uun2gkpss,JERUSALEM - Israel's shekel dropped to it...,Israel's shekel falls as judicial showdown looms,"[-0.043186, 0.076352, -0.015492, -0.02859, -0....",18,Title: Israel's shekel falls as judicial showd...,Politics and Elections


In [12]:
%%time
def fetch_tags(article_pair):
    """Placeholder tagger: wait one second, then return fixed tags.

    Args:
        article_pair: A two-item sequence ``(article_text, article_id)``.
            The text is unpacked but not used by this stub.

    Returns:
        A tuple ``(article_id, tags)`` where ``tags`` is always
        ``["tag1", "tag2", "tag3"]``.
    """
    _unused_text, identifier = article_pair
    # Fixed one-second pause before answering.
    time.sleep(1)
    placeholder_tags = ["tag1", "tag2", "tag3"]
    return (identifier, placeholder_tags)

def process_articles(df, max_workers=10, batch_size=100, cooldown_period=10, fetch_fn=None):
    """Tag every article in ``df`` concurrently, one batch at a time.

    Articles are submitted to a thread pool ``batch_size`` at a time. After
    every batch except the last one, the function sleeps for
    ``cooldown_period`` seconds before submitting the next batch.

    Args:
        df: DataFrame with a ``combined`` column (the value handed to the
            tagger) and an ``id`` column (used as the key in the result).
        max_workers: Maximum number of worker threads in the pool.
        batch_size: Number of articles submitted per batch; must be >= 1.
        cooldown_period: Seconds to sleep between consecutive batches.
        fetch_fn: Callable that receives an ``(article, id)`` tuple and
            returns ``(id, tags)``. Defaults to ``fetch_tags``.

    Returns:
        dict mapping each article id to the tags returned for it. If an id
        occurs more than once, the result that completes last is kept.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        Exception: Anything raised by ``fetch_fn`` propagates out of
            ``future.result()`` and aborts the run.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if fetch_fn is None:
        # Looked up at call time so the default tagger only has to exist
        # when the function is actually run without an explicit fetch_fn.
        fetch_fn = fetch_tags

    article_id_pairs = list(zip(df['combined'].tolist(), df['id'].tolist()))
    total = len(article_id_pairs)
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            print(f"Starting batch processing for articles {start+1} to {stop}")
            futures = [executor.submit(fetch_fn, pair) for pair in article_id_pairs[start:stop]]

            # Running count across batches, so progress lines continue
            # from where the previous batch ended.
            processed_count = start
            for future in as_completed(futures):
                article_id, tags = future.result()
                results[article_id] = tags
                processed_count += 1
                print(f"Processed {processed_count}")

            # Cool down only when another batch follows; the final batch
            # returns immediately.
            if stop < total:
                print(f"All tasks in batch {start//batch_size + 1} completed, cooling down for {cooldown_period} seconds...")
                time.sleep(cooldown_period)
    return results

# Tag every row of df with the default worker count; the result is a dict
# keyed by article id.
tags = process_articles(df)


  0%|          | 0/2 [00:00<?, ?it/s]

Starting batch processing for articles 1 to 100
Processed 1
Processed 2
Processed 3
Processed 4
Processed 5
Processed 6
Processed 7
Processed 8
Processed 9
Processed 10
Processed 11
Processed 12
Processed 13
Processed 14
Processed 15
Processed 16
Processed 17
Processed 18
Processed 19
Processed 20
Processed 21
Processed 22
Processed 23
Processed 24
Processed 25
Processed 26
Processed 27
Processed 28
Processed 29
Processed 30
Processed 31
Processed 32
Processed 33
Processed 34
Processed 35
Processed 36
Processed 37
Processed 38
Processed 39
Processed 40
Processed 41
Processed 42
Processed 43
Processed 44
Processed 45
Processed 46
Processed 47
Processed 48
Processed 49
Processed 50
Processed 51
Processed 52
Processed 53
Processed 54
Processed 55
Processed 56
Processed 57
Processed 58
Processed 59
Processed 60
Processed 61
Processed 62
Processed 63
Processed 64
Processed 65
Processed 66
Processed 67
Processed 68
Processed 69
Processed 70
Processed 71
Processed 72
Processed 73
Processed 74

 50%|█████     | 1/2 [00:20<00:20, 20.04s/it]

Starting batch processing for articles 101 to 200
Processed 101
Processed 102
Processed 103
Processed 104
Processed 105
Processed 106
Processed 107
Processed 108
Processed 109
Processed 110
Processed 111
Processed 112
Processed 113
Processed 114
Processed 115
Processed 116
Processed 117
Processed 118
Processed 119
Processed 120
Processed 121
Processed 122
Processed 123
Processed 124
Processed 125
Processed 126
Processed 127
Processed 128
Processed 129
Processed 130
Processed 131
Processed 132
Processed 133
Processed 134
Processed 135
Processed 136
Processed 137
Processed 138
Processed 139
Processed 140
Processed 141
Processed 142
Processed 143
Processed 144
Processed 145
Processed 146
Processed 147
Processed 148
Processed 149
Processed 150
Processed 151
Processed 152
Processed 153
Processed 154
Processed 155
Processed 156
Processed 157
Processed 158
Processed 159
Processed 160
Processed 161
Processed 162
Processed 163
Processed 164
Processed 165
Processed 166
Processed 167
Processed 16

 50%|█████     | 1/2 [00:30<00:30, 30.08s/it]

Processed 191
Processed 192
Processed 193
Processed 194
Processed 195
Processed 196
Processed 197
Processed 198
Processed 199
Processed 200
CPU times: user 58.2 ms, sys: 29.4 ms, total: 87.7 ms
Wall time: 30.1 s



