In [2]:
# !pip3 install orjson

In [3]:
import json
import tldextract
import gzip
import shutil
import orjson

In [4]:
def process_index_file(file_name):
    """Decompress a gzip-compressed index file next to the original archive.

    Parameters
    ----------
    file_name : str
        Path of the gzip archive. It must end with ``.gz``.

    Returns
    -------
    str
        Path of the decompressed file, i.e. ``file_name`` without its
        trailing ``.gz``.

    Raises
    ------
    ValueError
        If ``file_name`` does not end with ``.gz``. Without a suffix to strip
        the output path would equal the input path, and opening it for
        writing would truncate the archive before it is read.
    """
    suffix = '.gz'
    if not file_name.endswith(suffix):
        raise ValueError(
            'expected a path ending in %r, got %r' % (suffix, file_name)
        )

    print('Unzipping index file ... ')

    # Strip only the trailing suffix. Splitting on the first '.gz' would cut
    # the path short whenever a directory or the basename contains '.gz'.
    file_unzipped = file_name[:-len(suffix)]

    with gzip.open(file_name, 'rb') as f_in:
        with open(file_unzipped, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    return file_unzipped

def process_index_file_line(line):
    """Parse one line of an index file into a flat record tuple.

    A line is expected to consist of three whitespace-separated parts:
    a key, a timestamp, and a JSON object holding the capture metadata.

    Parameters
    ----------
    line : str
        Raw line as read from the decompressed index file.

    Returns
    -------
    tuple
        ``(ts, url, tld_suffix, length, offset, filename, language)`` when the
        line parses and its ``status`` is the string ``'200'``. ``language``
        is the ``languages`` field, or ``'none'`` when that field is absent.
        An empty tuple ``()`` is returned when the line cannot be parsed, the
        status is not ``'200'``, or a required field is missing.
    """
    assert isinstance(line, str)

    try:
        # Split off only the first two fields; everything after them is the
        # JSON payload. Splitting the line on the timestamp text instead would
        # corrupt the payload whenever the same digits also occur inside it
        # (e.g. in the archive filename or the URL).
        _, ts, payload = line.replace('\n', '').split(None, 2)
        data = orjson.loads(payload.strip())
    except ValueError:
        # Covers both "fewer than three fields" (unpacking error) and invalid
        # JSON (orjson.JSONDecodeError is a ValueError subclass).
        return ()

    if not isinstance(data, dict) or data.get('status') != '200':
        return ()

    language = data.get('languages', 'none')

    try:
        url = data['url']
        return (
            ts,
            url,
            tldextract.extract(url).suffix,
            data['length'],
            data['offset'],
            data['filename'],
            language,
        )
    except Exception:
        # Best effort: a record with a missing field or a URL that the TLD
        # extractor rejects is skipped rather than aborting the whole file.
        return ()

In [5]:
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl(url):
    """Download ``url`` and return its body decoded as text.

    Parameters
    ----------
    url : str
        Address to fetch; the request uses a 15 second timeout.

    Returns
    -------
    tuple or None
        ``(url, text)`` on success, where ``text`` is the response body
        decoded with the default codec of ``bytes.decode``. ``None`` when the
        request or the decoding fails.
    """
    try:
        response = requests.get(url, timeout = 15)
        return url, response.content.decode()
    except Exception:
        # Best effort per URL. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate so a running crawl can
        # still be stopped.
        return None
    
# Worker-thread limit for concurrent crawling; referenced as `max_workers`
# by the (currently commented-out) ThreadPoolExecutor crawl step further down.
MAX_PARALLEL_REQUESTS = 100

In [6]:
from glob import glob
import os
from tqdm import tqdm
from urllib.parse import urlparse
import pandas as pd

# For every gzip-compressed index file under data/: decompress it, keep the
# records of interest, write them to a parquet file in the working directory,
# then delete the decompressed copy. Archives whose parquet output already
# exists are skipped, so the cell can be re-run after an interruption.
files = glob('data/*.gz')
for f in files:
    # The glob pattern guarantees the '.gz' ending, so strip exactly that
    # suffix instead of cutting at the first '.gz' found anywhere in the path.
    new_f = f[:-len('.gz')]
    filename = os.path.basename(new_f) + '.parquet'

    print(f, new_f, filename)

    if os.path.exists(filename):
        continue

    if not os.path.exists(new_f):
        # Use the path reported by the helper so that reader and writer
        # always agree on where the decompressed file lives.
        new_f = process_index_file(f)

    filtered = []

    with open(new_f, encoding = "ISO-8859-1") as fopen:
        for l in tqdm(fopen):
            try:
                row = process_index_file_line(l)

                # An empty tuple means the line was unparseable or was not a
                # status-200 capture.
                if not row:
                    continue

                # Keep a record when its language field is exactly 'msa' or
                # when the URL's network location ends with '.my'.
                if row[-1] == 'msa' or urlparse(row[1]).netloc.endswith('.my'):
                    filtered.append(row)

            except Exception:
                # Best effort: one malformed line must not abort a file with
                # millions of records. KeyboardInterrupt still propagates.
                continue

    df = pd.DataFrame(filtered, columns = ['ts','url','tld','length','offset','warc','language'])
    print(df.shape)
    df.to_parquet(filename, index = False)
    
#     try:
    
#         crawled = []
#         with ThreadPoolExecutor(max_workers=MAX_PARALLEL_REQUESTS) as executor:
#             futures = {executor.submit(crawl, df['url'].iloc[i]): df['url'].iloc[i] for i in range(len(df))}

#         for future in tqdm(as_completed(futures), total=len(futures)):
#             crawled.append(future.result())

#         df_urls = pd.DataFrame(crawled, columns = ['url', 'web'])
#         df = df.merge(df_urls, on = 'url')
#         df.to_parquet(filename, index = False)
    
#     except:
#         pass
    
    # The decompressed copy is only a scratch file; the .gz archive is kept.
    os.remove(new_f)
#     os.remove(f)

data/cdx-00238.gz data/cdx-00238 cdx-00238.parquet
data/cdx-00133.gz data/cdx-00133 cdx-00133.parquet
data/cdx-00175.gz data/cdx-00175 cdx-00175.parquet
data/cdx-00278.gz data/cdx-00278 cdx-00278.parquet
data/cdx-00131.gz data/cdx-00131 cdx-00131.parquet
data/cdx-00227.gz data/cdx-00227 cdx-00227.parquet
Unzipping index file ... 


14159142it [01:59, 118910.44it/s]


(3588053, 7)
data/cdx-00235.gz data/cdx-00235 cdx-00235.parquet
Unzipping index file ... 


12586957it [01:44, 120966.23it/s]


(1177, 7)
data/cdx-00112.gz data/cdx-00112 cdx-00112.parquet


12307555it [01:40, 122584.44it/s]


(1233, 7)
data/cdx-00090.gz data/cdx-00090 cdx-00090.parquet
Unzipping index file ... 


11820725it [01:36, 122898.20it/s]


(744, 7)
data/cdx-00219.gz data/cdx-00219 cdx-00219.parquet
Unzipping index file ... 


11450540it [01:33, 122011.56it/s]


(3, 7)
data/cdx-00097.gz data/cdx-00097 cdx-00097.parquet
Unzipping index file ... 


13845459it [01:54, 120645.74it/s]


(955, 7)
data/cdx-00162.gz data/cdx-00162 cdx-00162.parquet
Unzipping index file ... 


12812797it [01:44, 122354.72it/s]


(603, 7)
data/cdx-00115.gz data/cdx-00115 cdx-00115.parquet
Unzipping index file ... 


12872284it [01:47, 120065.05it/s]


(649, 7)
data/cdx-00178.gz data/cdx-00178 cdx-00178.parquet
Unzipping index file ... 


10236594it [01:29, 94991.90it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

13432886it [01:51, 120756.25it/s]


(2015, 7)
data/cdx-00226.gz data/cdx-00226 cdx-00226.parquet
Unzipping index file ... 


15007350it [02:02, 122112.18it/s]


(1968, 7)
data/cdx-00192.gz data/cdx-00192 cdx-00192.parquet
Unzipping index file ... 


13971618it [01:55, 120848.15it/s]


(133, 7)
data/cdx-00084.gz data/cdx-00084 cdx-00084.parquet


12784284it [01:45, 121714.26it/s]


(1423, 7)
data/cdx-00174.gz data/cdx-00174 cdx-00174.parquet
Unzipping index file ... 


12435904it [01:44, 118825.40it/s]


(4, 7)
data/cdx-00243.gz data/cdx-00243 cdx-00243.parquet
Unzipping index file ... 


14748510it [01:59, 123239.54it/s]


(6, 7)
data/cdx-00102.gz data/cdx-00102 cdx-00102.parquet
Unzipping index file ... 


13960666it [01:54, 121543.43it/s]


(1029, 7)
data/cdx-00293.gz data/cdx-00293 cdx-00293.parquet
Unzipping index file ... 


12887858it [01:48, 118260.67it/s]


(2658, 7)
data/cdx-00283.gz data/cdx-00283 cdx-00283.parquet
Unzipping index file ... 


13009187it [01:43, 125584.16it/s]


(10, 7)
data/cdx-00260.gz data/cdx-00260 cdx-00260.parquet
Unzipping index file ... 


10527679it [01:26, 121073.21it/s]


(2834, 7)
data/cdx-00149.gz data/cdx-00149 cdx-00149.parquet
Unzipping index file ... 


12830949it [01:48, 117719.10it/s]


(1175, 7)
data/cdx-00187.gz data/cdx-00187 cdx-00187.parquet
Unzipping index file ... 


12035657it [01:41, 119016.19it/s]


(339, 7)
data/cdx-00209.gz data/cdx-00209 cdx-00209.parquet
Unzipping index file ... 


16642176it [02:21, 117796.88it/s]


(239, 7)
data/cdx-00221.gz data/cdx-00221 cdx-00221.parquet
Unzipping index file ... 


12884184it [01:43, 124198.89it/s]


(11, 7)
data/cdx-00280.gz data/cdx-00280 cdx-00280.parquet
Unzipping index file ... 


8064741it [01:02, 134472.56it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

15250811it [02:12, 115349.42it/s]


(14, 7)
data/cdx-00255.gz data/cdx-00255 cdx-00255.parquet
Unzipping index file ... 


11488433it [01:34, 121149.67it/s]


(200, 7)
data/cdx-00261.gz data/cdx-00261 cdx-00261.parquet
Unzipping index file ... 


10576237it [01:25, 123665.37it/s]


(607, 7)
data/cdx-00239.gz data/cdx-00239 cdx-00239.parquet
Unzipping index file ... 


14208665it [01:58, 114089.83it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

11790872it [01:36, 121823.92it/s]


(182, 7)
data/cdx-00232.gz data/cdx-00232 cdx-00232.parquet
Unzipping index file ... 


12275275it [01:40, 121645.73it/s]


(1858, 7)
data/cdx-00256.gz data/cdx-00256 cdx-00256.parquet
Unzipping index file ... 


11858130it [01:37, 121201.40it/s]


(1048, 7)
data/cdx-00118.gz data/cdx-00118 cdx-00118.parquet
Unzipping index file ... 


8952550it [01:13, 121175.11it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

11304904it [01:38, 115344.41it/s]


(2, 7)
data/cdx-00190.gz data/cdx-00190 cdx-00190.parquet
Unzipping index file ... 


13326853it [01:57, 113333.72it/s]


(378, 7)
data/cdx-00143.gz data/cdx-00143 cdx-00143.parquet
Unzipping index file ... 


15142178it [02:05, 120397.23it/s]


(3405, 7)
data/cdx-00195.gz data/cdx-00195 cdx-00195.parquet
Unzipping index file ... 


12921574it [01:47, 119823.15it/s]


(15, 7)
data/cdx-00176.gz data/cdx-00176 cdx-00176.parquet
Unzipping index file ... 


11497095it [01:35, 121006.54it/s]


(77, 7)
data/cdx-00285.gz data/cdx-00285 cdx-00285.parquet
Unzipping index file ... 


13568444it [01:49, 123807.94it/s]


(26, 7)
data/cdx-00124.gz data/cdx-00124 cdx-00124.parquet
Unzipping index file ... 


14054129it [01:55, 121968.24it/s]


(4394, 7)
data/cdx-00271.gz data/cdx-00271 cdx-00271.parquet
Unzipping index file ... 


10166344it [01:23, 121689.91it/s]


(554, 7)
data/cdx-00135.gz data/cdx-00135 cdx-00135.parquet
Unzipping index file ... 


11879987it [01:35, 124681.77it/s]


(819, 7)
data/cdx-00093.gz data/cdx-00093 cdx-00093.parquet
Unzipping index file ... 


15121920it [02:05, 120053.37it/s]


(515, 7)
data/cdx-00117.gz data/cdx-00117 cdx-00117.parquet
Unzipping index file ... 


13450508it [01:50, 121799.47it/s]


(1383, 7)
data/cdx-00229.gz data/cdx-00229 cdx-00229.parquet
Unzipping index file ... 


12320156it [01:41, 121288.21it/s]


(869, 7)
data/cdx-00080.gz data/cdx-00080 cdx-00080.parquet


15336539it [02:09, 118531.66it/s]


(3587, 7)
data/cdx-00191.gz data/cdx-00191 cdx-00191.parquet
Unzipping index file ... 


13555894it [01:52, 120079.64it/s]


(293, 7)
data/cdx-00086.gz data/cdx-00086 cdx-00086.parquet
Unzipping index file ... 


13043595it [01:48, 120602.91it/s]


(4668, 7)
data/cdx-00120.gz data/cdx-00120 cdx-00120.parquet
Unzipping index file ... 


13113831it [01:47, 121615.56it/s]


(456, 7)
data/cdx-00196.gz data/cdx-00196 cdx-00196.parquet
Unzipping index file ... 


13473869it [01:52, 119397.60it/s]


(18, 7)
data/cdx-00132.gz data/cdx-00132 cdx-00132.parquet
Unzipping index file ... 


12542836it [01:50, 113588.24it/s]


(1591, 7)
data/cdx-00101.gz data/cdx-00101 cdx-00101.parquet
Unzipping index file ... 


10874021it [01:32, 117947.98it/s]


(2136, 7)
data/cdx-00264.gz data/cdx-00264 cdx-00264.parquet
Unzipping index file ... 


11747742it [01:39, 117683.05it/s]


(1319, 7)
data/cdx-00094.gz data/cdx-00094 cdx-00094.parquet
Unzipping index file ... 


12731730it [01:46, 120085.68it/s]


(605, 7)
data/cdx-00244.gz data/cdx-00244 cdx-00244.parquet
Unzipping index file ... 


10843639it [01:32, 117697.77it/s]


(18, 7)
data/cdx-00128.gz data/cdx-00128 cdx-00128.parquet
Unzipping index file ... 


13237031it [01:51, 118553.95it/s]


(3101, 7)
data/cdx-00106.gz data/cdx-00106 cdx-00106.parquet
Unzipping index file ... 


10165115it [01:27, 116421.56it/s]


(254, 7)
data/cdx-00193.gz data/cdx-00193 cdx-00193.parquet
Unzipping index file ... 


13055447it [01:51, 117449.36it/s]


(16, 7)
data/cdx-00111.gz data/cdx-00111 cdx-00111.parquet
Unzipping index file ... 


11475592it [01:35, 119752.76it/s]


(441, 7)
data/cdx-00265.gz data/cdx-00265 cdx-00265.parquet
Unzipping index file ... 


17355582it [02:33, 113191.83it/s]


(19752, 7)
data/cdx-00173.gz data/cdx-00173 cdx-00173.parquet
Unzipping index file ... 


14191807it [01:59, 118953.98it/s]


(118, 7)
data/cdx-00164.gz data/cdx-00164 cdx-00164.parquet
Unzipping index file ... 


14704374it [02:05, 117396.37it/s]


(376, 7)
data/cdx-00282.gz data/cdx-00282 cdx-00282.parquet
Unzipping index file ... 


13438180it [01:47, 124640.95it/s]


(11, 7)
data/cdx-00182.gz data/cdx-00182 cdx-00182.parquet
Unzipping index file ... 


9353659it [01:24, 110185.28it/s]


(2, 7)
data/cdx-00233.gz data/cdx-00233 cdx-00233.parquet
Unzipping index file ... 


13899643it [01:54, 121044.32it/s]


(1287, 7)
data/cdx-00134.gz data/cdx-00134 cdx-00134.parquet
Unzipping index file ... 


12195406it [01:38, 123197.37it/s]


(571, 7)
data/cdx-00234.gz data/cdx-00234 cdx-00234.parquet
Unzipping index file ... 


14128292it [01:55, 122242.52it/s]


(2200, 7)
data/cdx-00096.gz data/cdx-00096 cdx-00096.parquet
Unzipping index file ... 


12430877it [01:42, 120715.67it/s]


(339, 7)
data/cdx-00224.gz data/cdx-00224 cdx-00224.parquet
Unzipping index file ... 


13524196it [01:53, 119359.90it/s]


(75, 7)
data/cdx-00185.gz data/cdx-00185 cdx-00185.parquet
Unzipping index file ... 


14618543it [02:14, 108738.67it/s]


(89, 7)
data/cdx-00230.gz data/cdx-00230 cdx-00230.parquet
Unzipping index file ... 


13434617it [01:49, 122326.22it/s]


(1101, 7)
data/cdx-00225.gz data/cdx-00225 cdx-00225.parquet
Unzipping index file ... 


9039607it [01:11, 127011.28it/s]


(348, 7)
data/cdx-00121.gz data/cdx-00121 cdx-00121.parquet
Unzipping index file ... 


14935096it [02:03, 120972.26it/s]


(2199, 7)
data/cdx-00274.gz data/cdx-00274 cdx-00274.parquet
Unzipping index file ... 


14906253it [02:04, 119817.42it/s]


(60, 7)
data/cdx-00257.gz data/cdx-00257 cdx-00257.parquet
Unzipping index file ... 


11577081it [01:35, 120656.00it/s]


(579, 7)
data/cdx-00114.gz data/cdx-00114 cdx-00114.parquet
Unzipping index file ... 


14452898it [01:58, 121514.83it/s]


(2475, 7)
data/cdx-00279.gz data/cdx-00279 cdx-00279.parquet
Unzipping index file ... 


14151710it [01:54, 123395.48it/s]


(22, 7)
data/cdx-00268.gz data/cdx-00268 cdx-00268.parquet
Unzipping index file ... 


16187203it [02:15, 119051.87it/s]


(382, 7)
data/cdx-00189.gz data/cdx-00189 cdx-00189.parquet
Unzipping index file ... 


13978922it [02:00, 115656.80it/s]


(130, 7)
data/cdx-00262.gz data/cdx-00262 cdx-00262.parquet
Unzipping index file ... 


12408368it [01:42, 120627.12it/s]


(206, 7)
data/cdx-00179.gz data/cdx-00179 cdx-00179.parquet
Unzipping index file ... 


15717907it [02:23, 109847.71it/s]


(127, 7)
data/cdx-00177.gz data/cdx-00177 cdx-00177.parquet
Unzipping index file ... 


15316754it [02:08, 118813.47it/s]


(13, 7)
data/cdx-00286.gz data/cdx-00286 cdx-00286.parquet
Unzipping index file ... 


9545007it [01:13, 129602.38it/s]


(383, 7)
data/cdx-00298.gz data/cdx-00298 cdx-00298.parquet
Unzipping index file ... 


14933353it [02:21, 105545.81it/s]


(511, 7)
data/cdx-00181.gz data/cdx-00181 cdx-00181.parquet
Unzipping index file ... 


12671103it [01:51, 113511.81it/s]


(33, 7)
data/cdx-00083.gz data/cdx-00083 cdx-00083.parquet


15472504it [02:09, 119233.93it/s]


(1038, 7)
data/cdx-00241.gz data/cdx-00241 cdx-00241.parquet
Unzipping index file ... 


13456363it [01:54, 117847.64it/s]


(188, 7)
data/cdx-00210.gz data/cdx-00210 cdx-00210.parquet
Unzipping index file ... 


12280884it [01:39, 123804.42it/s]


(675, 7)
data/cdx-00296.gz data/cdx-00296 cdx-00296.parquet
Unzipping index file ... 


15209255it [02:06, 119962.43it/s]


(516, 7)
data/cdx-00251.gz data/cdx-00251 cdx-00251.parquet
Unzipping index file ... 


12974192it [01:47, 120543.01it/s]


(550, 7)
data/cdx-00148.gz data/cdx-00148 cdx-00148.parquet
Unzipping index file ... 


14200228it [01:55, 123044.33it/s]


(972, 7)
data/cdx-00169.gz data/cdx-00169 cdx-00169.parquet
Unzipping index file ... 


12797285it [01:48, 117899.79it/s]


(406, 7)
data/cdx-00231.gz data/cdx-00231 cdx-00231.parquet
Unzipping index file ... 


12527049it [01:42, 122290.68it/s]


(861, 7)
data/cdx-00258.gz data/cdx-00258 cdx-00258.parquet
Unzipping index file ... 


9488719it [01:18, 120776.88it/s]


(315, 7)
data/cdx-00277.gz data/cdx-00277 cdx-00277.parquet
Unzipping index file ... 


12655115it [01:42, 123948.24it/s]


(11, 7)
data/cdx-00188.gz data/cdx-00188 cdx-00188.parquet
Unzipping index file ... 


12795450it [01:46, 120058.54it/s]


(19, 7)
data/cdx-00288.gz data/cdx-00288 cdx-00288.parquet
Unzipping index file ... 


11261413it [01:38, 114652.27it/s]


(3580, 7)
data/cdx-00254.gz data/cdx-00254 cdx-00254.parquet
Unzipping index file ... 


13042720it [01:48, 119775.82it/s]


(631, 7)
data/cdx-00145.gz data/cdx-00145 cdx-00145.parquet
Unzipping index file ... 


11779045it [01:36, 122095.43it/s]


(875, 7)
data/cdx-00077.gz data/cdx-00077 cdx-00077.parquet


11914828it [01:38, 120426.46it/s]


(1852, 7)
data/cdx-00116.gz data/cdx-00116 cdx-00116.parquet
Unzipping index file ... 


10856925it [01:30, 120216.65it/s]


(685, 7)
data/cdx-00089.gz data/cdx-00089 cdx-00089.parquet
Unzipping index file ... 


15098366it [02:04, 121721.73it/s]


(1469, 7)
data/cdx-00237.gz data/cdx-00237 cdx-00237.parquet
Unzipping index file ... 


13435531it [01:49, 122417.43it/s]


(742, 7)
data/cdx-00183.gz data/cdx-00183 cdx-00183.parquet
Unzipping index file ... 


15732240it [02:25, 107929.71it/s]


(8, 7)
data/cdx-00198.gz data/cdx-00198 cdx-00198.parquet
Unzipping index file ... 


15061631it [02:08, 117526.30it/s]


(62, 7)
data/cdx-00245.gz data/cdx-00245 cdx-00245.parquet
Unzipping index file ... 


14104633it [01:59, 118211.27it/s]


(1050, 7)
data/cdx-00171.gz data/cdx-00171 cdx-00171.parquet
Unzipping index file ... 


15073793it [02:09, 116684.44it/s]


(13, 7)
data/cdx-00236.gz data/cdx-00236 cdx-00236.parquet
Unzipping index file ... 


13014514it [01:47, 120507.03it/s]


(1890, 7)
data/cdx-00205.gz data/cdx-00205 cdx-00205.parquet
Unzipping index file ... 


13315791it [01:54, 116645.52it/s]


(18988, 7)
data/cdx-00273.gz data/cdx-00273 cdx-00273.parquet
Unzipping index file ... 


14481458it [02:04, 116141.14it/s]


(14, 7)
data/cdx-00085.gz data/cdx-00085 cdx-00085.parquet


13717849it [01:56, 117681.59it/s]


(1041, 7)
data/cdx-00109.gz data/cdx-00109 cdx-00109.parquet
Unzipping index file ... 


13194627it [01:48, 121379.14it/s]


(439, 7)
data/cdx-00076.gz data/cdx-00076 cdx-00076.parquet


14037407it [01:59, 117377.13it/s]


(947, 7)
data/cdx-00269.gz data/cdx-00269 cdx-00269.parquet
Unzipping index file ... 


13710818it [01:58, 115930.91it/s]


(470, 7)
data/cdx-00266.gz data/cdx-00266 cdx-00266.parquet
Unzipping index file ... 


14148466it [01:59, 118253.74it/s]


(535, 7)
data/cdx-00206.gz data/cdx-00206 cdx-00206.parquet
Unzipping index file ... 


13224455it [01:53, 116094.48it/s]


(174, 7)
data/cdx-00130.gz data/cdx-00130 cdx-00130.parquet
Unzipping index file ... 


13462805it [01:52, 119390.96it/s]


(499, 7)
data/cdx-00202.gz data/cdx-00202 cdx-00202.parquet
Unzipping index file ... 


13375321it [01:58, 113012.11it/s]


(63, 7)
data/cdx-00122.gz data/cdx-00122 cdx-00122.parquet
Unzipping index file ... 


13113801it [01:49, 119363.54it/s]


(2603, 7)
data/cdx-00253.gz data/cdx-00253 cdx-00253.parquet
Unzipping index file ... 


13172814it [01:50, 118743.21it/s]


(1388, 7)
data/cdx-00194.gz data/cdx-00194 cdx-00194.parquet
Unzipping index file ... 


14521756it [02:05, 116022.65it/s]


(87, 7)
data/cdx-00180.gz data/cdx-00180 cdx-00180.parquet
Unzipping index file ... 


13490234it [01:53, 118618.72it/s]


(32, 7)
data/cdx-00141.gz data/cdx-00141 cdx-00141.parquet
Unzipping index file ... 


13195162it [01:50, 118907.92it/s]


(6016, 7)
data/cdx-00095.gz data/cdx-00095 cdx-00095.parquet
Unzipping index file ... 


12236207it [01:44, 116936.64it/s]


(21203, 7)
data/cdx-00110.gz data/cdx-00110 cdx-00110.parquet
Unzipping index file ... 


13473493it [01:53, 118222.44it/s]


(2001, 7)
data/cdx-00144.gz data/cdx-00144 cdx-00144.parquet
Unzipping index file ... 


11489176it [01:35, 119691.02it/s]


(2145, 7)
data/cdx-00250.gz data/cdx-00250 cdx-00250.parquet
Unzipping index file ... 


11473585it [01:33, 122654.89it/s]


(393, 7)
data/cdx-00125.gz data/cdx-00125 cdx-00125.parquet
Unzipping index file ... 


12517614it [01:45, 119031.92it/s]


(2874, 7)
data/cdx-00150.gz data/cdx-00150 cdx-00150.parquet
Unzipping index file ... 


17171258it [02:27, 116648.56it/s]


(11541, 7)
data/cdx-00082.gz data/cdx-00082 cdx-00082.parquet


13769861it [01:57, 117269.36it/s]


(1552, 7)
data/cdx-00242.gz data/cdx-00242 cdx-00242.parquet
Unzipping index file ... 


16770890it [02:22, 117309.75it/s]


(103, 7)
data/cdx-00147.gz data/cdx-00147 cdx-00147.parquet
Unzipping index file ... 


12721630it [01:44, 121994.36it/s]


(19767, 7)
data/cdx-00104.gz data/cdx-00104 cdx-00104.parquet
Unzipping index file ... 


13706946it [01:52, 121828.22it/s]


(2937, 7)
data/cdx-00272.gz data/cdx-00272 cdx-00272.parquet
Unzipping index file ... 


12863804it [01:55, 111292.26it/s]


(1230, 7)
data/cdx-00165.gz data/cdx-00165 cdx-00165.parquet
Unzipping index file ... 


14786095it [02:09, 114598.68it/s]


(22, 7)
