# Prepare Corpus
This notebook prepares the newspaper data for analysis of metadata and content. 

In [4]:
import pandas as pd
import os
import re
import sys
from unidecode import unidecode
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

sys.path.append('../')

## Functions

In [5]:
def digit_perc(x):
    '''
    function to calculate the percentage of digits in a text
    '''
    return round(sum(c.isdigit() for c in str(x)) / len(str(x)), 4)

# Preparing Metadata

In [6]:
input_path = '../../../../Dropbox/datasets/newspapers_clean/'
output_path = '../data/'

In [35]:
# here we go through the list of datafiles and preprocess the data and calculate some additional features

results = []
subdirs = ['ads'] # we only select advertisememts
for root, dirs, files in os.walk(input_path):
    if all(subdir in dirs for subdir in subdirs):
        results.append(os.path.join(root,subdirs[0]))


regex_pat = re.compile(r'[^a-zA-Z\s]', flags=re.IGNORECASE)
meta_data = []


for path in results:
    print(path)
    frame = []
    onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
    onlyfiles = [f for f in onlyfiles if f.endswith('.tsv')]
    newspaper_name = path.split('/')[7]
    newspaper_years = []
    for filename in tqdm(onlyfiles):
        print(filename)
        df = pd.read_csv(os.path.join(path, filename), engine='python', index_col=None, delimiter='\t')
        #df['perc_digits'] = df['ocr'].apply(lambda x: digit_perc(x))
        #df['ocr'] = df['ocr'].astype(str)
        #df['ocr'] = df['ocr'].apply(lambda x: unidecode.unidecode(x)) 
        #df['ocr'] = df['ocr'].str.replace(regex_pat, '') #only words
        #df['ocr'] = df['ocr'].str.findall(r'\w{2,}').str.join(' ').str.lower() #only words longer than 2 chars
        df['identifier'] = df['ocr_url'].apply(lambda x: x.split('/')[:][3][12:-4])
        df['string_length'] = df['ocr'].str.len()
        
        if 'size' in df.columns:
            df['character_proportion'] = df['string_length'] / df ['size']
        else:
            df['size'] = df['w'] * df['h']
            df['character_proportion'] = df['string_length'] / df ['size']
        if 'paper_title' in df.columns: #nrc 
            df['newspaper_name'] = df['paper_title']
            df.drop(['issue_id', 'page_id','paper_title','id','area'], axis=1, inplace=True)
        else:
            df['newspaper_name'] = newspaper_name
        
        df.drop(['ocr', 'min_y', 'image_url', 'ocr_url','min_x', 'max_y', 'max_x'], axis=1, inplace=True)
        #ocr.to_csv(os.path.join(path, 'ocr_' + filename), sep='\t', index=None)
        newspaper_years.append(df)

    frame = pd.concat(newspaper_years, axis=0, ignore_index=True)
    meta_data.append(frame)

meta_data = pd.concat(meta_data, axis=0, ignore_index=True)



  0%|          | 0/25 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/nrc/ads
nrc-1970.tsv


  4%|▍         | 1/25 [00:00<00:18,  1.30it/s]

nrc-1971.tsv


  8%|▊         | 2/25 [00:01<00:17,  1.30it/s]

nrc-1973.tsv


 12%|█▏        | 3/25 [00:02<00:17,  1.27it/s]

nrc-1972.tsv


 16%|█▌        | 4/25 [00:03<00:16,  1.24it/s]

nrc-1976.tsv


 20%|██        | 5/25 [00:03<00:15,  1.30it/s]

nrc-1989.tsv


 24%|██▍       | 6/25 [00:05<00:17,  1.09it/s]

nrc-1988.tsv


 28%|██▊       | 7/25 [00:06<00:19,  1.06s/it]

nrc-1977.tsv


 32%|███▏      | 8/25 [00:07<00:17,  1.01s/it]

nrc-1975.tsv


 36%|███▌      | 9/25 [00:08<00:15,  1.05it/s]

nrc-1974.tsv


 40%|████      | 10/25 [00:09<00:13,  1.12it/s]

nrc-1979.tsv


 44%|████▍     | 11/25 [00:09<00:12,  1.16it/s]

nrc-1992.tsv


 48%|████▊     | 12/25 [00:10<00:11,  1.16it/s]

nrc-1986.tsv


 52%|█████▏    | 13/25 [00:11<00:11,  1.08it/s]

nrc-1987.tsv


 56%|█████▌    | 14/25 [00:12<00:10,  1.02it/s]

nrc-1993.tsv


 60%|██████    | 15/25 [00:13<00:09,  1.09it/s]

nrc-1978.tsv


 64%|██████▍   | 16/25 [00:14<00:08,  1.11it/s]

nrc-1985.tsv


 68%|██████▊   | 17/25 [00:15<00:07,  1.09it/s]

nrc-1991.tsv


 72%|███████▏  | 18/25 [00:16<00:06,  1.08it/s]

nrc-1990.tsv


 76%|███████▌  | 19/25 [00:17<00:05,  1.14it/s]

nrc-1984.tsv


 80%|████████  | 20/25 [00:18<00:04,  1.13it/s]

nrc-1980.tsv


 84%|████████▍ | 21/25 [00:18<00:03,  1.17it/s]

nrc-1994.tsv


 88%|████████▊ | 22/25 [00:19<00:02,  1.17it/s]

nrc-1981.tsv


 92%|█████████▏| 23/25 [00:20<00:01,  1.12it/s]

nrc-1983.tsv


 96%|█████████▌| 24/25 [00:21<00:00,  1.14it/s]

nrc-1982.tsv


100%|██████████| 25/25 [00:22<00:00,  1.12it/s]
  0%|          | 0/106 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/detijd/ads
de_tijd_ads_1957.tsv


  2%|▏         | 2/106 [00:00<00:23,  4.48it/s]

de_tijd_ads_1943.tsv
de_tijd_ads_1994.tsv
de_tijd_ads_1980.tsv
de_tijd_ads_1981.tsv
de_tijd_ads_1995.tsv
de_tijd_ads_1942.tsv


  7%|▋         | 7/106 [00:00<00:18,  5.46it/s]

de_tijd_ads_1956.tsv


  8%|▊         | 8/106 [00:01<00:19,  4.94it/s]

de_tijd_ads_1968.tsv
de_tijd_ads_1940.tsv


 10%|█         | 11/106 [00:01<00:17,  5.53it/s]

de_tijd_ads_1954.tsv
de_tijd_ads_1983.tsv
de_tijd_ads_1982.tsv
de_tijd_ads_1955.tsv


 13%|█▎        | 14/106 [00:01<00:14,  6.37it/s]

de_tijd_ads_1941.tsv


 16%|█▌        | 17/106 [00:02<00:18,  4.89it/s]

de_tijd_ads_1969.tsv
de_tijd_ads_1945.tsv
de_tijd_ads_1951.tsv


 17%|█▋        | 18/106 [00:02<00:18,  4.84it/s]

de_tijd_ads_1979.tsv
de_tijd_ads_1986.tsv
de_tijd_ads_1992.tsv
de_tijd_ads_1993.tsv
de_tijd_ads_1987.tsv
de_tijd_ads_1978.tsv
de_tijd_ads_1950.tsv


 25%|██▌       | 27/106 [00:03<00:10,  7.35it/s]

de_tijd_ads_1944.tsv
de_tijd_ads_1952.tsv
de_tijd_ads_1946.tsv


 32%|███▏      | 34/106 [00:03<00:06, 10.90it/s]

de_tijd_ads_1991.tsv
de_tijd_ads_1985.tsv
de_tijd_ads_1984.tsv
de_tijd_ads_1990.tsv
de_tijd_ads_1947.tsv
de_tijd_ads_1953.tsv
de_tijd_ads_1897.tsv
de_tijd_ads_1908.tsv
de_tijd_ads_1934.tsv


 35%|███▍      | 37/106 [00:04<00:10,  6.62it/s]

de_tijd_ads_1920.tsv
de_tijd_ads_1921.tsv


 37%|███▋      | 39/106 [00:04<00:12,  5.55it/s]

de_tijd_ads_1935.tsv
de_tijd_ads_1909.tsv


 39%|███▊      | 41/106 [00:05<00:14,  4.37it/s]

de_tijd_ads_1896.tsv
de_tijd_ads_1894.tsv


 41%|████      | 43/106 [00:05<00:12,  4.89it/s]

de_tijd_ads_1923.tsv


 42%|████▏     | 44/106 [00:06<00:12,  4.85it/s]

de_tijd_ads_1937.tsv


 42%|████▏     | 45/106 [00:06<00:16,  3.61it/s]

de_tijd_ads_1936.tsv


 43%|████▎     | 46/106 [00:06<00:19,  3.03it/s]

de_tijd_ads_1922.tsv


 45%|████▌     | 48/106 [00:07<00:14,  3.88it/s]

de_tijd_ads_1895.tsv
de_tijd_ads_1891.tsv


 46%|████▌     | 49/106 [00:07<00:12,  4.42it/s]

de_tijd_ads_1926.tsv


 47%|████▋     | 50/106 [00:07<00:12,  4.40it/s]

de_tijd_ads_1932.tsv


 48%|████▊     | 51/106 [00:08<00:13,  4.03it/s]

de_tijd_ads_1933.tsv


 49%|████▉     | 52/106 [00:08<00:16,  3.36it/s]

de_tijd_ads_1927.tsv


 51%|█████     | 54/106 [00:08<00:12,  4.33it/s]

de_tijd_ads_1890.tsv
de_tijd_ads_1892.tsv


 52%|█████▏    | 55/106 [00:08<00:10,  4.92it/s]

de_tijd_ads_1931.tsv


 53%|█████▎    | 56/106 [00:09<00:10,  4.67it/s]

de_tijd_ads_1925.tsv


 54%|█████▍    | 57/106 [00:09<00:10,  4.57it/s]

de_tijd_ads_1919.tsv


 56%|█████▌    | 59/106 [00:09<00:10,  4.50it/s]

de_tijd_ads_1918.tsv
de_tijd_ads_1924.tsv


 57%|█████▋    | 60/106 [00:10<00:10,  4.48it/s]

de_tijd_ads_1930.tsv


 58%|█████▊    | 62/106 [00:10<00:11,  3.90it/s]

de_tijd_ads_1893.tsv
de_tijd_ads_1929.tsv


 59%|█████▉    | 63/106 [00:10<00:10,  3.95it/s]

de_tijd_ads_1915.tsv


 61%|██████▏   | 65/106 [00:11<00:09,  4.48it/s]

de_tijd_ads_1901.tsv
de_tijd_ads_1900.tsv


 62%|██████▏   | 66/106 [00:11<00:08,  4.63it/s]

de_tijd_ads_1914.tsv


 63%|██████▎   | 67/106 [00:12<00:11,  3.39it/s]

de_tijd_ads_1928.tsv


 64%|██████▍   | 68/106 [00:12<00:11,  3.25it/s]

de_tijd_ads_1902.tsv


 65%|██████▌   | 69/106 [00:12<00:10,  3.40it/s]

de_tijd_ads_1916.tsv


 66%|██████▌   | 70/106 [00:12<00:10,  3.34it/s]

de_tijd_ads_1917.tsv


 67%|██████▋   | 71/106 [00:13<00:10,  3.21it/s]

de_tijd_ads_1903.tsv


 68%|██████▊   | 72/106 [00:13<00:10,  3.37it/s]

de_tijd_ads_1898.tsv


 69%|██████▉   | 73/106 [00:13<00:09,  3.42it/s]

de_tijd_ads_1907.tsv


 71%|███████   | 75/106 [00:14<00:07,  4.05it/s]

de_tijd_ads_1913.tsv
de_tijd_ads_1912.tsv


 72%|███████▏  | 76/106 [00:14<00:07,  4.22it/s]

de_tijd_ads_1906.tsv


 73%|███████▎  | 77/106 [00:14<00:07,  3.95it/s]

de_tijd_ads_1899.tsv


 74%|███████▎  | 78/106 [00:15<00:07,  3.83it/s]

de_tijd_ads_1910.tsv


 75%|███████▍  | 79/106 [00:15<00:06,  3.94it/s]

de_tijd_ads_1904.tsv


 75%|███████▌  | 80/106 [00:15<00:06,  3.98it/s]

de_tijd_ads_1938.tsv


 76%|███████▋  | 81/106 [00:16<00:08,  2.85it/s]

de_tijd_ads_1939.tsv


 77%|███████▋  | 82/106 [00:16<00:10,  2.32it/s]

de_tijd_ads_1905.tsv


 78%|███████▊  | 83/106 [00:16<00:08,  2.71it/s]

de_tijd_ads_1911.tsv


 79%|███████▉  | 84/106 [00:17<00:07,  3.13it/s]

de_tijd_ads_1976.tsv
de_tijd_ads_1962.tsv
de_tijd_ads_1989.tsv
de_tijd_ads_1988.tsv
de_tijd_ads_1963.tsv
de_tijd_ads_1977.tsv
de_tijd_ads_1949.tsv


 86%|████████▌ | 91/106 [00:17<00:03,  4.27it/s]

de_tijd_ads_1961.tsv
de_tijd_ads_1975.tsv
de_tijd_ads_1974.tsv
de_tijd_ads_1960.tsv
de_tijd_ads_1948.tsv


 91%|█████████ | 96/106 [00:17<00:01,  5.62it/s]

de_tijd_ads_1964.tsv
de_tijd_ads_1970.tsv
de_tijd_ads_1958.tsv


100%|██████████| 106/106 [00:18<00:00,  5.87it/s]


de_tijd_ads_1959.tsv
de_tijd_ads_1971.tsv
de_tijd_ads_1965.tsv
de_tijd_ads_1973.tsv
de_tijd_ads_1967.tsv
de_tijd_ads_1966.tsv
de_tijd_ads_1972.tsv


  0%|          | 0/21 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/nieuwe_rotterdamsche_courant/ads
nieuwe_rotterdamsche_courant_ads_1927.tsv


  5%|▍         | 1/21 [00:01<00:20,  1.04s/it]

nieuwe_rotterdamsche_courant_ads_1926.tsv


 10%|▉         | 2/21 [00:01<00:18,  1.02it/s]

nieuwe_rotterdamsche_courant_ads_1918.tsv


 14%|█▍        | 3/21 [00:02<00:18,  1.01s/it]

nieuwe_rotterdamsche_courant_ads_1924.tsv


 19%|█▉        | 4/21 [00:03<00:16,  1.04it/s]

nieuwe_rotterdamsche_courant_ads_1925.tsv


 24%|██▍       | 5/21 [00:04<00:14,  1.11it/s]

nieuwe_rotterdamsche_courant_ads_1919.tsv


 29%|██▊       | 6/21 [00:05<00:14,  1.03it/s]

nieuwe_rotterdamsche_courant_ads_1921.tsv


 33%|███▎      | 7/21 [00:06<00:13,  1.03it/s]

nieuwe_rotterdamsche_courant_ads_1909.tsv
nieuwe_rotterdamsche_courant_ads_1920.tsv


 43%|████▎     | 9/21 [00:07<00:10,  1.19it/s]

nieuwe_rotterdamsche_courant_ads_1922.tsv


 48%|████▊     | 10/21 [00:08<00:08,  1.27it/s]

nieuwe_rotterdamsche_courant_ads_1923.tsv


 52%|█████▏    | 11/21 [00:09<00:07,  1.25it/s]

nieuwe_rotterdamsche_courant_ads_1912.tsv


 57%|█████▋    | 12/21 [00:09<00:07,  1.28it/s]

nieuwe_rotterdamsche_courant_ads_1913.tsv


 62%|██████▏   | 13/21 [00:10<00:06,  1.30it/s]

nieuwe_rotterdamsche_courant_ads_1911.tsv


 67%|██████▋   | 14/21 [00:11<00:05,  1.32it/s]

nieuwe_rotterdamsche_courant_ads_1910.tsv


 71%|███████▏  | 15/21 [00:12<00:04,  1.34it/s]

nieuwe_rotterdamsche_courant_ads_1914.tsv


 76%|███████▌  | 16/21 [00:12<00:03,  1.35it/s]

nieuwe_rotterdamsche_courant_ads_1928.tsv


 81%|████████  | 17/21 [00:13<00:03,  1.31it/s]

nieuwe_rotterdamsche_courant_ads_1929.tsv


 86%|████████▌ | 18/21 [00:14<00:02,  1.26it/s]

nieuwe_rotterdamsche_courant_ads_1915.tsv


 90%|█████████ | 19/21 [00:15<00:01,  1.30it/s]

nieuwe_rotterdamsche_courant_ads_1917.tsv


 95%|█████████▌| 20/21 [00:16<00:00,  1.22it/s]

nieuwe_rotterdamsche_courant_ads_1916.tsv


100%|██████████| 21/21 [00:17<00:00,  1.23it/s]
  0%|          | 0/105 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/nn/ads
nieuwsblad_noorden_1901.tsv


  1%|          | 1/105 [00:00<00:40,  2.55it/s]

nieuwsblad_noorden_1915.tsv


  2%|▏         | 2/105 [00:00<00:46,  2.20it/s]

nieuwsblad_noorden_1929.tsv


  3%|▎         | 3/105 [00:01<00:56,  1.79it/s]

nieuwsblad_noorden_1928.tsv


  4%|▍         | 4/105 [00:02<01:03,  1.59it/s]

nieuwsblad_noorden_1914.tsv


  6%|▌         | 6/105 [00:03<00:46,  2.12it/s]

nieuwsblad_noorden_1900.tsv
nieuwsblad_noorden_1916.tsv


  7%|▋         | 7/105 [00:03<00:51,  1.91it/s]

nieuwsblad_noorden_1902.tsv


  8%|▊         | 8/105 [00:04<00:48,  2.00it/s]

nieuwsblad_noorden_1903.tsv


  9%|▊         | 9/105 [00:04<00:40,  2.35it/s]

nieuwsblad_noorden_1917.tsv


 10%|▉         | 10/105 [00:05<00:47,  2.00it/s]

nieuwsblad_noorden_1913.tsv


 10%|█         | 11/105 [00:06<00:54,  1.73it/s]

nieuwsblad_noorden_1907.tsv


 12%|█▏        | 13/105 [00:06<00:40,  2.26it/s]

nieuwsblad_noorden_1898.tsv
nieuwsblad_noorden_1899.tsv


 13%|█▎        | 14/105 [00:06<00:32,  2.77it/s]

nieuwsblad_noorden_1906.tsv


 14%|█▍        | 15/105 [00:07<00:37,  2.40it/s]

nieuwsblad_noorden_1912.tsv


 15%|█▌        | 16/105 [00:08<00:43,  2.03it/s]

nieuwsblad_noorden_1938.tsv


 16%|█▌        | 17/105 [00:08<00:50,  1.76it/s]

nieuwsblad_noorden_1904.tsv


 17%|█▋        | 18/105 [00:09<00:49,  1.76it/s]

nieuwsblad_noorden_1910.tsv


 18%|█▊        | 19/105 [00:09<00:45,  1.89it/s]

nieuwsblad_noorden_1911.tsv


 20%|██        | 21/105 [00:10<00:41,  2.04it/s]

nieuwsblad_noorden_1905.tsv
nieuwsblad_noorden_1939.tsv


 21%|██        | 22/105 [00:11<00:45,  1.83it/s]

nieuwsblad_noorden_1989.tsv


 22%|██▏       | 23/105 [00:13<01:18,  1.05it/s]

nieuwsblad_noorden_1962.tsv


 23%|██▎       | 24/105 [00:13<01:04,  1.26it/s]

nieuwsblad_noorden_1976.tsv


 24%|██▍       | 25/105 [00:15<01:10,  1.13it/s]

nieuwsblad_noorden_1977.tsv


 25%|██▍       | 26/105 [00:16<01:18,  1.01it/s]

nieuwsblad_noorden_1963.tsv


 26%|██▌       | 27/105 [00:16<01:08,  1.13it/s]

nieuwsblad_noorden_1988.tsv


 27%|██▋       | 28/105 [00:18<01:18,  1.02s/it]

nieuwsblad_noorden_1975.tsv


 28%|██▊       | 29/105 [00:19<01:19,  1.04s/it]

nieuwsblad_noorden_1961.tsv


 30%|██▉       | 31/105 [00:20<00:51,  1.43it/s]

nieuwsblad_noorden_1949.tsv
nieuwsblad_noorden_1948.tsv


 30%|███       | 32/105 [00:20<00:40,  1.82it/s]

nieuwsblad_noorden_1960.tsv


 31%|███▏      | 33/105 [00:21<00:42,  1.71it/s]

nieuwsblad_noorden_1974.tsv


 32%|███▏      | 34/105 [00:22<00:54,  1.30it/s]

nieuwsblad_noorden_1958.tsv


 33%|███▎      | 35/105 [00:22<00:49,  1.42it/s]

nieuwsblad_noorden_1970.tsv


 34%|███▍      | 36/105 [00:23<00:56,  1.21it/s]

nieuwsblad_noorden_1964.tsv


 35%|███▌      | 37/105 [00:24<00:55,  1.22it/s]

nieuwsblad_noorden_1965.tsv


 36%|███▌      | 38/105 [00:25<00:56,  1.18it/s]

nieuwsblad_noorden_1971.tsv


 37%|███▋      | 39/105 [00:26<01:00,  1.09it/s]

nieuwsblad_noorden_1959.tsv


 38%|███▊      | 40/105 [00:27<00:54,  1.19it/s]

nieuwsblad_noorden_1967.tsv


 39%|███▉      | 41/105 [00:28<00:54,  1.18it/s]

nieuwsblad_noorden_1973.tsv


 40%|████      | 42/105 [00:29<00:56,  1.11it/s]

nieuwsblad_noorden_1972.tsv


 41%|████      | 43/105 [00:30<00:59,  1.04it/s]

nieuwsblad_noorden_1966.tsv


 42%|████▏     | 44/105 [00:31<00:58,  1.05it/s]

nieuwsblad_noorden_1980.tsv


 43%|████▎     | 45/105 [00:32<01:06,  1.11s/it]

nieuwsblad_noorden_1994.tsv


 45%|████▍     | 47/105 [00:33<00:46,  1.25it/s]

nieuwsblad_noorden_1943.tsv
nieuwsblad_noorden_1957.tsv


 46%|████▌     | 48/105 [00:34<00:40,  1.40it/s]

nieuwsblad_noorden_1956.tsv


 47%|████▋     | 49/105 [00:34<00:37,  1.51it/s]

nieuwsblad_noorden_1942.tsv


 48%|████▊     | 50/105 [00:35<00:30,  1.79it/s]

nieuwsblad_noorden_1981.tsv


 49%|████▊     | 51/105 [00:36<00:41,  1.30it/s]

nieuwsblad_noorden_1983.tsv


 50%|████▉     | 52/105 [00:37<00:47,  1.11it/s]

nieuwsblad_noorden_1954.tsv


 50%|█████     | 53/105 [00:38<00:40,  1.28it/s]

nieuwsblad_noorden_1940.tsv


 51%|█████▏    | 54/105 [00:38<00:35,  1.42it/s]

nieuwsblad_noorden_1968.tsv


 52%|█████▏    | 55/105 [00:39<00:38,  1.28it/s]

nieuwsblad_noorden_1969.tsv


 53%|█████▎    | 56/105 [00:40<00:40,  1.20it/s]

nieuwsblad_noorden_1941.tsv


 54%|█████▍    | 57/105 [00:41<00:34,  1.40it/s]

nieuwsblad_noorden_1955.tsv


 55%|█████▌    | 58/105 [00:41<00:27,  1.72it/s]

nieuwsblad_noorden_1982.tsv


 56%|█████▌    | 59/105 [00:42<00:35,  1.29it/s]

nieuwsblad_noorden_1992.tsv


 57%|█████▋    | 60/105 [00:43<00:43,  1.04it/s]

nieuwsblad_noorden_1986.tsv


 58%|█████▊    | 61/105 [00:45<00:46,  1.05s/it]

nieuwsblad_noorden_1979.tsv


 59%|█████▉    | 62/105 [00:46<00:51,  1.19s/it]

nieuwsblad_noorden_1951.tsv


 62%|██████▏   | 65/105 [00:47<00:27,  1.47it/s]

nieuwsblad_noorden_1945.tsv
nieuwsblad_noorden_1944.tsv
nieuwsblad_noorden_1950.tsv


 63%|██████▎   | 66/105 [00:47<00:21,  1.82it/s]

nieuwsblad_noorden_1978.tsv


 64%|██████▍   | 67/105 [00:48<00:30,  1.26it/s]

nieuwsblad_noorden_1987.tsv


 65%|██████▍   | 68/105 [00:50<00:35,  1.03it/s]

nieuwsblad_noorden_1993.tsv


 66%|██████▌   | 69/105 [00:51<00:39,  1.09s/it]

nieuwsblad_noorden_1985.tsv


 67%|██████▋   | 70/105 [00:52<00:41,  1.17s/it]

nieuwsblad_noorden_1991.tsv


 69%|██████▊   | 72/105 [00:54<00:30,  1.07it/s]

nieuwsblad_noorden_1946.tsv
nieuwsblad_noorden_1952.tsv


 70%|██████▉   | 73/105 [00:55<00:24,  1.29it/s]

nieuwsblad_noorden_1953.tsv


 70%|███████   | 74/105 [00:55<00:19,  1.56it/s]

nieuwsblad_noorden_1947.tsv


 71%|███████▏  | 75/105 [00:55<00:15,  1.94it/s]

nieuwsblad_noorden_1990.tsv


 72%|███████▏  | 76/105 [00:57<00:23,  1.24it/s]

nieuwsblad_noorden_1984.tsv


 74%|███████▍  | 78/105 [00:58<00:19,  1.36it/s]

nieuwsblad_noorden_1920.tsv
nieuwsblad_noorden_1934.tsv


 75%|███████▌  | 79/105 [00:59<00:18,  1.40it/s]

nieuwsblad_noorden_1908.tsv


 77%|███████▋  | 81/105 [00:59<00:12,  1.99it/s]

nieuwsblad_noorden_1897.tsv
nieuwsblad_noorden_1896.tsv


 78%|███████▊  | 82/105 [01:00<00:09,  2.43it/s]

nieuwsblad_noorden_1909.tsv


 79%|███████▉  | 83/105 [01:00<00:07,  2.84it/s]

nieuwsblad_noorden_1935.tsv


 80%|████████  | 84/105 [01:01<00:09,  2.25it/s]

nieuwsblad_noorden_1921.tsv
nieuwsblad_noorden_1937.tsv


 82%|████████▏ | 86/105 [01:01<00:07,  2.41it/s]

nieuwsblad_noorden_1923.tsv


 84%|████████▍ | 88/105 [01:02<00:07,  2.25it/s]

nieuwsblad_noorden_1894.tsv
nieuwsblad_noorden_1895.tsv


 85%|████████▍ | 89/105 [01:02<00:05,  2.81it/s]

nieuwsblad_noorden_1922.tsv


 86%|████████▌ | 90/105 [01:03<00:05,  2.51it/s]

nieuwsblad_noorden_1936.tsv


 87%|████████▋ | 91/105 [01:04<00:06,  2.13it/s]

nieuwsblad_noorden_1932.tsv


 88%|████████▊ | 92/105 [01:04<00:07,  1.85it/s]

nieuwsblad_noorden_1926.tsv


 89%|████████▊ | 93/105 [01:05<00:07,  1.71it/s]

nieuwsblad_noorden_1891.tsv
nieuwsblad_noorden_1890.tsv
nieuwsblad_noorden_1927.tsv


 91%|█████████▏| 96/105 [01:06<00:04,  2.03it/s]

nieuwsblad_noorden_1933.tsv


 92%|█████████▏| 97/105 [01:07<00:04,  1.80it/s]

nieuwsblad_noorden_1919.tsv


 93%|█████████▎| 98/105 [01:07<00:04,  1.68it/s]

nieuwsblad_noorden_1925.tsv


 94%|█████████▍| 99/105 [01:08<00:03,  1.61it/s]

nieuwsblad_noorden_1931.tsv


 96%|█████████▌| 101/105 [01:09<00:01,  2.06it/s]

nieuwsblad_noorden_1892.tsv
nieuwsblad_noorden_1893.tsv


 97%|█████████▋| 102/105 [01:09<00:01,  2.69it/s]

nieuwsblad_noorden_1930.tsv


 98%|█████████▊| 103/105 [01:10<00:00,  2.11it/s]

nieuwsblad_noorden_1924.tsv


 99%|█████████▉| 104/105 [01:10<00:00,  1.82it/s]

nieuwsblad_noorden_1918.tsv


100%|██████████| 105/105 [01:11<00:00,  1.47it/s]
  0%|          | 0/99 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/telegraaf/ads
telegraaf_ads_1988.tsv


  1%|          | 1/99 [00:03<05:50,  3.57s/it]

telegraaf_ads_1977.tsv


  2%|▏         | 2/99 [00:06<05:32,  3.43s/it]

telegraaf_ads_1963.tsv


  3%|▎         | 3/99 [00:08<04:28,  2.80s/it]

telegraaf_ads_1962.tsv


  4%|▍         | 4/99 [00:09<03:42,  2.34s/it]

telegraaf_ads_1976.tsv


  5%|▌         | 5/99 [00:11<03:44,  2.39s/it]

telegraaf_ads_1989.tsv


  6%|▌         | 6/99 [00:16<04:33,  2.94s/it]

telegraaf_ads_1960.tsv


  7%|▋         | 7/99 [00:17<03:40,  2.40s/it]

telegraaf_ads_1974.tsv


  9%|▉         | 9/99 [00:19<02:37,  1.75s/it]

telegraaf_ads_1949.tsv
telegraaf_ads_1975.tsv


 10%|█         | 10/99 [00:22<02:53,  1.95s/it]

telegraaf_ads_1961.tsv


 11%|█         | 11/99 [00:23<02:31,  1.73s/it]

telegraaf_ads_1959.tsv


 12%|█▏        | 12/99 [00:24<02:14,  1.55s/it]

telegraaf_ads_1965.tsv


 13%|█▎        | 13/99 [00:26<02:18,  1.61s/it]

telegraaf_ads_1971.tsv


 14%|█▍        | 14/99 [00:28<02:35,  1.83s/it]

telegraaf_ads_1970.tsv


 15%|█▌        | 15/99 [00:31<02:48,  2.00s/it]

telegraaf_ads_1964.tsv


 16%|█▌        | 16/99 [00:32<02:38,  1.90s/it]

telegraaf_ads_1958.tsv


 17%|█▋        | 17/99 [00:33<02:13,  1.63s/it]

telegraaf_ads_1972.tsv


 18%|█▊        | 18/99 [00:35<02:26,  1.81s/it]

telegraaf_ads_1966.tsv


 19%|█▉        | 19/99 [00:37<02:27,  1.84s/it]

telegraaf_ads_1967.tsv


 20%|██        | 20/99 [00:39<02:25,  1.84s/it]

telegraaf_ads_1973.tsv


 22%|██▏       | 22/99 [00:42<01:54,  1.49s/it]

telegraaf_ads_1914.tsv
telegraaf_ads_1900.tsv


 23%|██▎       | 23/99 [00:42<01:26,  1.14s/it]

telegraaf_ads_1928.tsv


 24%|██▍       | 24/99 [00:43<01:23,  1.12s/it]

telegraaf_ads_1929.tsv


 25%|██▌       | 25/99 [00:45<01:23,  1.13s/it]

telegraaf_ads_1901.tsv


 26%|██▋       | 26/99 [00:45<01:04,  1.14it/s]

telegraaf_ads_1915.tsv


 27%|██▋       | 27/99 [00:45<00:53,  1.35it/s]

telegraaf_ads_1903.tsv


 28%|██▊       | 28/99 [00:46<00:43,  1.63it/s]

telegraaf_ads_1917.tsv


 29%|██▉       | 29/99 [00:46<00:42,  1.64it/s]

telegraaf_ads_1916.tsv


 30%|███       | 30/99 [00:47<00:38,  1.77it/s]

telegraaf_ads_1902.tsv


 31%|███▏      | 31/99 [00:47<00:32,  2.06it/s]

telegraaf_ads_1899.tsv


 32%|███▏      | 32/99 [00:47<00:29,  2.25it/s]

telegraaf_ads_1906.tsv


 33%|███▎      | 33/99 [00:48<00:29,  2.27it/s]

telegraaf_ads_1912.tsv


 34%|███▍      | 34/99 [00:48<00:27,  2.34it/s]

telegraaf_ads_1913.tsv


 35%|███▌      | 35/99 [00:49<00:28,  2.22it/s]

telegraaf_ads_1907.tsv


 36%|███▋      | 36/99 [00:49<00:27,  2.26it/s]

telegraaf_ads_1898.tsv


 37%|███▋      | 37/99 [00:49<00:25,  2.41it/s]

telegraaf_ads_1939.tsv


 38%|███▊      | 38/99 [00:50<00:37,  1.61it/s]

telegraaf_ads_1911.tsv


 39%|███▉      | 39/99 [00:51<00:34,  1.74it/s]

telegraaf_ads_1905.tsv


 40%|████      | 40/99 [00:51<00:29,  2.03it/s]

telegraaf_ads_1904.tsv


 41%|████▏     | 41/99 [00:52<00:26,  2.18it/s]

telegraaf_ads_1910.tsv


 42%|████▏     | 42/99 [00:52<00:26,  2.19it/s]

telegraaf_ads_1938.tsv


 43%|████▎     | 43/99 [00:53<00:38,  1.44it/s]

telegraaf_ads_1896.tsv


 44%|████▍     | 44/99 [00:54<00:31,  1.73it/s]

telegraaf_ads_1935.tsv


 45%|████▌     | 45/99 [00:55<00:38,  1.40it/s]

telegraaf_ads_1921.tsv


 46%|████▋     | 46/99 [00:55<00:38,  1.39it/s]

telegraaf_ads_1909.tsv


 47%|████▋     | 47/99 [00:56<00:33,  1.57it/s]

telegraaf_ads_1908.tsv


 48%|████▊     | 48/99 [00:56<00:29,  1.72it/s]

telegraaf_ads_1920.tsv


 49%|████▉     | 49/99 [00:57<00:28,  1.73it/s]

telegraaf_ads_1934.tsv


 51%|█████     | 50/99 [00:58<00:37,  1.29it/s]

telegraaf_ads_1897.tsv


 52%|█████▏    | 51/99 [00:58<00:30,  1.57it/s]

telegraaf_ads_1895.tsv


 53%|█████▎    | 52/99 [00:59<00:24,  1.92it/s]

telegraaf_ads_1922.tsv


 54%|█████▎    | 53/99 [00:59<00:25,  1.78it/s]

telegraaf_ads_1936.tsv


 55%|█████▍    | 54/99 [01:00<00:33,  1.35it/s]

telegraaf_ads_1937.tsv


 56%|█████▌    | 55/99 [01:02<00:38,  1.14it/s]

telegraaf_ads_1923.tsv


 57%|█████▋    | 56/99 [01:03<00:38,  1.12it/s]

telegraaf_ads_1894.tsv


 58%|█████▊    | 57/99 [01:03<00:29,  1.43it/s]

telegraaf_ads_1927.tsv


 59%|█████▊    | 58/99 [01:04<00:33,  1.23it/s]

telegraaf_ads_1933.tsv


 60%|█████▉    | 59/99 [01:05<00:36,  1.09it/s]

telegraaf_ads_1932.tsv


 61%|██████    | 60/99 [01:06<00:37,  1.03it/s]

telegraaf_ads_1926.tsv


 63%|██████▎   | 62/99 [01:08<00:29,  1.27it/s]

telegraaf_ads_1893.tsv
telegraaf_ads_1918.tsv


 64%|██████▎   | 63/99 [01:08<00:25,  1.43it/s]

telegraaf_ads_1930.tsv


 65%|██████▍   | 64/99 [01:09<00:31,  1.10it/s]

telegraaf_ads_1924.tsv


 66%|██████▌   | 65/99 [01:10<00:31,  1.08it/s]

telegraaf_ads_1925.tsv


 67%|██████▋   | 66/99 [01:11<00:30,  1.06it/s]

telegraaf_ads_1931.tsv


 68%|██████▊   | 67/99 [01:13<00:32,  1.00s/it]

telegraaf_ads_1919.tsv


 69%|██████▊   | 68/99 [01:13<00:28,  1.09it/s]

telegraaf_ads_1981.tsv


 70%|██████▉   | 69/99 [01:16<00:47,  1.57s/it]

telegraaf_ads_1956.tsv


 71%|███████   | 70/99 [01:17<00:40,  1.40s/it]

telegraaf_ads_1942.tsv


 72%|███████▏  | 71/99 [01:18<00:32,  1.17s/it]

telegraaf_ads_1943.tsv


 73%|███████▎  | 72/99 [01:18<00:24,  1.10it/s]

telegraaf_ads_1957.tsv


 74%|███████▎  | 73/99 [01:19<00:23,  1.10it/s]

telegraaf_ads_1980.tsv


 75%|███████▍  | 74/99 [01:23<00:40,  1.62s/it]

telegraaf_ads_1994.tsv


 76%|███████▌  | 75/99 [01:28<01:06,  2.78s/it]

telegraaf_ads_1982.tsv


 77%|███████▋  | 76/99 [01:31<01:04,  2.78s/it]

telegraaf_ads_1941.tsv


 78%|███████▊  | 77/99 [01:32<00:49,  2.25s/it]

telegraaf_ads_1955.tsv


 79%|███████▉  | 78/99 [01:33<00:37,  1.79s/it]

telegraaf_ads_1969.tsv


 80%|███████▉  | 79/99 [01:35<00:40,  2.05s/it]

telegraaf_ads_1968.tsv


 81%|████████  | 80/99 [01:37<00:39,  2.10s/it]

telegraaf_ads_1954.tsv


 82%|████████▏ | 81/99 [01:38<00:29,  1.67s/it]

telegraaf_ads_1940.tsv


 83%|████████▎ | 82/99 [01:39<00:23,  1.41s/it]

telegraaf_ads_1983.tsv


 84%|████████▍ | 83/99 [01:43<00:34,  2.18s/it]

telegraaf_ads_1987.tsv


 85%|████████▍ | 84/99 [01:47<00:39,  2.65s/it]

telegraaf_ads_1993.tsv


 86%|████████▌ | 85/99 [01:51<00:46,  3.30s/it]

telegraaf_ads_1978.tsv


 87%|████████▋ | 86/99 [01:55<00:44,  3.39s/it]

telegraaf_ads_1944.tsv


 88%|████████▊ | 87/99 [01:55<00:29,  2.47s/it]

telegraaf_ads_1950.tsv


 89%|████████▉ | 88/99 [01:56<00:20,  1.86s/it]

telegraaf_ads_1951.tsv


 90%|████████▉ | 89/99 [01:56<00:14,  1.49s/it]

telegraaf_ads_1945.tsv
telegraaf_ads_1979.tsv


 92%|█████████▏| 91/99 [02:00<00:12,  1.62s/it]

telegraaf_ads_1992.tsv


 93%|█████████▎| 92/99 [02:05<00:17,  2.51s/it]

telegraaf_ads_1986.tsv


 94%|█████████▍| 93/99 [02:08<00:16,  2.72s/it]

telegraaf_ads_1990.tsv


 95%|█████████▍| 94/99 [02:13<00:16,  3.29s/it]

telegraaf_ads_1984.tsv


 96%|█████████▌| 95/99 [02:16<00:12,  3.20s/it]

telegraaf_ads_1953.tsv


 97%|█████████▋| 96/99 [02:16<00:07,  2.41s/it]

telegraaf_ads_1952.tsv


 98%|█████████▊| 97/99 [02:17<00:03,  1.82s/it]

telegraaf_ads_1985.tsv


 99%|█████████▉| 98/99 [02:20<00:02,  2.15s/it]

telegraaf_ads_1991.tsv


100%|██████████| 99/99 [02:24<00:00,  1.46s/it]
  0%|          | 0/27 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/vaderland/ads
vaderland_ads_1938.tsv


  4%|▎         | 1/27 [00:00<00:09,  2.71it/s]

vaderland_ads_1939.tsv


  7%|▋         | 2/27 [00:00<00:09,  2.77it/s]

vaderland_ads_1929.tsv


 11%|█         | 3/27 [00:01<00:09,  2.52it/s]

vaderland_ads_1928.tsv


 22%|██▏       | 6/27 [00:01<00:06,  3.17it/s]

vaderland_ads_1945.tsv
vaderland_ads_1944.tsv
vaderland_ads_1940.tsv


 26%|██▌       | 7/27 [00:02<00:06,  3.32it/s]

vaderland_ads_1941.tsv


 33%|███▎      | 9/27 [00:02<00:04,  3.85it/s]

vaderland_ads_1943.tsv
vaderland_ads_1942.tsv


 37%|███▋      | 10/27 [00:02<00:04,  3.79it/s]

vaderland_ads_1919.tsv
vaderland_ads_1931.tsv


 44%|████▍     | 12/27 [00:03<00:03,  3.92it/s]

vaderland_ads_1925.tsv


 48%|████▊     | 13/27 [00:03<00:03,  3.59it/s]

vaderland_ads_1924.tsv


 52%|█████▏    | 14/27 [00:03<00:03,  3.36it/s]

vaderland_ads_1930.tsv


 56%|█████▌    | 15/27 [00:04<00:04,  2.52it/s]

vaderland_ads_1926.tsv


 59%|█████▉    | 16/27 [00:04<00:04,  2.54it/s]

vaderland_ads_1932.tsv


 63%|██████▎   | 17/27 [00:05<00:04,  2.41it/s]

vaderland_ads_1933.tsv


 67%|██████▋   | 18/27 [00:05<00:03,  2.29it/s]

vaderland_ads_1927.tsv


 70%|███████   | 19/27 [00:06<00:03,  2.25it/s]

vaderland_ads_1923.tsv


 74%|███████▍  | 20/27 [00:06<00:02,  2.39it/s]

vaderland_ads_1937.tsv


 78%|███████▊  | 21/27 [00:07<00:02,  2.38it/s]

vaderland_ads_1936.tsv


 81%|████████▏ | 22/27 [00:07<00:02,  2.32it/s]

vaderland_ads_1922.tsv


 85%|████████▌ | 23/27 [00:07<00:01,  2.43it/s]

vaderland_ads_1934.tsv


 89%|████████▉ | 24/27 [00:08<00:01,  2.25it/s]

vaderland_ads_1920.tsv


 93%|█████████▎| 25/27 [00:08<00:00,  2.34it/s]

vaderland_ads_1921.tsv


 96%|█████████▋| 26/27 [00:09<00:00,  2.39it/s]

vaderland_ads_1935.tsv


100%|██████████| 27/27 [00:09<00:00,  2.78it/s]
  0%|          | 0/76 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/algemeen_handelsblad/ads
ah_ads_1927.tsv


  1%|▏         | 1/76 [00:00<01:09,  1.09it/s]

ah_ads_1933.tsv


  3%|▎         | 2/76 [00:01<01:00,  1.22it/s]

ah_ads_1890.tsv
ah_ads_1891.tsv
ah_ads_1932.tsv


  7%|▋         | 5/76 [00:02<00:46,  1.51it/s]

ah_ads_1926.tsv


  8%|▊         | 6/76 [00:03<00:51,  1.36it/s]

ah_ads_1930.tsv


  9%|▉         | 7/76 [00:04<00:56,  1.23it/s]

ah_ads_1924.tsv


 11%|█         | 8/76 [00:04<00:51,  1.33it/s]

ah_ads_1918.tsv
ah_ads_1893.tsv
ah_ads_1892.tsv
ah_ads_1925.tsv


 16%|█▌        | 12/76 [00:05<00:38,  1.68it/s]

ah_ads_1931.tsv


 18%|█▊        | 14/76 [00:06<00:33,  1.86it/s]

ah_ads_1909.tsv
ah_ads_1935.tsv


 20%|█▉        | 15/76 [00:07<00:30,  2.03it/s]

ah_ads_1921.tsv


 21%|██        | 16/76 [00:08<00:33,  1.77it/s]

ah_ads_1896.tsv
ah_ads_1897.tsv
ah_ads_1920.tsv
ah_ads_1934.tsv


 29%|██▉       | 22/76 [00:08<00:18,  2.97it/s]

ah_ads_1908.tsv
ah_ads_1922.tsv
ah_ads_1936.tsv


 30%|███       | 23/76 [00:08<00:14,  3.60it/s]

ah_ads_1895.tsv
ah_ads_1894.tsv
ah_ads_1937.tsv


 34%|███▍      | 26/76 [00:09<00:11,  4.49it/s]

ah_ads_1923.tsv


 36%|███▌      | 27/76 [00:10<00:18,  2.64it/s]

ah_ads_1950.tsv


 37%|███▋      | 28/76 [00:10<00:18,  2.53it/s]

ah_ads_1951.tsv


 38%|███▊      | 29/76 [00:10<00:16,  2.88it/s]

ah_ads_1945.tsv
ah_ads_1953.tsv


 41%|████      | 31/76 [00:11<00:14,  3.14it/s]

ah_ads_1947.tsv


 43%|████▎     | 33/76 [00:11<00:11,  3.61it/s]

ah_ads_1946.tsv
ah_ads_1952.tsv


 45%|████▍     | 34/76 [00:12<00:13,  3.15it/s]

ah_ads_1956.tsv


 46%|████▌     | 35/76 [00:12<00:15,  2.59it/s]

ah_ads_1957.tsv


 47%|████▋     | 36/76 [00:13<00:17,  2.22it/s]

ah_ads_1969.tsv


 49%|████▊     | 37/76 [00:13<00:19,  2.03it/s]

ah_ads_1955.tsv


 50%|█████     | 38/76 [00:14<00:19,  2.00it/s]

ah_ads_1954.tsv


 51%|█████▏    | 39/76 [00:14<00:18,  2.05it/s]

ah_ads_1968.tsv


 53%|█████▎    | 40/76 [00:15<00:18,  1.90it/s]

ah_ads_1965.tsv


 54%|█████▍    | 41/76 [00:16<00:19,  1.77it/s]

ah_ads_1959.tsv


 55%|█████▌    | 42/76 [00:16<00:19,  1.70it/s]

ah_ads_1958.tsv


 57%|█████▋    | 43/76 [00:17<00:19,  1.70it/s]

ah_ads_1970.tsv


 58%|█████▊    | 44/76 [00:17<00:16,  1.90it/s]

ah_ads_1964.tsv


 59%|█████▉    | 45/76 [00:18<00:16,  1.89it/s]

ah_ads_1966.tsv


 61%|██████    | 46/76 [00:18<00:16,  1.80it/s]

ah_ads_1967.tsv


 62%|██████▏   | 47/76 [00:19<00:16,  1.78it/s]

ah_ads_1963.tsv


 63%|██████▎   | 48/76 [00:20<00:15,  1.76it/s]

ah_ads_1962.tsv


 66%|██████▌   | 50/76 [00:20<00:12,  2.15it/s]

ah_ads_1948.tsv
ah_ads_1960.tsv


 67%|██████▋   | 51/76 [00:21<00:13,  1.92it/s]

ah_ads_1961.tsv


 68%|██████▊   | 52/76 [00:22<00:13,  1.79it/s]

ah_ads_1949.tsv


 71%|███████   | 54/76 [00:22<00:08,  2.52it/s]

ah_ads_1906.tsv
ah_ads_1912.tsv
ah_ads_1899.tsv
ah_ads_1898.tsv
ah_ads_1913.tsv
ah_ads_1907.tsv


 78%|███████▊  | 59/76 [00:22<00:04,  3.46it/s]

ah_ads_1911.tsv
ah_ads_1905.tsv
ah_ads_1939.tsv


 82%|████████▏ | 62/76 [00:23<00:03,  3.98it/s]

ah_ads_1964-11.tsv
ah_ads_1938.tsv


 84%|████████▍ | 64/76 [00:23<00:03,  3.92it/s]

ah_ads_1904.tsv
ah_ads_1910.tsv
ah_ads_1928.tsv


 88%|████████▊ | 67/76 [00:24<00:02,  3.69it/s]

ah_ads_1914.tsv
ah_ads_1900.tsv
ah_ads_1901.tsv
ah_ads_1915.tsv
ah_ads_1929.tsv


100%|██████████| 76/76 [00:25<00:00,  2.94it/s]


ah_ads_1903.tsv
ah_ads_1917.tsv
ah_ads_1916.tsv
ah_ads_1902.tsv


  0%|          | 0/27 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/het_volk/ads
volk_ads_1926.tsv


  4%|▎         | 1/27 [00:00<00:07,  3.49it/s]

volk_ads_1932.tsv


  7%|▋         | 2/27 [00:00<00:07,  3.28it/s]

volk_ads_1933.tsv


 11%|█         | 3/27 [00:00<00:07,  3.25it/s]

volk_ads_1927.tsv


 19%|█▊        | 5/27 [00:01<00:06,  3.60it/s]

volk_ads_1919.tsv
volk_ads_1931.tsv


 22%|██▏       | 6/27 [00:01<00:06,  3.31it/s]

volk_ads_1925.tsv


 26%|██▌       | 7/27 [00:02<00:06,  3.26it/s]

volk_ads_1924.tsv


 30%|██▉       | 8/27 [00:02<00:05,  3.59it/s]

volk_ads_1930.tsv


 33%|███▎      | 9/27 [00:02<00:05,  3.41it/s]

volk_ads_1934.tsv
volk_ads_1920.tsv


 44%|████▍     | 12/27 [00:03<00:03,  4.65it/s]

volk_ads_1921.tsv
volk_ads_1935.tsv
volk_ads_1923.tsv


 63%|██████▎   | 17/27 [00:03<00:01,  7.01it/s]

volk_ads_1937.tsv
volk_ads_1936.tsv
volk_ads_1922.tsv
volk_ads_1945.tsv
volk_ads_1944.tsv


 70%|███████   | 19/27 [00:03<00:00,  8.35it/s]

volk_ads_1943.tsv
volk_ads_1942.tsv


 78%|███████▊  | 21/27 [00:03<00:00,  7.65it/s]

volk_ads_1940.tsv


 85%|████████▌ | 23/27 [00:04<00:00,  6.10it/s]

volk_ads_1941.tsv
volk_ads_1938.tsv
volk_ads_1939.tsv
volk_ads_1929.tsv


 96%|█████████▋| 26/27 [00:04<00:00,  6.69it/s]

volk_ads_1928.tsv


100%|██████████| 27/27 [00:04<00:00,  5.45it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/trouw/ads
trouw_ads_1952.tsv


  4%|▍         | 2/50 [00:00<00:10,  4.45it/s]

trouw_ads_1946.tsv
trouw_ads_1991.tsv


  6%|▌         | 3/50 [00:01<00:22,  2.09it/s]

trouw_ads_1985.tsv


  8%|▊         | 4/50 [00:02<00:29,  1.58it/s]

trouw_ads_1984.tsv


 10%|█         | 5/50 [00:03<00:32,  1.38it/s]

trouw_ads_1990.tsv


 12%|█▏        | 6/50 [00:04<00:37,  1.18it/s]

trouw_ads_1947.tsv


 14%|█▍        | 7/50 [00:04<00:28,  1.48it/s]

trouw_ads_1953.tsv


 16%|█▌        | 8/50 [00:05<00:24,  1.73it/s]

trouw_ads_1951.tsv


 18%|█▊        | 9/50 [00:05<00:19,  2.05it/s]

trouw_ads_1979.tsv


 20%|██        | 10/50 [00:06<00:30,  1.31it/s]

trouw_ads_1986.tsv


 22%|██▏       | 11/50 [00:07<00:32,  1.19it/s]

trouw_ads_1992.tsv


 24%|██▍       | 12/50 [00:08<00:35,  1.08it/s]

trouw_ads_1993.tsv


 26%|██▌       | 13/50 [00:10<00:36,  1.00it/s]

trouw_ads_1987.tsv


 28%|██▊       | 14/50 [00:11<00:35,  1.00it/s]

trouw_ads_1978.tsv


 32%|███▏      | 16/50 [00:12<00:26,  1.27it/s]

trouw_ads_1950.tsv
trouw_ads_1968.tsv


 34%|███▍      | 17/50 [00:13<00:25,  1.31it/s]

trouw_ads_1954.tsv


 36%|███▌      | 18/50 [00:13<00:21,  1.51it/s]

trouw_ads_1983.tsv


 38%|███▊      | 19/50 [00:14<00:22,  1.36it/s]

trouw_ads_1982.tsv


 40%|████      | 20/50 [00:15<00:25,  1.19it/s]

trouw_ads_1955.tsv


 42%|████▏     | 21/50 [00:16<00:21,  1.37it/s]

trouw_ads_1969.tsv


 44%|████▍     | 22/50 [00:16<00:20,  1.38it/s]

trouw_ads_1957.tsv


 46%|████▌     | 23/50 [00:17<00:17,  1.54it/s]

trouw_ads_1994.tsv


 48%|████▊     | 24/50 [00:18<00:21,  1.23it/s]

trouw_ads_1980.tsv


 50%|█████     | 25/50 [00:19<00:24,  1.04it/s]

trouw_ads_1981.tsv


 52%|█████▏    | 26/50 [00:20<00:24,  1.02s/it]

trouw_ads_1995.tsv


 54%|█████▍    | 27/50 [00:22<00:24,  1.07s/it]

trouw_ads_1956.tsv


 56%|█████▌    | 28/50 [00:22<00:19,  1.12it/s]

trouw_ads_1973.tsv


 58%|█████▊    | 29/50 [00:23<00:17,  1.22it/s]

trouw_ads_1967.tsv


 60%|██████    | 30/50 [00:23<00:15,  1.32it/s]

trouw_ads_1966.tsv


 62%|██████▏   | 31/50 [00:24<00:14,  1.36it/s]

trouw_ads_1972.tsv


 64%|██████▍   | 32/50 [00:25<00:12,  1.45it/s]

trouw_ads_1964.tsv


 66%|██████▌   | 33/50 [00:25<00:10,  1.60it/s]

trouw_ads_1970.tsv


 68%|██████▊   | 34/50 [00:26<00:10,  1.58it/s]

trouw_ads_1958.tsv


 70%|███████   | 35/50 [00:26<00:08,  1.74it/s]

trouw_ads_1959.tsv


 72%|███████▏  | 36/50 [00:27<00:07,  1.81it/s]

trouw_ads_1971.tsv


 74%|███████▍  | 37/50 [00:27<00:07,  1.82it/s]

trouw_ads_1965.tsv


 76%|███████▌  | 38/50 [00:28<00:06,  1.80it/s]

trouw_ads_1949.tsv


 78%|███████▊  | 39/50 [00:28<00:05,  2.19it/s]

trouw_ads_1961.tsv


 80%|████████  | 40/50 [00:29<00:04,  2.08it/s]

trouw_ads_1975.tsv


 82%|████████▏ | 41/50 [00:29<00:04,  1.99it/s]

trouw_ads_1974.tsv


 84%|████████▍ | 42/50 [00:30<00:04,  1.84it/s]

trouw_ads_1960.tsv


 86%|████████▌ | 43/50 [00:30<00:03,  1.89it/s]

trouw_ads_1948.tsv


 88%|████████▊ | 44/50 [00:31<00:02,  2.12it/s]

trouw_ads_1976.tsv


 90%|█████████ | 45/50 [00:31<00:02,  1.94it/s]

trouw_ads_1962.tsv


 92%|█████████▏| 46/50 [00:32<00:02,  1.98it/s]

trouw_ads_1989.tsv


 94%|█████████▍| 47/50 [00:33<00:02,  1.48it/s]

trouw_ads_1988.tsv


 96%|█████████▌| 48/50 [00:34<00:01,  1.25it/s]

trouw_ads_1963.tsv


 98%|█████████▊| 49/50 [00:34<00:00,  1.42it/s]

trouw_ads_1977.tsv


100%|██████████| 50/50 [00:35<00:00,  1.41it/s]
  0%|          | 0/56 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/vk/ads
vk_ads_1979.tsv


  2%|▏         | 1/56 [00:01<01:19,  1.44s/it]

vk_ads_1945.tsv
vk_ads_1951.tsv


  5%|▌         | 3/56 [00:01<00:56,  1.06s/it]

vk_ads_1986.tsv


  7%|▋         | 4/56 [00:03<01:02,  1.19s/it]

vk_ads_1992.tsv


  9%|▉         | 5/56 [00:04<01:07,  1.32s/it]

vk_ads_1993.tsv


 11%|█         | 6/56 [00:06<01:09,  1.38s/it]

vk_ads_1987.tsv


 12%|█▎        | 7/56 [00:07<01:09,  1.43s/it]

vk_ads_1950.tsv


 14%|█▍        | 8/56 [00:08<00:52,  1.10s/it]

vk_ads_1944.tsv
vk_ads_1978.tsv


 18%|█▊        | 10/56 [00:09<00:44,  1.03it/s]

vk_ads_1952.tsv


 21%|██▏       | 12/56 [00:10<00:26,  1.67it/s]

vk_ads_1946.tsv
vk_ads_1991.tsv


 23%|██▎       | 13/56 [00:11<00:40,  1.07it/s]

vk_ads_1985.tsv


 25%|██▌       | 14/56 [00:13<00:45,  1.09s/it]

vk_ads_1984.tsv


 27%|██▋       | 15/56 [00:14<00:45,  1.12s/it]

vk_ads_1990.tsv


 29%|██▊       | 16/56 [00:16<00:50,  1.26s/it]

vk_ads_1947.tsv


 30%|███       | 17/56 [00:16<00:37,  1.04it/s]

vk_ads_1953.tsv


 32%|███▏      | 18/56 [00:16<00:30,  1.26it/s]

vk_ads_1957.tsv


 34%|███▍      | 19/56 [00:17<00:26,  1.40it/s]

vk_ads_1943.tsv
vk_ads_1994.tsv


 38%|███▊      | 21/56 [00:18<00:25,  1.37it/s]

vk_ads_1980.tsv


 39%|███▉      | 22/56 [00:20<00:32,  1.05it/s]

vk_ads_1981.tsv


 41%|████      | 23/56 [00:21<00:34,  1.04s/it]

vk_ads_1995.tsv


 43%|████▎     | 24/56 [00:23<00:38,  1.21s/it]

vk_ads_1942.tsv
vk_ads_1956.tsv


 48%|████▊     | 27/56 [00:23<00:19,  1.47it/s]

vk_ads_1940.tsv
vk_ads_1954.tsv


 50%|█████     | 28/56 [00:24<00:16,  1.65it/s]

vk_ads_1968.tsv


 52%|█████▏    | 29/56 [00:25<00:18,  1.43it/s]

vk_ads_1983.tsv


 54%|█████▎    | 30/56 [00:26<00:20,  1.30it/s]

vk_ads_1982.tsv


 55%|█████▌    | 31/56 [00:27<00:21,  1.15it/s]

vk_ads_1969.tsv


 57%|█████▋    | 32/56 [00:28<00:22,  1.09it/s]

vk_ads_1955.tsv


 59%|█████▉    | 33/56 [00:28<00:18,  1.26it/s]

vk_ads_1941.tsv
vk_ads_1958.tsv


 62%|██████▎   | 35/56 [00:29<00:13,  1.53it/s]

vk_ads_1964.tsv


 64%|██████▍   | 36/56 [00:30<00:14,  1.39it/s]

vk_ads_1970.tsv


 66%|██████▌   | 37/56 [00:31<00:14,  1.28it/s]

vk_ads_1971.tsv


 68%|██████▊   | 38/56 [00:31<00:14,  1.25it/s]

vk_ads_1965.tsv


 70%|██████▉   | 39/56 [00:32<00:14,  1.19it/s]

vk_ads_1959.tsv


 71%|███████▏  | 40/56 [00:33<00:12,  1.25it/s]

vk_ads_1973.tsv


 73%|███████▎  | 41/56 [00:34<00:12,  1.23it/s]

vk_ads_1967.tsv


 75%|███████▌  | 42/56 [00:35<00:11,  1.24it/s]

vk_ads_1966.tsv


 77%|███████▋  | 43/56 [00:36<00:10,  1.21it/s]

vk_ads_1972.tsv


 79%|███████▊  | 44/56 [00:36<00:09,  1.21it/s]

vk_ads_1976.tsv


 80%|████████  | 45/56 [00:37<00:09,  1.20it/s]

vk_ads_1962.tsv


 82%|████████▏ | 46/56 [00:38<00:08,  1.24it/s]

vk_ads_1989.tsv


 84%|████████▍ | 47/56 [00:40<00:09,  1.01s/it]

vk_ads_1988.tsv


 86%|████████▌ | 48/56 [00:41<00:09,  1.17s/it]

vk_ads_1963.tsv


 88%|████████▊ | 49/56 [00:42<00:07,  1.06s/it]

vk_ads_1977.tsv


 89%|████████▉ | 50/56 [00:43<00:06,  1.05s/it]

vk_ads_1961.tsv


 91%|█████████ | 51/56 [00:44<00:04,  1.04it/s]

vk_ads_1975.tsv


 93%|█████████▎| 52/56 [00:45<00:03,  1.06it/s]

vk_ads_1949.tsv


 95%|█████████▍| 53/56 [00:45<00:02,  1.32it/s]

vk_ads_1948.tsv


 96%|█████████▋| 54/56 [00:45<00:01,  1.62it/s]

vk_ads_1974.tsv


 98%|█████████▊| 55/56 [00:46<00:00,  1.41it/s]

vk_ads_1960.tsv


100%|██████████| 56/56 [00:47<00:00,  1.18it/s]
  0%|          | 0/47 [00:00<?, ?it/s]

../../../../Dropbox/datasets/newspapers_clean/vv/ads
vrije_volk_ads_1947.tsv


  2%|▏         | 1/47 [00:00<00:13,  3.52it/s]

vrije_volk_ads_1953.tsv


  4%|▍         | 2/47 [00:00<00:15,  2.85it/s]

vrije_volk_ads_1984.tsv


  6%|▋         | 3/47 [00:02<00:32,  1.37it/s]

vrije_volk_ads_1990.tsv


  9%|▊         | 4/47 [00:04<00:43,  1.02s/it]

vrije_volk_ads_1991.tsv


 11%|█         | 5/47 [00:04<00:35,  1.18it/s]

vrije_volk_ads_1985.tsv


 13%|█▎        | 6/47 [00:05<00:42,  1.03s/it]

vrije_volk_ads_1952.tsv


 17%|█▋        | 8/47 [00:06<00:25,  1.52it/s]

vrije_volk_ads_1946.tsv
vrije_volk_ads_1950.tsv


 19%|█▉        | 9/47 [00:07<00:21,  1.75it/s]

vrije_volk_ads_1978.tsv


 21%|██▏       | 10/47 [00:08<00:36,  1.03it/s]

vrije_volk_ads_1987.tsv


 23%|██▎       | 11/47 [00:10<00:40,  1.12s/it]

vrije_volk_ads_1986.tsv


 26%|██▌       | 12/47 [00:11<00:43,  1.25s/it]

vrije_volk_ads_1979.tsv


 30%|██▉       | 14/47 [00:14<00:35,  1.07s/it]

vrije_volk_ads_1945.tsv
vrije_volk_ads_1951.tsv


 32%|███▏      | 15/47 [00:14<00:28,  1.13it/s]

vrije_volk_ads_1969.tsv


 34%|███▍      | 16/47 [00:15<00:31,  1.02s/it]

vrije_volk_ads_1955.tsv


 36%|███▌      | 17/47 [00:16<00:27,  1.08it/s]

vrije_volk_ads_1982.tsv


 38%|███▊      | 18/47 [00:18<00:32,  1.10s/it]

vrije_volk_ads_1983.tsv


 40%|████      | 19/47 [00:19<00:34,  1.21s/it]

vrije_volk_ads_1954.tsv


 43%|████▎     | 20/47 [00:20<00:27,  1.03s/it]

vrije_volk_ads_1968.tsv


 45%|████▍     | 21/47 [00:21<00:28,  1.09s/it]

vrije_volk_ads_1956.tsv


 47%|████▋     | 22/47 [00:22<00:24,  1.01it/s]

vrije_volk_ads_1981.tsv


 49%|████▉     | 23/47 [00:23<00:29,  1.24s/it]

vrije_volk_ads_1980.tsv


 51%|█████     | 24/47 [00:25<00:33,  1.45s/it]

vrije_volk_ads_1957.tsv


 53%|█████▎    | 25/47 [00:26<00:27,  1.27s/it]

vrije_volk_ads_1966.tsv


 55%|█████▌    | 26/47 [00:28<00:26,  1.27s/it]

vrije_volk_ads_1972.tsv


 57%|█████▋    | 27/47 [00:29<00:27,  1.39s/it]

vrije_volk_ads_1973.tsv


 60%|█████▉    | 28/47 [00:31<00:27,  1.47s/it]

vrije_volk_ads_1967.tsv


 62%|██████▏   | 29/47 [00:32<00:24,  1.37s/it]

vrije_volk_ads_1971.tsv


 64%|██████▍   | 30/47 [00:33<00:22,  1.34s/it]

vrije_volk_ads_1965.tsv


 66%|██████▌   | 31/47 [00:35<00:21,  1.32s/it]

vrije_volk_ads_1959.tsv


 68%|██████▊   | 32/47 [00:35<00:18,  1.21s/it]

vrije_volk_ads_1958.tsv


 70%|███████   | 33/47 [00:36<00:15,  1.10s/it]

vrije_volk_ads_1964.tsv


 72%|███████▏  | 34/47 [00:37<00:13,  1.07s/it]

vrije_volk_ads_1970.tsv


 74%|███████▍  | 35/47 [00:39<00:13,  1.16s/it]

vrije_volk_ads_1948.tsv


 77%|███████▋  | 36/47 [00:39<00:09,  1.10it/s]

vrije_volk_ads_1974.tsv


 79%|███████▊  | 37/47 [00:41<00:11,  1.15s/it]

vrije_volk_ads_1960.tsv


 81%|████████  | 38/47 [00:42<00:10,  1.13s/it]

vrije_volk_ads_1961.tsv


 83%|████████▎ | 39/47 [00:43<00:08,  1.09s/it]

vrije_volk_ads_1975.tsv


 85%|████████▌ | 40/47 [00:44<00:08,  1.23s/it]

vrije_volk_ads_1949.tsv


 87%|████████▋ | 41/47 [00:45<00:05,  1.04it/s]

vrije_volk_ads_1963.tsv


 89%|████████▉ | 42/47 [00:46<00:04,  1.03it/s]

vrije_volk_ads_1977.tsv


 91%|█████████▏| 43/47 [00:47<00:04,  1.20s/it]

vrije_volk_ads_1988.tsv


 94%|█████████▎| 44/47 [00:49<00:03,  1.28s/it]

vrije_volk_ads_1989.tsv


 96%|█████████▌| 45/47 [00:51<00:02,  1.40s/it]

vrije_volk_ads_1976.tsv


 98%|█████████▊| 46/47 [00:52<00:01,  1.44s/it]

vrije_volk_ads_1962.tsv


100%|██████████| 47/47 [00:53<00:00,  1.14s/it]
  3%|▎         | 2/77 [00:00<00:04, 15.80it/s]

../../../../Dropbox/datasets/newspapers_clean/steenwijker_courant/ads
steenwijker_courant_1877.tsv
steenwijker_courant_1888.tsv
steenwijker_courant_1917.tsv


  6%|▋         | 5/77 [00:00<00:05, 12.85it/s]

steenwijker_courant_1903.tsv
steenwijker_courant_1902.tsv
steenwijker_courant_1916.tsv


 10%|█         | 8/77 [00:00<00:05, 13.10it/s]

steenwijker_courant_1889.tsv
steenwijker_courant_1876.tsv
steenwijker_courant_1874.tsv
steenwijker_courant_1928.tsv


 16%|█▌        | 12/77 [00:00<00:04, 13.99it/s]

steenwijker_courant_1900.tsv
steenwijker_courant_1914.tsv
steenwijker_courant_1915.tsv
steenwijker_courant_1901.tsv


 21%|██        | 16/77 [00:01<00:04, 14.29it/s]

steenwijker_courant_1929.tsv
steenwijker_courant_1875.tsv
steenwijker_courant_1871.tsv
steenwijker_courant_1905.tsv


 23%|██▎       | 18/77 [00:01<00:03, 14.77it/s]

steenwijker_courant_1911.tsv
steenwijker_courant_1939.tsv


 26%|██▌       | 20/77 [00:01<00:04, 12.60it/s]

steenwijker_courant_1938.tsv
steenwijker_courant_1910.tsv


 31%|███       | 24/77 [00:01<00:04, 12.79it/s]

steenwijker_courant_1904.tsv
steenwijker_courant_1870.tsv
steenwijker_courant_1872.tsv
steenwijker_courant_1899.tsv


 34%|███▍      | 26/77 [00:01<00:03, 14.23it/s]

steenwijker_courant_1912.tsv
steenwijker_courant_1906.tsv


 39%|███▉      | 30/77 [00:02<00:03, 11.81it/s]

steenwijker_courant_1907.tsv
steenwijker_courant_1913.tsv
steenwijker_courant_1898.tsv


 42%|████▏     | 32/77 [00:02<00:03, 13.29it/s]

steenwijker_courant_1873.tsv
steenwijker_courant_1941.tsv
steenwijker_courant_1940.tsv


 49%|████▉     | 38/77 [00:02<00:02, 15.04it/s]

steenwijker_courant_1942.tsv
steenwijker_courant_1943.tsv
steenwijker_courant_1944.tsv
steenwijker_courant_1945.tsv
steenwijker_courant_1881.tsv
steenwijker_courant_1895.tsv


 55%|█████▍    | 42/77 [00:03<00:02, 13.40it/s]

steenwijker_courant_1936.tsv
steenwijker_courant_1922.tsv
steenwijker_courant_1923.tsv


 57%|█████▋    | 44/77 [00:03<00:02, 13.74it/s]

steenwijker_courant_1937.tsv
steenwijker_courant_1894.tsv


 60%|█████▉    | 46/77 [00:03<00:02, 11.07it/s]

steenwijker_courant_1880.tsv
steenwijker_courant_1869.tsv
steenwijker_courant_1896.tsv


 65%|██████▍   | 50/77 [00:03<00:02, 11.40it/s]

steenwijker_courant_1882.tsv
steenwijker_courant_1909.tsv
steenwijker_courant_1921.tsv


 68%|██████▊   | 52/77 [00:03<00:02, 11.00it/s]

steenwijker_courant_1935.tsv
steenwijker_courant_1934.tsv
steenwijker_courant_1920.tsv


 70%|███████   | 54/77 [00:04<00:02, 10.67it/s]

steenwijker_courant_1908.tsv
steenwijker_courant_1883.tsv


 75%|███████▌  | 58/77 [00:04<00:01, 10.03it/s]

steenwijker_courant_1897.tsv
steenwijker_courant_1878.tsv
steenwijker_courant_1893.tsv


 78%|███████▊  | 60/77 [00:04<00:01, 10.52it/s]

steenwijker_courant_1887.tsv
steenwijker_courant_1924.tsv
steenwijker_courant_1930.tsv


 81%|████████  | 62/77 [00:04<00:01, 11.18it/s]

steenwijker_courant_1918.tsv
steenwijker_courant_1919.tsv


 86%|████████▌ | 66/77 [00:05<00:01, 10.70it/s]

steenwijker_courant_1931.tsv
steenwijker_courant_1925.tsv
steenwijker_courant_1886.tsv
steenwijker_courant_1892.tsv


 91%|█████████ | 70/77 [00:05<00:00, 11.92it/s]

steenwijker_courant_1879.tsv
steenwijker_courant_1884.tsv
steenwijker_courant_1890.tsv


 94%|█████████▎| 72/77 [00:05<00:00, 12.16it/s]

steenwijker_courant_1933.tsv
steenwijker_courant_1927.tsv


 96%|█████████▌| 74/77 [00:05<00:00, 12.22it/s]

steenwijker_courant_1926.tsv
steenwijker_courant_1932.tsv
steenwijker_courant_1891.tsv


100%|██████████| 77/77 [00:06<00:00, 12.43it/s]
  0%|          | 0/51 [00:00<?, ?it/s]

steenwijker_courant_1885.tsv
../../../../Dropbox/datasets/newspapers_clean/parool/ads
parool_ads_1957.tsv


  2%|▏         | 1/51 [00:01<00:54,  1.08s/it]

parool_ads_1980.tsv


  4%|▍         | 2/51 [00:02<00:53,  1.10s/it]

parool_ads_1994.tsv


  6%|▌         | 3/51 [00:03<00:52,  1.08s/it]

parool_ads_1995.tsv


  8%|▊         | 4/51 [00:04<00:50,  1.08s/it]

parool_ads_1981.tsv


 10%|▉         | 5/51 [00:05<00:47,  1.04s/it]

parool_ads_1956.tsv


 12%|█▏        | 6/51 [00:06<00:46,  1.03s/it]

parool_ads_1968.tsv


 14%|█▎        | 7/51 [00:07<00:52,  1.20s/it]

parool_ads_1954.tsv


 16%|█▌        | 8/51 [00:08<00:46,  1.09s/it]

parool_ads_1983.tsv


 18%|█▊        | 9/51 [00:09<00:41,  1.01it/s]

parool_ads_1982.tsv


 20%|█▉        | 10/51 [00:10<00:38,  1.07it/s]

parool_ads_1955.tsv


 22%|██▏       | 11/51 [00:11<00:36,  1.08it/s]

parool_ads_1969.tsv


 24%|██▎       | 12/51 [00:12<00:43,  1.11s/it]

parool_ads_1951.tsv


 25%|██▌       | 13/51 [00:13<00:36,  1.03it/s]

parool_ads_1945.tsv
parool_ads_1979.tsv


 29%|██▉       | 15/51 [00:14<00:30,  1.17it/s]

parool_ads_1992.tsv


 31%|███▏      | 16/51 [00:15<00:29,  1.17it/s]

parool_ads_1986.tsv


 33%|███▎      | 17/51 [00:16<00:31,  1.07it/s]

parool_ads_1987.tsv


 35%|███▌      | 18/51 [00:17<00:31,  1.05it/s]

parool_ads_1993.tsv


 37%|███▋      | 19/51 [00:18<00:31,  1.03it/s]

parool_ads_1978.tsv


 39%|███▉      | 20/51 [00:19<00:32,  1.05s/it]

parool_ads_1950.tsv


 41%|████      | 21/51 [00:20<00:27,  1.08it/s]

parool_ads_1946.tsv


 43%|████▎     | 22/51 [00:20<00:20,  1.40it/s]

parool_ads_1952.tsv


 45%|████▌     | 23/51 [00:21<00:19,  1.43it/s]

parool_ads_1985.tsv


 47%|████▋     | 24/51 [00:22<00:21,  1.23it/s]

parool_ads_1991.tsv


 49%|████▉     | 25/51 [00:23<00:23,  1.13it/s]

parool_ads_1990.tsv


 51%|█████     | 26/51 [00:24<00:25,  1.02s/it]

parool_ads_1984.tsv


 53%|█████▎    | 27/51 [00:25<00:24,  1.02s/it]

parool_ads_1953.tsv


 55%|█████▍    | 28/51 [00:26<00:21,  1.09it/s]

parool_ads_1947.tsv


 57%|█████▋    | 29/51 [00:26<00:17,  1.28it/s]

parool_ads_1962.tsv


 59%|█████▉    | 30/51 [00:28<00:18,  1.12it/s]

parool_ads_1976.tsv


 61%|██████    | 31/51 [00:29<00:18,  1.09it/s]

parool_ads_1989.tsv


 63%|██████▎   | 32/51 [00:30<00:18,  1.01it/s]

parool_ads_1988.tsv


 65%|██████▍   | 33/51 [00:31<00:18,  1.03s/it]

parool_ads_1977.tsv


 67%|██████▋   | 34/51 [00:32<00:18,  1.07s/it]

parool_ads_1963.tsv


 69%|██████▊   | 35/51 [00:33<00:17,  1.11s/it]

parool_ads_1949.tsv


 71%|███████   | 36/51 [00:34<00:14,  1.07it/s]

parool_ads_1975.tsv


 73%|███████▎  | 37/51 [00:35<00:13,  1.00it/s]

parool_ads_1961.tsv


 75%|███████▍  | 38/51 [00:36<00:13,  1.05s/it]

parool_ads_1960.tsv


 76%|███████▋  | 39/51 [00:37<00:13,  1.09s/it]

parool_ads_1974.tsv


 78%|███████▊  | 40/51 [00:38<00:12,  1.10s/it]

parool_ads_1948.tsv


 80%|████████  | 41/51 [00:39<00:09,  1.10it/s]

parool_ads_1970.tsv


 82%|████████▏ | 42/51 [00:40<00:10,  1.14s/it]

parool_ads_1964.tsv


 84%|████████▍ | 43/51 [00:42<00:09,  1.24s/it]

parool_ads_1958.tsv


 86%|████████▋ | 44/51 [00:43<00:08,  1.16s/it]

parool_ads_1959.tsv


 88%|████████▊ | 45/51 [00:44<00:06,  1.13s/it]

parool_ads_1965.tsv


 90%|█████████ | 46/51 [00:45<00:05,  1.16s/it]

parool_ads_1971.tsv


 92%|█████████▏| 47/51 [00:47<00:05,  1.28s/it]

parool_ads_1967.tsv


 94%|█████████▍| 48/51 [00:48<00:04,  1.39s/it]

parool_ads_1973.tsv


 96%|█████████▌| 49/51 [00:50<00:02,  1.35s/it]

parool_ads_1972.tsv


 98%|█████████▊| 50/51 [00:51<00:01,  1.39s/it]

parool_ads_1966.tsv


100%|██████████| 51/51 [00:53<00:00,  1.04s/it]


In [37]:
# we remove ads with height or width smaller than 100 px
meta_data['newspaper_name'] = meta_data['newspaper_name'].str.lower()
meta_data = meta_data[(meta_data['h'] >= 100) & (meta_data['w'] >= 100)] 
meta_data.to_csv(os.path.join(output_path,'full_metadata.csv'), index=False)

In [38]:
meta_data.columns

Index(['date', 'page', 'n_pages', 'w', 'h', 'identifier', 'string_length',
       'size', 'character_proportion', 'newspaper_name'],
      dtype='object')