In [11]:
from tqdm import tqdm
import unidecode
import re

# Compiled pattern with one capture group matching a single punctuation
# character from the class below; used with a r' \1 ' replacement to pad each
# punctuation mark with spaces. The triple-quoted raw string lets both quote
# characters appear in the class without string concatenation.
match_puncs_re = re.compile(r"""([.,!?()\-;\[\]+\\\/@:<>#_{}&%'*="|])""")

def jaccard(str1, str2):
    """Return the Jaccard similarity between the token sets of two strings.

    Each string is lowercased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting token sets.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in [0.0, 1.0]. When neither string contains a token (both are
        empty or whitespace-only) the union is empty and 0.0 is returned
        instead of raising ZeroDivisionError.

    Raises:
        Whatever ``.lower()`` / ``.split()`` raise for non-string inputs
        (e.g. AttributeError for None); the offending inputs are printed
        first and the original exception is re-raised unchanged.
    """
    try:
        a = set(str1.lower().split())
        b = set(str2.lower().split())
    except Exception:
        print(f'Error in jaccard: {str1}, {str2}')
        # Bare raise keeps the original traceback intact.
        raise

    union_size = len(a | b)
    if union_size == 0:
        # No tokens on either side: define the similarity as 0.
        return 0.0
    return len(a & b) / union_size
        
def text_cleaning_for_bert(text):
    """Normalize a raw text line while keeping punctuation as separate tokens.

    Steps, in order:
      1. Replace '^' with a space and pass the text through
         ``unidecode.unidecode``.
      2. Replace bracketed numeric references such as ``[12]`` with the
         placeholder ``SpecialReference``.
      3. Replace four-digit sequences starting with 19 or 20 with the
         placeholder ``SpecialYear``.
      4. Replace every remaining run of digits with a space.
      5. Replace any space-separated token containing 'http' or 'www' with
         the placeholder ``SpecialWebsite``.
      6. Pad each punctuation character matched by ``match_puncs_re`` with
         spaces.
      7. Collapse whitespace runs to a single space and strip both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Keeps puncs, pads them with whitespaces
    text = text.replace('^', ' ')
    text = unidecode.unidecode(text)

    # Must run before the generic digit removal below, otherwise the number
    # inside the brackets would already be gone.
    text = re.sub(r'\[[0-9]+]', ' SpecialReference ', text)

    # Replace years with a placeholder. The pattern has no word boundaries,
    # so it also matches inside longer digit runs.
    text = re.sub(r'(19|20)[0-9][0-9]', ' SpecialYear ', text)

    # Drop all other digits
    text = re.sub(r'\d+', ' ', text)

    # Replace websites with a placeholder. Done before punctuation padding so
    # a URL is still one space-separated token when it is tested.
    text = ' '.join(['SpecialWebsite' if 'http' in t or 'www' in t else t for t in text.split(' ') ])

    text = match_puncs_re.sub(r' \1 ', text)

    # Collapse extra whitespace. Raw string: "\s" in a normal string literal
    # is an invalid escape sequence and triggers a warning on recent Pythons.
    text = re.sub(r"\s+", " ", text)

    return text.strip()

# Read the dataset names file and run every line through
# text_cleaning_for_bert; `names` holds one cleaned string per input line.
with open('data/dataset_names.txt', encoding='utf-8') as f:
    names = [text_cleaning_for_bert(raw_line) for raw_line in f]

In [12]:
# Spot-check one cleaned entry to eyeball the effect of text_cleaning_for_bert.
names[3000]

'AIRS / Aqua L Monthly Standard Physical Retrieval ( AIRS + AMSU ) degree x degree V ( AIRX STM ) at GES DISC'

In [32]:
# Get names that are:
# Longer than 3 tokens
# Shorter than 25 tokens (Some are too long)
# Not fully uppercase

def use_name(n):
    """Decide whether a name is kept for de-duplication.

    A name qualifies when splitting it on single spaces yields more than 3
    and fewer than 25 pieces, and ``n.isupper()`` is False.

    Args:
        n: The name string to test.

    Returns:
        True if the name qualifies, False otherwise.
    """
    token_count = len(n.split(' '))
    if token_count <= 3 or token_count >= 25:
        return False
    return not n.isupper()

In [33]:
# Greedy near-duplicate filter: walk the names in order and keep one only if
# its Jaccard similarity to every previously kept name is below 0.3.
unique_names = []

# Token sets of the kept names, parallel to `unique_names`. Caching them means
# each name is lowercased and split once, instead of once per comparison as
# happened when jaccard(name, unique_name) was called in the inner loop.
unique_token_sets = []

pbar = tqdm(names)
for name in pbar:
    if not use_name(name):
        continue

    # Same tokenization as jaccard(): lowercase, split on whitespace.
    name_tokens = set(name.lower().split())

    has_similar = False
    for kept_tokens in unique_token_sets:
        shared = len(name_tokens & kept_tokens)
        # Same formula as jaccard(): |A & B| / (|A| + |B| - |A & B|).
        if shared / (len(name_tokens) + len(kept_tokens) - shared) >= 0.3:
            has_similar = True
            break

    if not has_similar:
        unique_names.append(name)
        unique_token_sets.append(name_tokens)

    pbar.set_description(f'unique_names: {len(unique_names)}')

unique_names: 20553: 100%|███████████████████████████████████████████████████| 275227/275227 [1:40:45<00:00, 45.52it/s]


In [31]:
# Preview entries 200-249 of the de-duplicated names as a sanity check.
unique_names[200:250]

['MODIS / Aqua Sea Ice Extent Daily L Global km EASE - Grid Day V',
 'Nimbus - Total Solar Irradiance Data in Native Format',
 'Solid State Energy Conversion for Deep Space Power',
 'Regolith Derived Heat Shield for a Planetary Body Entry and Descent System with In - Situ Fabrication',
 'Lightweight Passive Vaporizing Heat Sink , Phase II',
 'Development and Flight - testing of Astronomical Instrumentation for Future NASA Astrophysics Missions',
 'Autonomous Task Primitives for Complex Manipulation Operations',
 'DISCOVER - AQ P - B Aircraft Navigational and Meteorological Data',
 'Dead - Ended Passive Electrolyzer with Elimination of Vapor / Liquid Separation for Life Support Oxygen , Phase II',
 'Knowledge - Based System to Support Plug Load Management',
 'Assurance for Complex Systems',
 'BOREAS RSS - Level - b ASAS Image Data : At - sensor Radiance in BSQ Format',
 'Automated Discovery of Flight Track Anomalies',
 'Very High Gain and Low Noise Near Infrared Single Photon Counting D

In [35]:
import pickle

# Serialize the de-duplicated name list to disk with pickle.
with open('data/gov_data_selected.pkl', mode='wb') as f:
    pickle.dump(unique_names, f)