In [1]:
import pandas as pd
from collections import Counter

In [2]:
# Load data
df = pd.read_csv('ncbi_metadata.measurements_inferred_units_totals.csv')

In [3]:
# Function to filter digits and count tokens per row
def count_tokens(tokens):
    return Counter(token for token in tokens if not token.isdigit())

In [4]:
df['tokens'] = df['unit_name'].fillna('').astype(str).str.split()

In [5]:
# Apply per row
df['token_counts'] = df['tokens'].apply(count_tokens)

In [6]:
# Convert list of Counter dicts to DataFrame
df_bow = pd.DataFrame(df['token_counts'].tolist()).fillna(0).astype(int)

In [7]:
# Combine with original data if you want
df_combined = pd.concat([df.drop(columns=['tokens', 'token_counts']), df_bow], axis=1)


In [11]:
# Drop _id column
df_combined = df_combined.drop(columns=['_id'])

In [14]:
# Reorder columns: unit_name, total_count, then the rest sorted case-insensitively
first_columns = ['unit_name', 'total_count']
other_columns = sorted(
    [col for col in df_combined.columns if col not in first_columns],
    key=lambda s: s.lower()
)
df_case_insensitive_sorted = df_combined[first_columns + other_columns]


In [15]:
df_case_insensitive_sorted

Unnamed: 0,unit_name,total_count,5.30002e+06,acre,african,ampere,ampere-hour,ampere-turn,angle,arc,...,unk,volt,volt-ampere,watt,week,yard,year,yoctobarn,yoctometre,ångström
0,dimensionless,7738996,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,metre,475737,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,week,433473,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,centimetre,409285,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,day,274854,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,unk to the 58 second siemens,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2055,watt tesla,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2056,foot litre to the 5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2057,watt to the 4,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [16]:
df_case_insensitive_sorted.to_csv('ncbi_units_token_matrix.tsv', index=False, sep="\t")

In [19]:
unit_tokens = list(df_case_insensitive_sorted.columns)

In [20]:
with open("ncbi_units_tokens.txt", "w") as f:
    for line in unit_tokens:
        f.write(line + "\n")