# Data extraction

Import packages

In [1]:
# PACKAGES
import time
import csv

# standard processing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# hugging face packages
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


Read in the keywords used to filter the dataset

In [2]:
# Load the short keyword list. csv.reader yields one list per CSV row, so
# `words_short` is a list of rows; the filtering cell uses words_short[0].
# newline='' is what the csv module documents for files handed to csv.reader,
# and the explicit encoding avoids depending on the platform default.
# NOTE(review): assumes the CSV is saved as UTF-8 - confirm.
with open('Keywords List - Short Version.csv', 'r', newline='', encoding='utf-8') as f:
    words_short = list(csv.reader(f))

# TODO - turn this into the long version later
words_short

[['Anthropocene',
  'Carbon footprint',
  'Carbon-neutral',
  'Renewable Energy',
  'Greenwashing',
  'Zero-carbon',
  'Climate crisis',
  'Corporate Social Responsibility',
  'Paris Agreement',
  'Paris Climate Agreement',
  'Sea Level Rising',
  'Ocean Level Rising',
  'Carbon Capture and Storage',
  'Intergovernmental Panel on Climate Change (IPCC)',
  'Environmental Justice',
  'Anthropogenic emissions',
  'Climate Threshold',
  'Extreme Weather Event',
  'Carbon Sequestration',
  'Enhanced Greenhouse Effect',
  'Global Warming Potential',
  'Ecological Change',
  'Sea-Surface Temperature',
  'Climate Policy',
  'Fossil Fuels',
  'Nature Conservation']]

In [3]:
# Load the long keyword list. csv.reader yields one list per CSV row, so
# `words_long` is a list of rows; the filtering cell uses words_long[0].
# newline='' is what the csv module documents for files handed to csv.reader.
# The list contains non-ASCII entries (e.g. 'köppen climate classification'),
# so the encoding is stated explicitly instead of relying on the platform default.
# NOTE(review): assumes the CSV is saved as UTF-8 - confirm.
with open('Keywords List - Long Version.csv', 'r', newline='', encoding='utf-8') as f:
    words_long = list(csv.reader(f))

words_long

[['warming',
  'temperature',
  'environment',
  'global warming',
  'climate change',
  'precipitation',
  'humidity',
  'environmental',
  'greenhouse gas',
  'biodiversity',
  'atmospheric',
  'meteorology',
  'rainfall',
  'biosphere',
  'landscape',
  'oceans',
  'economic',
  'pollution',
  'köppen climate classification',
  'energy',
  'wind',
  'arctic',
  'climatic',
  'tropical',
  'ecological',
  'climate',
  'atmospheric pressure',
  'temperate',
  'anthropogenic',
  'carbon emissions',
  'biome',
  'carbon emission',
  'hydrosphere',
  'paleoclimatology',
  'carbon sequestration',
  'ecosystem',
  'enviroment',
  'extinction',
  'meteorological',
  'ocean current',
  'climatologists',
  'reducing carbon emissions',
  'permafrost',
  'atmospheric carbon dioxide',
  'world meteorological organization',
  'subarctic',
  'sustainability',
  'marine ecosystems',
  'warmer temperatures',
  'greenhouse emissions',
  'carbon dioxide emissions',
  'climatologist',
  'climatic condi

Pull the data from Hugging Face

In [4]:
def pull_data(dataset_name = "cerebras/SlimPajama-627B", n = 5499,
              file_template = "test/chunk1/example_holdout_{i}.jsonl.zst"):
    """
    Load the first `n` holdout files of a Hugging Face dataset, one split per file.

    Args:
        dataset_name (string): name of the dataset to pull from
        n (int): number of files to pull, i.e. file indices 0..n-1 (defaults to 5499 which is the maximum number of files)
        file_template (string): path of a single file inside the dataset repository;
            the ``{i}`` placeholder is replaced by the file index

    Returns:
        dataset (datasets.dataset_dict.DatasetDict): dictionary of datasets subsplits from huggingface,
            keyed by the file index as a string ("0", "1", ...)

    Raises:
        ValueError: if `n` is smaller than 1

    """
    # range(n) is empty for n < 1, which would hand load_dataset an empty
    # file mapping; fail early with a clear message instead.
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # One named split per file so each file can be addressed by its index later on.
    data_files = {str(i): file_template.format(i=i) for i in range(n)}
    dataset = load_dataset(dataset_name, data_files=data_files) # load test dataset

    return dataset

In [25]:
# Time the download so the cost of re-running this cell is visible.
start_pull = time.time()
datafiles_max = pull_data(n=5499)  # request all 5499 files
end_pull = time.time()

elapsed_pull = end_pull - start_pull
print(f"Files pulled in {elapsed_pull} seconds.")

Downloading data:   0%|          | 0.00/22.7k [00:00<?, ?B/s]'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: afef008d-cd6d-4a4b-88f2-6291425d95d8)')' thrown while requesting GET https://huggingface.co/datasets/cerebras/SlimPajama-627B/resolve/2d0accdd58c5d5511943ca1f5ff0e3eb5e293543/test/chunk1/example_holdout_3909.jsonl.zst
Retrying in 1s [Retry 1/5].
Downloading data: 100%|██████████| 22.7k/22.7k [00:20<00:00, 1.12kB/s]
Downloading data: 100%|██████████| 45.8k/45.8k [00:00<00:00, 64.9kB/s]
Downloading data: 100%|██████████| 28.2k/28.2k [00:00<00:00, 43.9kB/s]
Downloading data: 100%|██████████| 21.4k/21.4k [00:00<00:00, 31.5kB/s]
Downloading data: 100%|██████████| 23.1k/23.1k [00:00<00:00, 34.0kB/s]
Downloading data: 100%|██████████| 18.9k/18.9k [00:00<00:00, 28.4kB/s]
Downloading data: 100%|██████████| 17.6k/17.6k [00:00<00:00, 31.0kB/s]
Downloading data: 100%|██████████| 38.8k/38.8k [00:00<00:00, 53.2kB/s]
Do

Files pulled in 5856.359065771103 seconds.


In [26]:
# Save the pulled data to a JSON Lines file

# One pandas DataFrame per split of the DatasetDict
dfs = [datafiles_max[split_name].to_pandas() for split_name in datafiles_max]

# Stack all splits into a single frame with a fresh 0..N-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# One JSON record per line
combined_df.to_json("datafiles_max.json", lines=True, orient='records')

Filter the contents for related words - SHORT version

In [14]:
# function to keep only the rows of a dataset that mention a keyword
def filter_dict(dataset, words, case_sensitive=True):
    """
    Keep the rows whose 'text' field contains at least one keyword as a substring.

    Args:
        dataset (datasets.dataset_dict.DatasetDict, datasets.arrow_dataset.Dataset): dictionary of datasets subsplits from huggingface
        words (list): list of words to filter the dataset for; blank entries are ignored
        case_sensitive (bool): when True (default) keywords must match the text exactly as written;
            when False both keywords and text are case-folded before comparing

    Returns:
        whatever `dataset.filter` returns: the rows whose text matched a keyword

    Note:
        Matching is plain substring search, so a keyword also matches inside longer words.
        With the default case_sensitive=True a Title Case keyword does not match lowercase text.
    """
    # An empty string is a substring of every text, so a blank keyword (e.g. from
    # a trailing comma in the CSV) would let every row through. Drop those.
    needles = [word for word in words if word and word.strip()]
    if not case_sensitive:
        needles = [word.casefold() for word in needles]

    def contains_keyword(example):
        text = example['text']
        if not text:
            # None or empty text cannot contain a keyword
            return False
        if not case_sensitive:
            text = text.casefold()
        return any(needle in text for needle in needles)

    filtered_dataset = dataset.filter(contains_keyword) # filter for any of the words in the list

    return filtered_dataset

In [27]:
# Time the short-keyword filter; row 0 of the CSV holds the keywords.
start_filter = time.time()
filtered_these_short = filter_dict(dataset=datafiles_max, words=words_short[0])
end_filter = time.time()

elapsed_filter = end_filter - start_filter
print(f"Files filtered in {elapsed_filter}.")

Filter: 100%|██████████| 16/16 [00:00<00:00, 85.26 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 94.08 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 93.54 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 95.12 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 88.41 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 93.93 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 92.91 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 93.27 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 90.54 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 93.51 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 92.75 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 94.01 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 94.07 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 93.87 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 94.59 examples/s]
Filter: 100%|██████████| 16/16 [00:00<00:00, 94.67 exam

Files filtered in 6747.561989784241.


Filtering - LONG version

In [28]:
# Time the long-keyword filter; row 0 of the CSV holds the keywords.
start_filter = time.time()
filtered_these_long = filter_dict(dataset=datafiles_max, words=words_long[0])
end_filter = time.time()

elapsed_filter = end_filter - start_filter
print(f"Files filtered in {elapsed_filter}.")

Filter: 100%|██████████| 16/16 [00:00<00:00, 48.44 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 53.86 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 52.98 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 54.12 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 51.16 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 54.03 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 54.54 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 53.48 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 45.10 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 51.35 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 53.46 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 52.16 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 41.07 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 50.43 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00:00, 53.94 examples/s] 
Filter: 100%|██████████| 16/16 [00:00<00

Files filtered in 161508.25065684319.


Save to JSON - short

In [29]:
# Collect the short-keyword matches into one DataFrame and save them as JSON Lines.

# Iterate over the splits that are actually present rather than a hard-coded
# count, so the cell also works when fewer than 5499 files were pulled.
dfs_short = [filtered_these_short[split_name].to_pandas() for split_name in filtered_these_short]

# Concatenate all the dataframes
combined_df_short = pd.concat(dfs_short, ignore_index=True)

# Write the combined dataframe to a JSON file (one record per line)
combined_df_short.to_json("combined_max_short.json", orient='records', lines=True)

# Preview goes last: only the final expression of a cell is rendered
combined_df_short.head()

Save to JSON - long

In [None]:
# Collect the long-keyword matches into one DataFrame and save them as JSON Lines.

# Iterate over the splits that are actually present rather than a hard-coded
# count, so the cell also works when fewer than 5499 files were pulled.
dfs_long = [filtered_these_long[split_name].to_pandas() for split_name in filtered_these_long]

# Concatenate all the dataframes
combined_df_long = pd.concat(dfs_long, ignore_index=True)

# Write the combined dataframe to a JSON file (one record per line)
combined_df_long.to_json("combined_max_long.json", orient='records', lines=True)

# Preview goes last: only the final expression of a cell is rendered
combined_df_long.head()

Pulled the first 1000 files in 10 minutes.

Filtering took 28 seconds with the short list and 58 seconds with the long list.

In [5]:
# Reload the saved short-keyword matches from the JSON Lines file
df = pd.read_json("combined_max_short.json", lines=True, orient='records')

# Show (rows, columns) of the reloaded frame
df.shape

(213, 2)