# Sample

Pull a sample of hyperlinks from the [News Homepages](https://homepages.news) archive for training.

In [16]:
import pathlib
import random
import pandas as pd

Get all of the hyperlinks JSON files in the input directory

In [4]:
input_path = pathlib.Path("") / "input"

In [7]:
file_list = list(input_path.glob("**/*.json"))

Get a sample of records from every file.

In [53]:
def sample_urls(path, n=10):
    """Sample a random number of URLs from the provided file path."""
    # Read the file
    df = pd.read_json(path)
    # Clean up text
    df['text'] = df.text.str.strip()
    # Drop links without text
    df = df[~(df.text == "")].copy()
    # Tack on handle
    df['handle'] = path.stem.split(".")[0]
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    # Get unique URL
    return df.sample(n=n)

In [54]:
df_list = [sample_urls(f) for f in file_list]

Merge them all together.

In [55]:
sample_df = pd.concat(df_list)

In [56]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1340 entries, 602 to 100
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1340 non-null   object
 1   url     1340 non-null   object
 2   handle  1340 non-null   object
dtypes: object(3)
memory usage: 41.9+ KB


In [57]:
sample_df.handle.value_counts()

harpers           10
berkeleyside      10
drudge            10
flintbeat         10
codaily           10
                  ..
bleacherreport    10
buckscoherald     10
discovermag       10
coindesk          10
gothamist         10
Name: handle, Length: 134, dtype: int64

In [58]:
sample_df.head()

Unnamed: 0,text,url,handle
602,March 2012 Issue,https://harpers.org/archive/2012/03/,harpers
561,[Story],story,harpers
16,Sections,/sections/,harpers
41,Subscribe Now,https://w1.buysub.com/servlet/OrdersGateway?cd...,harpers
36,The Fight to Choose,https://harpers.org/archive/2022/08/the-fight-...,harpers


Add a column for human reviewers to code each record.

In [59]:
sample_df['is_story'] = pd.NA

In [60]:
sample_df.to_csv(input_path / "sample.csv", index=False)