# Sample

Pull a sample of hyperlinks from the [News Homepages](https://homepages.news) archive for training.

In [23]:
import pathlib
import random
import pandas as pd

Get all of the hyperlinks JSON files in the input directory

In [24]:
input_path = pathlib.Path("") / "input"

In [25]:
file_list = list((input_path / "20220731-sample").glob("**/*.json"))

Get a sample of records from every file.

In [26]:
def sample_urls(path, n=10):
    """Sample a random number of URLs from the provided file path."""
    # Read the file
    df = pd.read_json(path)
    # Clean up text
    df['text'] = df.text.str.strip()
    # Drop links without text
    df = df[~(df.text == "")].copy()
    # Tack on handle
    df['handle'] = path.stem.split(".")[0]
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    # Get unique URL
    return df.sample(n=n)

In [27]:
df_list = [sample_urls(f) for f in file_list]

Merge them all together.

In [28]:
sample_df = pd.concat(df_list)

In [29]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1330 entries, 35 to 65
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1330 non-null   object
 1   url     1330 non-null   object
 2   handle  1330 non-null   object
dtypes: object(3)
memory usage: 41.6+ KB


In [30]:
sample_df.handle.value_counts()

kut              10
oaklandside      10
redstate         10
jstor_daily      10
pioneerpress     10
                 ..
indiancountry    10
lasvegassun      10
quillette        10
ketv             10
irishcathnews    10
Name: handle, Length: 133, dtype: int64

In [31]:
sample_df.head()

Unnamed: 0,text,url,handle
35,ATXplained,https://kutkutx.studio/category/atxplained,kut
247,"As interest rates rise, the 'American dream' o...",https://www.kut.org/business/2022-07-30/as-int...,kut
21,Life & Arts,https://www.kut.org/life-arts,kut
58,Sustaining Members,https://www.kut.org/sustaining-membership,kut
51,Become a Member,https://www.kut.org/support-kut,kut


In [32]:
sample_df.to_csv(input_path / "sample.csv", index=False)