# Sample

Pull a sample of hyperlinks from the [News Homepages](https://homepages.news) archive for training.

In [1]:
import pathlib
import random
import pandas as pd

Get all of the hyperlinks JSON files in the input directory

In [2]:
input_path = pathlib.Path("") / "input"

In [3]:
file_list = list((input_path / "20220731-sample").glob("**/*.json"))

Get a sample of records from every file.

In [9]:
def sample_urls(path, n=10):
    """Sample a random number of URLs from the provided file path."""
    # Read the file
    df = pd.read_json(path)
    # Cut empties
    if len(df) < n:
        return df
    # Clean up text
    df['text'] = df.text.str.strip()
    # Drop links without text
    df = df[~(df.text == "")].copy()
    # Tack on handle
    df['handle'] = path.stem.split(".")[0]
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    # Get unique URL
    return df.sample(n=n)

In [10]:
df_list = [sample_urls(f) for f in file_list]

Merge them all together.

In [11]:
sample_df = pd.concat(df_list)

In [12]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1294 entries, 0 to 111
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1294 non-null   object
 1   url     1294 non-null   object
 2   handle  1290 non-null   object
dtypes: object(3)
memory usage: 40.4+ KB


In [13]:
sample_df.handle.value_counts()

powerlineus        10
thedispatch        10
wcfcourier         10
techreview         10
occrp              10
                   ..
rollcall           10
thedailybeast      10
voiceofsandiego    10
reviewjournal      10
sacbee_news        10
Name: handle, Length: 129, dtype: int64

In [14]:
sample_df.head()

Unnamed: 0,text,url,handle
0,ABOUT US,/about-us,
1,MEET THE TEAM,/meet-the-team,
2,CAREERS,https://boards.greenhouse.io/semafor,
3,PRIVACY,/privacy.html,
176,Privacy Policy,https://www.powerlineblog.com/privacy-policy,powerlineus


In [15]:
sample_df.to_csv(input_path / "sample.csv", index=False)