# Medium Articles

Download the following set of articles from here:
[HuggingFace Medium Articles](https://huggingface.co/datasets/fabiochiu/medium-articles/blob/main/medium_articles.csv)

In [1]:
import pandas as pd

# Location of the Medium articles CSV, relative to the notebook's working directory
file_path = "medium_articles.csv"

# Read the whole CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Function to convert string representation of list to an actual list
def convert_string_to_list(string):
    """Parse the string representation of a Python list into a real list.

    Args:
        string: Text holding a Python literal, e.g. "['Health', 'Science']".

    Returns:
        The Python object described by the literal (a list for list literals).

    Raises:
        ValueError: If ``string`` is not a plain Python literal (for example
            it contains a function call or a name lookup).
        SyntaxError: If ``string`` is not syntactically valid Python.
    """
    # Function-scope import so the cell stays runnable on its own.
    import ast

    # SECURITY: this previously used eval(), which executes arbitrary code
    # embedded in the CSV. literal_eval only accepts literals (lists, strings,
    # numbers, ...) and yields the same result for well-formed list strings.
    return ast.literal_eval(string)

# Turn the stringified lists in 'authors' and 'tags' into real Python lists
df['authors'] = df['authors'].map(convert_string_to_list)
df['tags'] = df['tags'].map(convert_string_to_list)

# Preview the first rows to confirm the conversion
df.head()


Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,[Ryan Fan],2020-12-26 03:38:10.479000+00:00,"[Mental Health, Health, Psychology, Science, N..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,[Simon Spichak],2020-09-23 22:10:17.126000+00:00,"[Mental Health, Coronavirus, Science, Psycholo..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"[Biotechnology, Neuroscience, Brain, Wellness,..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,[Eshan Samaranayake],2020-12-21 16:05:19.524000+00:00,"[Health, Neuroscience, Mental Health, Psycholo..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,[Rishav Sinha],2020-02-26 00:01:01.576000+00:00,"[Brain, Health, Development, Psychology, Science]"


In [2]:
import pandas as pd
import hashlib
from urllib.parse import urlparse
import re

# Assuming df is your DataFrame

# Function to create the MD5 hash
def create_md5_hash(url):
    """Return the MD5 digest of ``url`` as a hexadecimal string.

    Args:
        url: The text to hash; it is encoded with ``str.encode()`` defaults.

    Returns:
        The hex digest produced by ``hashlib.md5``.
    """
    hasher = hashlib.md5()
    hasher.update(url.encode())
    return hasher.hexdigest()


def create_id(url, title, authors):
    """Build the record identifier for an article from its URL.

    Args:
        url: Article URL; its MD5 hex digest forms the identifier.
        title: Accepted for interface compatibility; not used.
        authors: Accepted for interface compatibility; not used.

    Returns:
        The identifier string, at most 255 characters long.
    """
    max_length = 255
    digest = str(create_md5_hash(url))

    # Collapse every run of CR / LF / tab / space into a single '+'
    sanitized = re.sub(r'[\r\n\t ]+', '+', digest)

    # Cap the identifier length
    return sanitized[:max_length]


# Update the 'id' and 'text' columns
# Build one id per row by zipping the needed columns; this yields the same
# values as a row-wise df.apply(axis=1) without constructing a Series per row,
# and it also works when the frame is empty.
df['id'] = [
    create_id(url, title, authors)
    for url, title, authors in zip(df['url'], df['title'], df['authors'])
]

# Cap the text at 20000 characters and mask the literal 'endoftext'.
# Missing texts are passed through untouched (same null handling as the
# noop_title / noop_description columns) instead of raising TypeError.
# NOTE(review): the original line carried a '## bug' marker; presumably the
# 'endoftext' replacement works around a downstream issue - confirm.
df['text'] = df['text'].apply(
    lambda x: str(x)[:20000].replace('endoftext', 'REDACTED') if pd.notnull(x) else x
)


# Derive the 'noop_*' columns from the existing ones.

# Title limited to 64 characters; missing titles are left as they are
df['noop_title'] = df['title'].map(
    lambda title: title if pd.isnull(title) else str(title)[:64]
)

# Description is the first 128 characters of the text column; missing stays missing
df['noop_description'] = df['text'].map(
    lambda body: body if pd.isnull(body) else str(body)[:128]
)

# Straight copies of existing columns
df['noop_url'] = df['url']
df['noop_timestamp'] = df['timestamp']

# The same image URL is used for every row
df['noop_image_url'] = 'https://miro.medium.com/v2/resize:fit:1200/0*mFed_WBqkegFJXgx'

# Author list flattened to a comma-separated string; an empty list joins to '',
# and anything that is not a list also becomes ''
df['noop_publishing_authors'] = df['authors'].map(
    lambda names: ', '.join(names) if isinstance(names, list) else ''
)

# Rename the list-valued columns in place to their 'meta_' names
df.rename(
    columns={'authors': 'meta_authors', 'tags': 'meta_tag'},
    inplace=True,
)

# Keep only the columns that are exported, in this order
output_df = df.loc[:, [
    'id',
    'text',
    'meta_tag',
    'meta_authors',
    'noop_title',
    'noop_description',
    'noop_url',
    'noop_image_url',
    'noop_publishing_authors',
    'noop_timestamp',
]]

# Write the export frame to numbered parquet files, 10000 rows per file
chunk_size = 10000
for chunk_number, start in enumerate(range(0, len(output_df), chunk_size)):
    chunk_file_path = f'vantage_medium_chunk_{chunk_number}.parquet'
    # Slice by position; the final chunk may hold fewer than chunk_size rows
    output_df.iloc[start:start + chunk_size].to_parquet(chunk_file_path, index=False)
    print(f"Chunk {chunk_number} written to {chunk_file_path}")


Chunk 0 written to vantage_medium_chunk_0.parquet
Chunk 1 written to vantage_medium_chunk_1.parquet
Chunk 2 written to vantage_medium_chunk_2.parquet
Chunk 3 written to vantage_medium_chunk_3.parquet
Chunk 4 written to vantage_medium_chunk_4.parquet
Chunk 5 written to vantage_medium_chunk_5.parquet
Chunk 6 written to vantage_medium_chunk_6.parquet
Chunk 7 written to vantage_medium_chunk_7.parquet
Chunk 8 written to vantage_medium_chunk_8.parquet
Chunk 9 written to vantage_medium_chunk_9.parquet
Chunk 10 written to vantage_medium_chunk_10.parquet
Chunk 11 written to vantage_medium_chunk_11.parquet
Chunk 12 written to vantage_medium_chunk_12.parquet
Chunk 13 written to vantage_medium_chunk_13.parquet
Chunk 14 written to vantage_medium_chunk_14.parquet
Chunk 15 written to vantage_medium_chunk_15.parquet
Chunk 16 written to vantage_medium_chunk_16.parquet
Chunk 17 written to vantage_medium_chunk_17.parquet
Chunk 18 written to vantage_medium_chunk_18.parquet
Chunk 19 written to vantage_medi