# BlogArticles

Download the following set of articles from here:
[HuggingFace Articles](https://huggingface.co/datasets/fabiochiu/medium-articles/blob/main/medium_articles.csv)

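Unzip `urls_preview_images_json_files.zip` into the working directory, so that the extracted `urls_preview_images*.json` files can be found by the first code cell below.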

In [None]:
import pandas as pd
import glob
import json

def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file does not contain
        valid JSON (the error is printed instead of raised, so one bad file
        does not abort a multi-file load).

    Raises:
        OSError: If the file cannot be opened (not caught here).
    """
    try:
        # Decode as UTF-8 explicitly rather than relying on the
        # platform-dependent default encoding of open().
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except json.JSONDecodeError as e:
        print(f"Error reading file {file_path}: {e}")
        return None

# Glob pattern selecting the unzipped preview-image JSON files
file_pattern = 'urls_preview_images*.json'
files = glob.glob(file_pattern)

# One DataFrame per JSON file, in the order glob returned the paths
dfs = [pd.DataFrame(read_json_file(path)) for path in files]

# Stack the per-file frames into a single frame with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# You now have a single DataFrame `combined_df` containing all the data


In [None]:
import pandas as pd

# Location of the articles CSV, relative to the working directory
file_path = 'medium_articles.csv'

# Parse the whole CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Function to convert string representation of list to an actual list
def convert_string_to_list(string):
    """Parse the textual form of a Python literal (e.g. ``"['a', 'b']"``).

    ``ast.literal_eval`` is used instead of ``eval`` so that cell content
    read from the CSV can only yield literal values (lists, strings,
    numbers, ...) and can never execute arbitrary code.

    Args:
        string: Text containing a Python literal.

    Returns:
        The value the literal denotes.

    Raises:
        ValueError, SyntaxError: If ``string`` is not a valid Python literal.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    import ast

    return ast.literal_eval(string)

# Turn the stringified lists in 'authors' and 'tags' into real Python lists
for column in ('authors', 'tags'):
    df[column] = df[column].apply(convert_string_to_list)

# Preview the first rows to confirm the conversion
df.head()


In [None]:
import pandas as pd
import hashlib
from urllib.parse import urlparse
import re

# Assuming df is your DataFrame

# Hex MD5 digest of a URL string
def create_md5_hash(url):
    """Return the 32-character hexadecimal MD5 digest of ``url``.

    The string is converted to bytes with ``str.encode()`` (default
    encoding) before hashing.
    """
    hasher = hashlib.md5()
    hasher.update(url.encode())
    return hasher.hexdigest()


def create_id(url, title, authors):
    """Build a record id from the MD5 digest of ``url``.

    ``title`` and ``authors`` are accepted but not used in the id.

    Returns:
        The digest string with every run of CR, LF, tab or space characters
        replaced by a single ``+``, truncated to at most 255 characters.
    """
    digest = str(create_md5_hash(url))

    # Collapse whitespace runs (carriage return, newline, tab, space) into '+'
    sanitized = re.sub(r'[\r\n\t ]+', '+', digest)

    # Cap the id length at 255 characters
    return sanitized[0:255]





# Update the 'id' and 'text' columns
# Build the ids with a zip over the three source columns instead of a
# row-wise DataFrame.apply: same values, without creating a Series per row.
df['id'] = [
    create_id(url, title, authors)
    for url, title, authors in zip(df['url'], df['title'], df['authors'])
]

# Cap the text at 20000 characters and replace the 'endoftext' marker.
# Null values are passed through untouched (same guard as the
# 'noop_description' column below); slicing them directly would raise.
df['text'] = df['text'].apply(
    lambda x: str(x)[:20000].replace('endoftext', 'REDACTED') if pd.notnull(x) else x
)




# Display-only "noop_" columns.
# Title and description are string-cast and truncated (64 / 128 characters);
# null values are kept as they are.
df['noop_title'] = df['title'].apply(
    lambda value: value if pd.isnull(value) else str(value)[:64]
)
df['noop_description'] = df['text'].apply(
    lambda value: value if pd.isnull(value) else str(value)[:128]
)

# The URL is copied unchanged
df['noop_url'] = df['url']

# Authors become one comma-separated string; anything that is not a
# non-empty list yields an empty string
df['noop_publishing_authors'] = df['authors'].apply(
    lambda names: ', '.join(names) if isinstance(names, list) and names else ''
)

# The timestamp is copied unchanged
df['noop_timestamp'] = df['timestamp']

# Build a one-row-per-URL lookup of preview images before merging.
# combined_df is concatenated from several JSON files, so the same URL can
# appear more than once; merging against it directly would duplicate article
# rows (and therefore ids) in the output. Rows without an image are dropped
# first so a usable image wins over an empty duplicate.
preview_lookup = (
    combined_df[['url', 'preview_image']]
    .dropna(subset=['preview_image'])
    .drop_duplicates(subset='url')
)

# Left merge keeps every article, with NaN where no preview image is known
merged_df = pd.merge(df, preview_lookup, on='url', how='left')

# Default image URL
default_image_url = 'https://miro.medium.com/v2/resize:fit:1200/0*mFed_WBqkegFJXgx'

# Update noop_image_url with preview_image, use default where preview_image is missing
merged_df['noop_image_url'] = merged_df['preview_image'].fillna(default_image_url)

# Drop the helper preview_image column and rename the metadata columns
merged_df = merged_df.drop(columns='preview_image').rename(
    columns={'tags': 'meta_tag', 'authors': 'meta_authors'}
)

# Select the output columns in their final order
output_df = merged_df[['id', 'text', 'meta_tag', 'meta_authors', 'noop_title', 'noop_description', 'noop_url', 'noop_image_url', 'noop_publishing_authors', 'noop_timestamp']]

# Write output_df to numbered parquet files of at most chunk_size rows each
chunk_size = 10000
total_rows = len(output_df)
for i in range(0, total_rows, chunk_size):
    # File number is the zero-based chunk index
    chunk_file_path = f'vantage_blog_chunk_{i//chunk_size}.parquet'
    output_df.iloc[i:i+chunk_size].to_parquet(chunk_file_path, index=False)
    print(f"Chunk {i//chunk_size} written to {chunk_file_path}")
