In [4]:
import pandas as pd
import os

In [2]:
def load_csv(file_path):
    """Read a CSV file into a DataFrame with missing values shown as 'N/A'.

    Args:
        file_path: Location of the CSV file, passed unchanged to
            ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed table in which every missing cell has
        been replaced by the string 'N/A'.
    """
    frame = pd.read_csv(file_path)
    return frame.fillna('N/A')

First, normalize the columns of vietnamworks_jobs_2024.csv, because its column order is rearranged compared to the documentation.

In [8]:
def process_vietnamworks_data(input_file, output_file):
    """Normalize the raw Vietnamworks CSV and save the result as a new CSV.

    When the input has a 'Source Platform' column (expected to hold each
    posting's URL in the raw export), its values are copied into a
    'Job Link' column. 'Source Platform' is then set to the constant
    'Vietnamworks' for every row, whether or not the column existed before.

    Args:
        input_file: Path of the raw Vietnamworks CSV file.
        output_file: Path of the CSV file to write. Its parent directory is
            created when it does not exist yet.

    Returns:
        None. The processed table is written to ``output_file`` and a
        confirmation message is printed.
    """
    # Load the CSV file
    df = pd.read_csv(input_file)

    # Keep the URL in its own 'Job Link' column before the platform
    # column is overwritten below
    if 'Source Platform' in df.columns:
        df['Job Link'] = df['Source Platform']

    # Update the 'Source Platform' column to 'Vietnamworks'
    df['Source Platform'] = 'Vietnamworks'

    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is given
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_file, index=False)
    print(f"Vietnamworks data processed successfully and saved to '{output_file}'.")

# Raw Vietnamworks file to read and the location of its processed copy
input_file = '../../data/raw/vietnamworks_jobs_2024.csv'
output_file = '../../data/processed/processed_vietnamworks_jobs_2024.csv'

# Run the Vietnamworks processing step on the raw file
process_vietnamworks_data(input_file=input_file, output_file=output_file)

Vietnamworks data processed successfully and saved to '../../data/processed/processed_vietnamworks_jobs_2024.csv'.


Merge the job lists into a single CSV file.

In [13]:
def standardize_columns(df):
    """Project a job DataFrame onto the shared column layout.

    The result has exactly the columns listed below, in that order. Columns
    of ``df`` that are not part of the layout are dropped, layout columns
    that ``df`` lacks are created, and every missing value becomes 'N/A'.

    Args:
        df: DataFrame of job postings from one source.

    Returns:
        pandas.DataFrame: A new frame in the standard layout.
    """
    target_layout = (
        'Job Title', 'Role', 'Level', 'Years of Experience',
        'Company', 'Location', 'Salary Range', 'Required Skills',
        'Source Platform', 'Job Link',
    )

    # Columns absent from df come back entirely empty after reindexing,
    # so the fill below also covers them
    aligned = df.reindex(columns=list(target_layout))
    return aligned.fillna('N/A')

def merge_csv_files(file_paths, output_path):
    """Merge the supported job-listing CSV files into one standardized CSV.

    A path is processed when its lowercased text contains 'vietnamworks' or
    'careerviet'; any other path is skipped. Each processed file is loaded
    with ``load_csv`` and aligned with ``standardize_columns``, the results
    are stacked in input order, and the combined table is written to
    ``output_path``.

    Args:
        file_paths: Iterable of CSV paths (``str`` or ``os.PathLike``).
        output_path: Path of the merged CSV file. Its parent directory is
            created when it does not exist yet.

    Returns:
        None. The merged table is written to ``output_path`` and a
        confirmation message is printed.

    Raises:
        ValueError: If none of ``file_paths`` matches a supported platform,
            so there is nothing to merge.
    """
    supported_platforms = ('vietnamworks', 'careerviet')

    # Load and standardize each supported file; both platforms go through
    # the same load/standardize steps
    standardized_dfs = []
    for file_path in file_paths:
        path_text = os.fspath(file_path).lower()
        if not any(platform in path_text for platform in supported_platforms):
            continue
        standardized_dfs.append(standardize_columns(load_csv(file_path)))

    # pd.concat would raise a bare "No objects to concatenate" ValueError;
    # raise the same exception type with an actionable message instead
    if not standardized_dfs:
        raise ValueError(
            "No supported files to merge: expected paths containing one of "
            f"{supported_platforms}."
        )

    # Concatenate all standardized DataFrames; the fill is a safeguard in
    # case any NaN remains after concatenation
    merged_df = pd.concat(standardized_dfs, ignore_index=True).fillna('N/A')

    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is given
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the merged DataFrame to CSV
    merged_df.to_csv(output_path, index=False)
    print(f"Files merged successfully into '{output_path}'.")

# Files to merge: the raw CareerViet file and the processed Vietnamworks file
file_paths = [
    '../../data/raw/careerviet_jobs_2024.csv',
    '../../data/processed/processed_vietnamworks_jobs_2024.csv',
]

# Location of the merged job list
output_path = '../../data/processed/IT_jobs_2024.csv'

# Build the merged file
merge_csv_files(file_paths=file_paths, output_path=output_path)

Files merged successfully into '../../data/processed/IT_jobs_2024.csv'.


Filter out the jobs whose skills do not match IT careers, i.e. the rows where 'Required Skills' is 'N/A'.

In [15]:
# Merged job list to read, and the file that receives the filtered rows
input_path = '../../data/processed/IT_jobs_2024.csv'
output_path = '../../data/processed/IT_jobs_2024_cleaned.csv'

# load_csv replaces missing values with the string 'N/A'
df = load_csv(input_path)

# Keep only the rows whose 'Required Skills' value is not the 'N/A' placeholder
df_cleaned = df.loc[df['Required Skills'].ne('N/A')]

# Write the remaining rows to the cleaned file, without the index column
df_cleaned.to_csv(output_path, index=False)

print(f"Cleaned data has been saved to '{output_path}'.")


Cleaned data has been saved to '../../data/processed/IT_jobs_2024_cleaned.csv'.
