In [7]:
import pandas as pd
import numpy as np

# Input/output locations are named once so the "saved as" message can never
# drift from the path that is actually written.
INPUT_CSV = 'wages_2023_original.csv'
OUTPUT_CSV = 'Wages-2023.csv'

# Columns removed outright. NOTE(review): these look like the French-language
# counterparts of columns kept in English -- confirm against the data dictionary.
COLUMNS_TO_DROP = [
    'NOC_TITLE_FRA',
    'Nom_RE',
    'Data_Source_F',
    'Wage_Comment_F',
]

# Original column name -> short snake_case name used in the cleaned dataset.
# 'Data_Source_E' is intentionally absent and therefore keeps its name.
COLUMN_RENAMES = {
    'NOC_CNP': 'noc_code',
    'NOC_TITLE_ENG': 'job_title',
    'prov': 'province',
    'ER_Code_Code_RE': 'region_code',
    'ER_Name': 'region_name',
    'Low_Wage_Salaire_Minium': 'wage_low',
    'Median_Wage_Salaire_Median': 'wage_median',
    'High_Wage_Salaire_Maximal': 'wage_high',
    'Average_Wage_Salaire_Moyen': 'wage_avg',
    'Reference_Period': 'ref_period',
    'Revision_Date_Date_revision': 'revision_date',
    'Annual_Wage_Flag_Salaire_annuel': 'is_annual',
    'Wage_Comment_E': 'wage_comment',
}

WAGE_COLUMNS = ['wage_low', 'wage_median', 'wage_high', 'wage_avg']


def clean_wages(df):
    """Return a cleaned copy of the raw wage table; *df* is not modified.

    Steps, in order:
      1. drop every column listed in ``COLUMNS_TO_DROP``;
      2. rename columns according to ``COLUMN_RENAMES``;
      3. drop rows whose ``wage_low`` is missing;
      4. strip a leading ``'NOC_'`` from ``noc_code``.

    Rows with a missing ``province`` are kept: only ``wage_low`` is used as
    the drop criterion.

    Raises:
        KeyError: if a column in ``COLUMNS_TO_DROP``, or the ``wage_low`` /
            ``noc_code`` source column, is absent from *df*.
    """
    cleaned = df.drop(columns=COLUMNS_TO_DROP).rename(columns=COLUMN_RENAMES)

    # dropna returns a new frame, so the column assignment below does not
    # write into a view of the caller's data.
    cleaned = cleaned.dropna(subset=['wage_low'])

    # Anchor the pattern and state regex=True explicitly: the default of
    # `regex` differs between pandas versions, and only a true prefix should
    # be removed (not an occurrence of 'NOC_' elsewhere in the value).
    cleaned['noc_code'] = cleaned['noc_code'].str.replace(
        r'^NOC_', '', regex=True
    )
    return cleaned


# True when run as a script or as a notebook cell, so `df` and `df_cleaned`
# are still created as module-level names for later cells.
if __name__ == "__main__":
    # Read the CSV file
    df = pd.read_csv(INPUT_CSV, low_memory=False)

    df_cleaned = clean_wages(df)

    # Display information about the cleaned dataset.
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly instead of being wrapped in print().
    print("Cleaned Dataset Info:")
    df_cleaned.info()
    print("\nFirst few rows:")
    print(df_cleaned.head())

    # Basic statistics for wage columns
    print("\nWage Statistics:")
    print(df_cleaned[WAGE_COLUMNS].describe())

    # Check missing values in cleaned dataset
    print("\nMissing values:")
    print(df_cleaned.isnull().sum())

    # Save the cleaned dataset.
    # NOTE: noc_code values such as '00010' are strings with leading zeros;
    # anyone re-reading this CSV should load that column as text
    # (e.g. dtype={'noc_code': str}) or the zeros will be lost.
    df_cleaned.to_csv(OUTPUT_CSV, index=False)
    print(f"\nCleaned dataset saved as '{OUTPUT_CSV}'")

Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 19617 entries, 0 to 44369
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   noc_code       19617 non-null  object 
 1   job_title      19617 non-null  object 
 2   province       19102 non-null  object 
 3   region_code    19617 non-null  object 
 4   region_name    19617 non-null  object 
 5   wage_low       19617 non-null  float64
 6   wage_median    19617 non-null  float64
 7   wage_high      19610 non-null  float64
 8   wage_avg       18439 non-null  float64
 9   Data_Source_E  19617 non-null  object 
 10  ref_period     19617 non-null  object 
 11  revision_date  19617 non-null  object 
 12  is_annual      19617 non-null  int64  
 13  wage_comment   213 non-null    object 
dtypes: float64(4), int64(1), object(9)
memory usage: 2.2+ MB
None

First few rows:
   noc_code    job_title province region_code    region_name  wage_low  \
0     00010 