<a href="https://colab.research.google.com/github/paranthaman76/ai-job-dataset/blob/main/Data_Preprocessing_Using_Numpy_and_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Load the dataset for the Pandas demonstrations.
df_pandas = pd.read_csv('/content/ai_job_dataset.csv')

print("--- Pandas Data Preprocessing Techniques ---")

# 1. Categorical Encoding: One-Hot Encoding
# 'experience_level' and 'employment_type' are replaced by one binary
# indicator column per category value.
print("\n1. Categorical Encoding (One-Hot Encoding) using Pandas:")
print("Original unique values for 'experience_level':", df_pandas['experience_level'].unique())
print("Original unique values for 'employment_type':", df_pandas['employment_type'].unique())

# drop_first=True drops the first category of each feature to prevent
# multicollinearity between the dummy columns.
# dtype=int forces 0/1 indicator values: recent pandas releases emit
# boolean (True/False) dummies by default, which would contradict the
# "1 ... 0 otherwise" explanation printed below.
df_pandas = pd.get_dummies(
    df_pandas,
    columns=['experience_level', 'employment_type'],
    drop_first=True,
    dtype=int,
)

print("\nDataFrame head after One-Hot Encoding (showing all columns including newly created dummy columns):")
# Print the head of the whole DataFrame rather than hardcoding dummy column
# names, so whichever dummy columns were actually created are shown.
print(df_pandas.head())
print("\nExplanation: `pd.get_dummies()` converts categorical columns into numerical (binary) columns. Each unique category value gets a new column, with 1 if the row belongs to that category and 0 otherwise. `drop_first=True` avoids multicollinearity by dropping the first dummy variable created for each original categorical column, which can vary based on alphabetical order of unique values.")

# 2. Feature Scaling: Min-Max Scaling
# 'salary_usd' and 'years_experience' are rescaled into the range [0, 1].
# Formula: X_scaled = (X - X_min) / (X_max - X_min)
print("\n2. Feature Scaling (Min-Max Scaling) using Pandas:")
numerical_cols_to_scale_pandas = ['salary_usd', 'years_experience']

for col in numerical_cols_to_scale_pandas:
    min_val = df_pandas[col].min()
    max_val = df_pandas[col].max()
    value_range = max_val - min_val
    if value_range == 0:
        # Constant column: dividing by a zero range would yield 0/0 = NaN.
        # (X - X_min) is already 0 for every present value, so use it
        # directly as the scaled result (missing values stay NaN).
        df_pandas[col + '_scaled'] = (df_pandas[col] - min_val).astype(float)
    else:
        df_pandas[col + '_scaled'] = (df_pandas[col] - min_val) / value_range

print("\nDataFrame head with scaled 'salary_usd' and 'years_experience' columns:")
print(df_pandas[['salary_usd', 'salary_usd_scaled', 'years_experience', 'years_experience_scaled']].head())
print("\nExplanation: Min-Max Scaling transforms numerical features to a specific range (e.g., [0, 1]) by subtracting the minimum value and dividing by the range (max - min).")


# Read the CSV into a fresh DataFrame and pull the numeric columns out as a
# single 2-D NumPy array (one array column per entry of numerical_cols_numpy).
df_numpy = pd.read_csv('ai_job_dataset.csv')
numerical_cols_numpy = ['salary_usd', 'years_experience', 'remote_ratio']
numpy_data = df_numpy[numerical_cols_numpy].values

print("\n--- NumPy Data Preprocessing Techniques ---")

# 1. Feature Scaling (Min-Max Scaling) using NumPy
print("\n1. Feature Scaling (Min-Max Scaling) using NumPy:")

# Positions of the two columns to scale inside the 2-D array.
salary_usd_idx = numerical_cols_numpy.index('salary_usd')
years_experience_idx = numerical_cols_numpy.index('years_experience')

# Show the untouched values first so they can be compared with the result.
print("Original 'salary_usd' (first 5 values):", df_numpy['salary_usd'].values[:5])
print("Original 'years_experience' (first 5 values):", df_numpy['years_experience'].values[:5])

# astype() returns a new C-ordered float64 array, so numpy_data itself is
# left unmodified and the divisions below are floating-point.
numpy_data_scaled = numpy_data.astype(np.float64, order='C')

# Scale 'salary_usd' in place through a view of its column.
salary_column = numpy_data_scaled[:, salary_usd_idx]
min_salary = salary_column.min()
max_salary = salary_column.max()
salary_column[:] = (salary_column - min_salary) / (max_salary - min_salary)

# Scale 'years_experience' the same way.
years_column = numpy_data_scaled[:, years_experience_idx]
min_years = years_column.min()
max_years = years_column.max()
years_column[:] = (years_column - min_years) / (max_years - min_years)

print("Scaled 'salary_usd' (first 5 values):")
print(numpy_data_scaled[:5, salary_usd_idx])
print("Scaled 'years_experience' (first 5 values):")
print(numpy_data_scaled[:5, years_experience_idx])
print("\nExplanation: Individual columns are selected using array slicing (`[:, column_index]`). The array is explicitly converted to float (`.astype(np.float64)`) to ensure floating-point division. The Min-Max scaling formula is then applied element-wise using NumPy's vectorized operations. A copy of the array is made to demonstrate the scaled result without altering the original DataFrame's underlying array values if it were used elsewhere.")

# 2. Data Type Conversion (on a numerical array)
# Demonstrates ndarray.astype(): the 'salary_usd' values are converted to
# float32 in a new array; the source array keeps its dtype.

print("\n2. Data Type Conversion (on numerical array) using Numpy:")
salary_usd_original_np = df_numpy['salary_usd'].values
print(
    "Original dtype of 'salary_usd' column (NumPy array):",
    salary_usd_original_np.dtype,
)

# Convert to float32 (astype returns a new array).
salary_usd_float32 = salary_usd_original_np.astype(np.float32)
print(
    "New dtype of 'salary_usd' column after conversion to float32:",
    salary_usd_float32.dtype,
)
print("Syntax: array.astype(dtype) - Creates a new array with the same data converted to the specified data type.")


# 3. Simple numerical categorical mapping for 'remote_ratio':
#    0 -> 0, 50 -> 1, every other value -> 2.

print("\n3. Simple Numerical Categorical Mapping (for 'remote_ratio') using NumPy:")
remote_ratio_np = df_numpy['remote_ratio'].values  # original values, kept for comparison
print("Original 'remote_ratio' (first 10 values):", remote_ratio_np[:10])

# np.select picks, per element, the choice belonging to the first condition
# that holds and falls back to `default` when none does; this is the same
# mapping as the nested np.where described in the explanation below.
mapped_remote_ratios_np = np.select(
    [remote_ratio_np == 0, remote_ratio_np == 50],
    [0, 1],
    default=2,
)

print("Mapped 'remote_ratio' (first 10 values):", mapped_remote_ratios_np[:10])
print("\nExplanation: `np.where(condition, x, y)` returns elements chosen from `x` or `y` depending on `condition`. Nested `np.where` is used to create a simple mapping for discrete numerical categories. This is a basic form of numerical encoding within NumPy, suitable for a limited number of known discrete values.")

--- Pandas Data Preprocessing Techniques ---

1. Categorical Encoding (One-Hot Encoding) using Pandas:
Original unique values for 'experience_level': ['SE' 'EN' 'MI' 'EX']
Original unique values for 'employment_type': ['CT' 'FL' 'PT' 'FT']

DataFrame head after One-Hot Encoding (showing all columns including newly created dummy columns):
    job_id              job_title  salary_usd salary_currency  \
0  AI00001  AI Research Scientist       90376             USD   
1  AI00002   AI Software Engineer       61895             USD   
2  AI00003          AI Specialist      152626             USD   
3  AI00004           NLP Engineer       80215             USD   
4  AI00005          AI Consultant       54624             EUR   

  company_location company_size employee_residence  remote_ratio  \
0            China            M              China            50   
1           Canada            M            Ireland           100   
2      Switzerland            L        South Korea             0 