# Data Issue Generation Notebook

### 1. Import Libraries and Load Dataset

In [1]:
# 1. Import libraries and load dataset
import pandas as pd
import numpy as np
import random
import os

# Load employee dataset
dataset_path = '../airflow/data/employee_data.csv'
df = pd.read_csv(dataset_path)
df_errors = df.copy()
df_errors.rename(columns={'PerformanceRating': 'actual_label'}, inplace=True)

df_errors.head()


Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,8410,31,Male,19,Education,5390,Excellent,Medium,Average,2,...,0,Mid,Medium,89,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534,Poor,High,Low,3,...,3,Mid,Medium,21,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159,Good,High,Low,0,...,3,Mid,Medium,74,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989,Good,High,High,1,...,2,Mid,Small,50,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821,Fair,Very High,Average,0,...,0,Senior,Medium,68,No,No,No,Fair,Medium,Stayed


### 1. Introduce Missing Values (in 'Age' and 'Blood Glucose Level')

In [2]:
missing_percentage = 0.1
n_missing = int(len(df_errors) * missing_percentage)

# Random missing values
missing_age_indices = random.sample(range(len(df_errors)), n_missing)
missing_income_indices = random.sample(range(len(df_errors)), n_missing)

df_errors.loc[missing_age_indices, 'Age'] = np.nan
df_errors.loc[missing_income_indices, 'MonthlyIncome'] = np.nan
df_errors.loc[missing_income_indices, 'Gender'] = np.nan


### 2. Introduce Unknown Values in 'Gender'

In [3]:
unknown_gender_indices = random.sample(range(len(df_errors)), 25)
df_errors.loc[unknown_gender_indices, 'Gender'] = 'Alien'


### 3. Introduce Wrong data Values for Features

In [4]:
wrong_age_indices = random.sample(range(len(df_errors)), 25)
wrong_income_indices = random.sample(range(len(df_errors)), 20)

df_errors.loc[wrong_age_indices, 'Age'] = -42
df_errors.loc[wrong_income_indices, 'MonthlyIncome'] = -10000


### 4. Introduce String in Numerical Columns

In [5]:
string_income_indices = random.sample(range(len(df_errors)), 25)
string_age_indices = random.sample(range(len(df_errors)), 25)

income_strings = ['TenK', 'FifteenK']
age_strings = ['Young', 'Old', 'Teen']

for idx, val in zip(string_income_indices, random.choices(income_strings, k=len(string_income_indices))):
    df_errors.loc[idx, 'MonthlyIncome'] = val

for idx, val in zip(string_age_indices, random.choices(age_strings, k=len(string_age_indices))):
    df_errors.loc[idx, 'Age'] = val


  df_errors.loc[idx, 'MonthlyIncome'] = val
  df_errors.loc[idx, 'Age'] = val


### 5. Introduce Outliers

In [6]:
outlier_indices = random.sample(range(len(df_errors)), 20)
df_errors.loc[outlier_indices, 'MonthlyIncome'] = random.choices([1_000_000, 999_999], k=len(outlier_indices))


### 6. Special Characters or Corrupted Data

In [7]:
special_char_indices = random.sample(range(len(df_errors)), 25)
corrupt_values = random.choices(['@#$', '*&%', '!!!', '###', '$$$'], k=len(special_char_indices))

df_errors.loc[special_char_indices, 'JobRole'] = corrupt_values

### 7. Random Duplicate rows

In [8]:
swap_indices = random.sample(range(len(df_errors)), 15)
df_errors.loc[swap_indices, ['Age', 'MonthlyIncome']] = df_errors.loc[swap_indices, ['MonthlyIncome', 'Age']].values


###  8. Introduce Missing Columns

In [9]:
part1 = df_errors.iloc[:500].drop(columns=['Gender'], errors='ignore')  # Drop critical column
part2 = df_errors.iloc[500:]


### ->Saving Error Files into Generated folder

In [12]:
output_dir = '../airflow/data/generated_errors'
os.makedirs(output_dir, exist_ok=True)

output_file1 = os.path.join(output_dir, 'employee_data_with_errors.csv')
output_file2 = os.path.join(output_dir, 'employee_data2_with_errors.csv')

# Remove columns that look like duplicates at the end
expected_columns = ['Employee ID', 'Age', 'Gender', 'Years at Company', 'Job Role', 'Monthly Income', 
                   'Work-Life Balance', 'Job Satisfaction', 'Performance Rating', 'Number of Promotions',
                   'Overtime', 'Distance from Home', 'Education Level', 'Marital Status', 
                   'Number of Dependents', 'Job Level', 'Company Size', 'Company Tenure', 'Remote Work',
                   'Leadership Opportunities', 'Innovation Opportunities', 'Company Reputation', 
                   'Employee Recognition', 'Attrition']

# For part1, Gender was intentionally dropped
part1_expected = [col for col in expected_columns if col != 'Gender']

# Clean part1 (keep only expected columns that exist)
part1_clean = part1[[col for col in part1_expected if col in part1.columns]]

# Clean part2 (keep only expected columns that exist)  
part2_clean = part2[[col for col in expected_columns if col in part2.columns]]

part1_clean.to_csv(output_file1, index=False, na_rep="NaN")
part2_clean.to_csv(output_file2, index=False, na_rep="NaN")

print(f"✅ Corrupted files saved:")
print(f"- {output_file1} - Columns: {list(part1_clean.columns)}")
print(f"- {output_file2} - Columns: {list(part2_clean.columns)}")

✅ Corrupted files saved:
- ../airflow/data/generated_errors\employee_data_with_errors.csv - Columns: ['Employee ID', 'Age', 'Years at Company', 'Job Role', 'Monthly Income', 'Work-Life Balance', 'Job Satisfaction', 'Performance Rating', 'Number of Promotions', 'Overtime', 'Distance from Home', 'Education Level', 'Marital Status', 'Number of Dependents', 'Job Level', 'Company Size', 'Company Tenure', 'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities', 'Company Reputation', 'Employee Recognition', 'Attrition']
- ../airflow/data/generated_errors\employee_data2_with_errors.csv - Columns: ['Employee ID', 'Age', 'Gender', 'Years at Company', 'Job Role', 'Monthly Income', 'Work-Life Balance', 'Job Satisfaction', 'Performance Rating', 'Number of Promotions', 'Overtime', 'Distance from Home', 'Education Level', 'Marital Status', 'Number of Dependents', 'Job Level', 'Company Size', 'Company Tenure', 'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities', 'Compan