In [7]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler

# 1. SETUP THE MATHEMATICAL PATTERNS
# We generate 1000 samples.
# n_features=10 corresponds to your 10 input columns (excluding Attrition).
# n_informative=8 means 8 of these columns strictly predict the outcome (High Quality Data).
X, y = make_classification(
    n_samples=3000, 
    n_features=10, 
    n_informative=8, 
    n_redundant=2, 
    n_clusters_per_class=1,
    random_state=42,
    flip_y=0.05 # Adds 5% "noise" so the model isn't 100% perfect (Realistic)
)

# 2. SCALERS: TRANSFORM RAW MATH INTO HR NUMBERS
# We use MinMaxScaler to force the generic sklearn numbers into specific HR ranges.
scaler_age = MinMaxScaler(feature_range=(23, 60))
scaler_income = MinMaxScaler(feature_range=(25000, 120000))
scaler_years = MinMaxScaler(feature_range=(0, 15))
scaler_promotions = MinMaxScaler(feature_range=(0, 4))
scaler_rating = MinMaxScaler(feature_range=(1, 4)) # Ratings 1 to 4

# 3. CONVERT FEATURES TO YOUR EXACT COLUMNS

# --- A. Numeric Columns (Direct Scaling) ---
# We take specific columns from X (0, 5, 6, 7, 9) and map them to your numeric fields
age = scaler_age.fit_transform(X[:, 0].reshape(-1, 1)).flatten().astype(int)
monthly_income = scaler_income.fit_transform(X[:, 5].reshape(-1, 1)).flatten().astype(int)
years_at_company = scaler_years.fit_transform(X[:, 6].reshape(-1, 1)).flatten().astype(int)
promotions = scaler_promotions.fit_transform(X[:, 7].reshape(-1, 1)).flatten().astype(int)
performance_rating = scaler_rating.fit_transform(X[:, 9].reshape(-1, 1)).flatten().astype(int)

# --- B. Categorical Columns (Binning) ---
# We use pd.cut to turn continuous numbers into categories (e.g., 0.1 -> 'Male', 0.9 -> 'Female')

# Feature 1: Gender (2 categories)
gender = pd.cut(X[:, 1], bins=2, labels=['Male', 'Female'])

# Feature 2: Education (3 categories)
education = pd.cut(X[:, 2], bins=3, labels=['Graduate', 'Post-Graduate', 'PhD'])

# Feature 3: Department (4 categories to match your example including Finance)
department = pd.cut(X[:, 3], bins=4, labels=['Sales', 'HR', 'IT', 'Finance'])

# Feature 4: Job Role (3 categories)
job_role = pd.cut(X[:, 4], bins=3, labels=['Executive', 'Lead', 'Manager'])

# Feature 8: Overtime (2 categories) - This is usually a strong predictor!
overtime = pd.cut(X[:, 8], bins=2, labels=['No', 'Yes'])

# 4. CREATE THE DATAFRAME
df = pd.DataFrame({
    'age': age,
    'gender': gender,
    'education': education,
    'department': department,
    'job_role': job_role,
    'monthly_income': monthly_income,
    'years_at_company': years_at_company,
    'promotions': promotions,
    'overtime': overtime,
    'performance_rating': performance_rating,
    'attrition': y # The target variable generated by Sklearn
})

# 5. SAVE TO CSV
df.to_csv('employee_attrition.csv', index=False)

print("✅ 'employee_attrition.csv' created successfully with 1000 rows.")
print("   The 'attrition' column is mathematically linked to the other columns.")
print("\n--- First 5 Rows ---")
print(df.head())

✅ 'employee_attrition.csv' created successfully with 1000 rows.
   The 'attrition' column is mathematically linked to the other columns.

--- First 5 Rows ---
   age gender      education department job_role  monthly_income  \
0   35   Male  Post-Graduate         IT     Lead           65994   
1   40   Male  Post-Graduate         IT     Lead           64492   
2   45   Male  Post-Graduate         IT  Manager           75776   
3   33   Male  Post-Graduate    Finance     Lead           67685   
4   31   Male  Post-Graduate         HR  Manager           45721   

   years_at_company  promotions overtime  performance_rating  attrition  
0                 9           1       No                   2          0  
1                 8           1       No                   2          1  
2                 8           1      Yes                   2          1  
3                 8           1       No                   2          0  
4                10           1      Yes                   2  

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   age                 3000 non-null   int64   
 1   gender              3000 non-null   category
 2   education           3000 non-null   category
 3   department          3000 non-null   category
 4   job_role            3000 non-null   category
 5   monthly_income      3000 non-null   int64   
 6   years_at_company    3000 non-null   int64   
 7   promotions          3000 non-null   int64   
 8   overtime            3000 non-null   category
 9   performance_rating  3000 non-null   int64   
 10  attrition           3000 non-null   int64   
dtypes: category(5), int64(6)
memory usage: 156.1 KB


In [9]:
df.duplicated().sum()

np.int64(0)