In [1]:
import os
import pandas as pd
import random

### Generate Dataset

In [5]:
import os
import pandas as pd
import random

def generate_employee_turnover_dataset(num_samples=1000, seed=None, output_dir="data"):
    """
    Generate a synthetic dataset on employee turnover and save it to a CSV file.

    Parameters:
    - num_samples (int): The number of samples (employee records) to generate in the dataset.
      A value of 0 (or less) produces an empty DataFrame that still has all columns.
    - seed (int, optional): Seed for a private random generator, making the output
      reproducible. When None (default), the module-level `random` state is used,
      so a prior `random.seed(...)` call still controls the result.
    - output_dir (str): Folder the CSV file is written to (created if missing).
      Defaults to "data".

    Returns:
    - pandas.DataFrame: A DataFrame containing the generated employee turnover data.

    The generated dataset includes the following columns:
    - EmployeeID: Unique identifier for each employee (1..num_samples).
    - Age: Employee's age (random integer between 22 and 65, inclusive).
    - Gender: Employee's gender (randomly selected from "Male" or "Female").
    - MaritalStatus: Employee's marital status (randomly selected from "Single", "Married", or "Divorced").
    - Education: Employee's education level (randomly selected from "High School", "Bachelor's", "Master's", or "PhD").
    - Department: Employee's department (randomly selected from predefined department names).
    - JobRole: Employee's job role (randomly selected from predefined job roles).
    - MonthlyIncome: Employee's monthly income (random integer between 2500 and 15000, inclusive).
    - YearsAtCompany: Number of years the employee has been with the company (random integer 0-30).
    - TotalWorkingYears: Total number of years the employee has worked (random integer 0-40).
    - YearsInCurrentRole: Number of years in the current role (random integer 0..YearsAtCompany).
    - Attrition: Employee attrition status (randomly selected from "Yes" or "No").

    Note: apart from YearsInCurrentRole, every attribute is drawn independently, so
    rows are not guaranteed to be mutually consistent (e.g. TotalWorkingYears may be
    smaller than YearsAtCompany).

    The dataset is saved to a CSV file named "employee_turnover_dataset.csv" inside
    `output_dir`.
    """
    columns = ["EmployeeID", "Age", "Gender", "MaritalStatus", "Education",
               "Department", "JobRole", "MonthlyIncome", "YearsAtCompany",
               "TotalWorkingYears", "YearsInCurrentRole", "Attrition"]

    # Define possible values for categorical attributes
    departments = ["HR", "Finance", "IT", "Marketing", "Sales"]
    job_roles = ["Manager", "Developer", "Designer", "Analyst", "HR Specialist", "Sales Representative"]
    education_levels = ["High School", "Bachelor's", "Master's", "PhD"]
    marital_statuses = ["Single", "Married", "Divorced"]
    genders = ["Male", "Female"]
    attrition_values = ["Yes", "No"]

    # A dedicated generator keeps seeded runs reproducible without touching the
    # global random state; without a seed we fall back to the global module.
    rng = random if seed is None else random.Random(seed)

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(output_dir, exist_ok=True)

    # Collect plain dict records and build the DataFrame once at the end.
    # Appending with pd.concat inside the loop copies the whole frame on every
    # iteration (quadratic) and leaves numeric columns with object dtype.
    records = []
    for employee_id in range(1, num_samples + 1):
        # The order of the draws below is kept stable on purpose so that a given
        # random state always produces the same rows.
        age = rng.randint(22, 65)
        gender = rng.choice(genders)
        marital_status = rng.choice(marital_statuses)
        education = rng.choice(education_levels)
        department = rng.choice(departments)
        job_role = rng.choice(job_roles)
        monthly_income = rng.randint(2500, 15000)
        years_at_company = rng.randint(0, 30)
        total_working_years = rng.randint(0, 40)
        # Time in the current role can never exceed time at the company
        years_in_current_role = rng.randint(0, years_at_company)
        attrition = rng.choice(attrition_values)

        records.append({
            "EmployeeID": employee_id,
            "Age": age,
            "Gender": gender,
            "MaritalStatus": marital_status,
            "Education": education,
            "Department": department,
            "JobRole": job_role,
            "MonthlyIncome": monthly_income,
            "YearsAtCompany": years_at_company,
            "TotalWorkingYears": total_working_years,
            "YearsInCurrentRole": years_in_current_role,
            "Attrition": attrition,
        })

    # Passing `columns` keeps the schema (and CSV header) intact even with no rows
    data = pd.DataFrame(records, columns=columns)

    # Save the dataset to a CSV file inside the output folder
    data.to_csv(os.path.join(output_dir, "employee_turnover_dataset.csv"), index=False)

    return data

In [6]:
# Make sure a dataset is available on disk: report any CSV files already in
# the "data" folder, otherwise generate a fresh synthetic dataset.
# os.path.isdir() is False for a missing path, so no separate exists() check is needed.
data_folder_exists = os.path.isdir("data")

if data_folder_exists:
    # Sort the listing so the printed output is deterministic across runs
    # (os.listdir returns entries in arbitrary order).
    csv_files = sorted(name for name in os.listdir("data") if name.endswith(".csv"))
else:
    print("The 'data' folder does not exist.")
    os.makedirs("data", exist_ok=True)
    csv_files = []

if csv_files:
    print("CSV file(s) found in the 'data' folder:")
    for csv_file in csv_files:
        print(csv_file)
else:
    if data_folder_exists:
        print("No CSV files found in the 'data' folder.")
    # Single generation path, so both "folder missing" and "folder empty"
    # cases create the dataset and report it the same way.
    generate_employee_turnover_dataset(num_samples=1000)
    print("New dataset has been generated.")


No CSV files found in the 'data' folder.
New dataset has been generated.
