### Environment Setup

We run this if using on Windows machine.
This will set the environment variables for PySpark to use the correct Python executable.

In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

### Build Spark Session

In [2]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder \
    .master("local[1]") \
    .config("spark.rpc.askTimeout", "600s") \
    .config("spark.driver.network.timeout", "600s") \
    .config("spark.network.timeout", "600s") \
    .appName("Attrition Prediction") \
    .getOrCreate()

spark

### Data Load

In [3]:
url_train_data = 'data/train.csv'
url_test_data = 'data/test.csv'

raw_train_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(url_train_data)

raw_test_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(url_test_data)

### Dataset Exploration

In [4]:
# Total features (columns) of our dataset
raw_train_df.printSchema()

root
 |-- Employee ID: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Years at Company: integer (nullable = true)
 |-- Job Role: string (nullable = true)
 |-- Monthly Income: integer (nullable = true)
 |-- Work-Life Balance: string (nullable = true)
 |-- Job Satisfaction: string (nullable = true)
 |-- Performance Rating: string (nullable = true)
 |-- Number of Promotions: integer (nullable = true)
 |-- Overtime: string (nullable = true)
 |-- Distance from Home: integer (nullable = true)
 |-- Education Level: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Number of Dependents: integer (nullable = true)
 |-- Job Level: string (nullable = true)
 |-- Company Size: string (nullable = true)
 |-- Company Tenure: integer (nullable = true)
 |-- Remote Work: string (nullable = true)
 |-- Leadership Opportunities: string (nullable = true)
 |-- Innovation Opportunities: string (nullable = true)
 |-- Company Repu

In [5]:
# Total records (rows) of our dataset
print("Train dataset count:", raw_train_df.count())
print("Test dataset count:", raw_test_df.count())
print("Total dataset count:", raw_train_df.count() + raw_test_df.count())

Train dataset count: 59598
Test dataset count: 14900
Total dataset count: 74498


In [6]:
# Show the top 5 rows to see what kind of data we have and if we'll need to prepare it before building the model
raw_train_df.show(10)

+-----------+---+------+----------------+----------+--------------+-----------------+----------------+------------------+--------------------+--------+------------------+-----------------+--------------+--------------------+---------+------------+--------------+-----------+------------------------+------------------------+------------------+--------------------+---------+
|Employee ID|Age|Gender|Years at Company|  Job Role|Monthly Income|Work-Life Balance|Job Satisfaction|Performance Rating|Number of Promotions|Overtime|Distance from Home|  Education Level|Marital Status|Number of Dependents|Job Level|Company Size|Company Tenure|Remote Work|Leadership Opportunities|Innovation Opportunities|Company Reputation|Employee Recognition|Attrition|
+-----------+---+------+----------------+----------+--------------+-----------------+----------------+------------------+--------------------+--------+------------------+-----------------+--------------+--------------------+---------+------------+---

In [7]:
from pyspark.sql.functions import col
from pyspark.sql.functions import col, sum as _sum

# Count null values in each column of the train dataset
null_counts = raw_train_df.select(
    [_sum(col(c).isNull().cast("int")).alias(c + "_null_count") for c in raw_train_df.columns]
)

# Show the result
null_counts.show()

+----------------------+--------------+-----------------+---------------------------+-------------------+-------------------------+----------------------------+---------------------------+-----------------------------+-------------------------------+-------------------+-----------------------------+--------------------------+-------------------------+-------------------------------+--------------------+-----------------------+-------------------------+----------------------+-----------------------------------+-----------------------------------+-----------------------------+-------------------------------+--------------------+
|Employee ID_null_count|Age_null_count|Gender_null_count|Years at Company_null_count|Job Role_null_count|Monthly Income_null_count|Work-Life Balance_null_count|Job Satisfaction_null_count|Performance Rating_null_count|Number of Promotions_null_count|Overtime_null_count|Distance from Home_null_count|Education Level_null_count|Marital Status_null_count|Number of De

In [8]:
# Count null values in each column of the test dataset
null_counts = raw_test_df.select(
    [_sum(col(c).isNull().cast("int")).alias(c + "_null_count") for c in raw_test_df.columns]
)

# Show the result
null_counts.show()

+----------------------+--------------+-----------------+---------------------------+-------------------+-------------------------+----------------------------+---------------------------+-----------------------------+-------------------------------+-------------------+-----------------------------+--------------------------+-------------------------+-------------------------------+--------------------+-----------------------+-------------------------+----------------------+-----------------------------------+-----------------------------------+-----------------------------+-------------------------------+--------------------+
|Employee ID_null_count|Age_null_count|Gender_null_count|Years at Company_null_count|Job Role_null_count|Monthly Income_null_count|Work-Life Balance_null_count|Job Satisfaction_null_count|Performance Rating_null_count|Number of Promotions_null_count|Overtime_null_count|Distance from Home_null_count|Education Level_null_count|Marital Status_null_count|Number of De

### Dataset Preparation

In [9]:
from pyspark.ml.feature import StringIndexer, StandardScaler, VectorAssembler
from pyspark.ml import Pipeline

columns_to_drop = [
    'Gender', 'Job Role', 'Work-Life Balance', 'Job Satisfaction', 'Performance Rating', 
    'Overtime', 'Education Level', 'Marital Status', 'Job Level', 'Company Size', 
    'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities', 'Company Reputation', 
    'Employee Recognition', 'Attrition'
]

numerical_columns = ['Age', 'Years at Company', 'Monthly Income', 'Number of Promotions', 
                      'Distance from Home', 'Number of Dependents', 'Company Tenure']

# Create a list of StringIndexer transformers
indexers = [
    StringIndexer(inputCol='Gender', outputCol='Gender Indx'),
    StringIndexer(inputCol='Job Role', outputCol='Job Role Indx'),
    StringIndexer(inputCol='Work-Life Balance', outputCol='Work-Life Balance Indx'),
    StringIndexer(inputCol='Job Satisfaction', outputCol='Job Satisfaction Indx'),
    StringIndexer(inputCol='Performance Rating', outputCol='Performance Rating Indx'),
    StringIndexer(inputCol='Overtime', outputCol='Overtime Indx'),
    StringIndexer(inputCol='Education Level', outputCol='Education Level Indx'),
    StringIndexer(inputCol='Marital Status', outputCol='Marital Status Indx'),
    StringIndexer(inputCol='Job Level', outputCol='Job Level Indx'),
    StringIndexer(inputCol='Company Size', outputCol='Company Size Indx'),
    StringIndexer(inputCol='Remote Work', outputCol='Remote Work Indx'),
    StringIndexer(inputCol='Leadership Opportunities', outputCol='Leadership Opportunities Indx'),
    StringIndexer(inputCol='Innovation Opportunities', outputCol='Innovation Opportunities Indx'),
    StringIndexer(inputCol='Company Reputation', outputCol='Company Reputation Indx'),
    StringIndexer(inputCol='Employee Recognition', outputCol='Employee Recognition Indx'),
    StringIndexer(inputCol='Attrition', outputCol='Attrition Indx')
]

# Assemble numerical features into a single vector column using VectorAssembler
assembler = VectorAssembler(inputCols=numerical_columns, outputCol='numerical_features')

# Apply standardization or scaling only on the numerical features
scaler = StandardScaler(inputCol='numerical_features', outputCol='scaled_numerical_features')

# Create a pipeline
pipeline = Pipeline(stages=indexers + [assembler, scaler])

# Fit and transform the data using the previously created pipeline for multiple indexes
train_df = pipeline.fit(raw_train_df).transform(raw_train_df)
test_df = pipeline.fit(raw_test_df).transform(raw_test_df)

# Replace the original columns with the indexed columns
for col_name in columns_to_drop:
    indexed_col = col_name + " Indx"
    train_df = train_df.drop(col_name).withColumnRenamed(indexed_col, col_name)
    test_df = test_df.drop(col_name).withColumnRenamed(indexed_col, col_name)

train_df.show(10)

+-----------+---+----------------+--------------+--------------------+------------------+--------------------+--------------+------+--------+-----------------+----------------+------------------+--------+---------------+--------------+---------+------------+-----------+------------------------+------------------------+------------------+--------------------+---------+--------------------+-------------------------+
|Employee ID|Age|Years at Company|Monthly Income|Number of Promotions|Distance from Home|Number of Dependents|Company Tenure|Gender|Job Role|Work-Life Balance|Job Satisfaction|Performance Rating|Overtime|Education Level|Marital Status|Job Level|Company Size|Remote Work|Leadership Opportunities|Innovation Opportunities|Company Reputation|Employee Recognition|Attrition|  numerical_features|scaled_numerical_features|
+-----------+---+----------------+--------------+--------------------+------------------+--------------------+--------------+------+--------+-----------------+-----

In [10]:
train_df.select('scaled_numerical_features').show(10, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------+
|scaled_numerical_features                                                                                                             |
+--------------------------------------------------------------------------------------------------------------------------------------+
|[2.566294627454518,1.6894924057870304,2.505278488141217,2.010068203249427,0.7728393618666017,0.0,3.502407874141631]                   |
|[4.884238161929567,0.355682611744638,2.5722098614793123,3.0151023048741408,0.7377102999635744,1.9284062249689407,0.8264108467075758]  |
|[1.986808743835756,0.8892065293615949,3.7923130212883467,0.0,0.38641968093330087,1.9284062249689407,2.9121144122076483]               |
|[2.9802131157536342,0.6224445705531165,1.8540920017059952,1.0050341016247135,0.9484846713817384,1.2856041499792938,1.9676448731132758]|
|[4.6358870689500975,3.6457467703825395,2

In [12]:
from pyspark.ml.linalg import Vectors

numerical_columns = ['Age Scaled', 'Years at Company Scaled', 'Monthly Income Scaled', 'Number of Promotions Scaled', 
                      'Distance from Home Scaled', 'Number of Dependents Scaled', 'Company Tenure Scaled']

#temp = train_df.repartition(10)
temp = train_df.select('scaled_numerical_features')
temp = temp.rdd.map(lambda x:[float(y) for y in x ['scaled_numerical_features']]).toDF(numerical_columns)

temp.show()


+------------------+-----------------------+---------------------+---------------------------+-------------------------+---------------------------+---------------------+
|        Age Scaled|Years at Company Scaled|Monthly Income Scaled|Number of Promotions Scaled|Distance from Home Scaled|Number of Dependents Scaled|Company Tenure Scaled|
+------------------+-----------------------+---------------------+---------------------------+-------------------------+---------------------------+---------------------+
| 2.566294627454518|     1.6894924057870304|    2.505278488141217|          2.010068203249427|       0.7728393618666017|                        0.0|    3.502407874141631|
| 4.884238161929567|      0.355682611744638|   2.5722098614793123|         3.0151023048741408|       0.7377102999635744|         1.9284062249689407|   0.8264108467075758|
| 1.986808743835756|     0.8892065293615949|   3.7923130212883467|                        0.0|      0.38641968093330087|         1.92840622496894

In [None]:
temp.count()

59598

In [14]:
for col_name in temp.columns:
    train_df = train_df.withColumn(col_name, temp[col_name])

AnalysisException: [MISSING_ATTRIBUTES.RESOLVED_ATTRIBUTE_MISSING_FROM_INPUT] Resolved attribute(s) "Age Scaled" missing from "Employee ID", "Age", "Years at Company", "Monthly Income", "Number of Promotions", "Distance from Home", "Number of Dependents", "Company Tenure", "Gender", "Job Role", "Work-Life Balance", "Job Satisfaction", "Performance Rating", "Overtime", "Education Level", "Marital Status", "Job Level", "Company Size", "Remote Work", "Leadership Opportunities", "Innovation Opportunities", "Company Reputation", "Employee Recognition", "Attrition", "numerical_features", "scaled_numerical_features" in operator !Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities#6331, Innovation Opportunities#6452, Company Reputation#6569, Employee Recognition#6682, Attrition#6791, ... 3 more fields]. ;
!Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities#6331, Innovation Opportunities#6452, Company Reputation#6569, Employee Recognition#6682, Attrition#6791, ... 3 more fields]
+- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities#6331, Innovation Opportunities#6452, Company Reputation#6569, Employee Recognition#6682, Attrition Indx#2678 AS Attrition#6791, ... 2 more fields]
   +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities#6331, Innovation Opportunities#6452, Company Reputation#6569, Employee Recognition#6682, Attrition Indx#2678, ... 2 more fields]
      +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities#6331, Innovation Opportunities#6452, Company Reputation#6569, Employee Recognition Indx#2632 AS Employee Recognition#6682, ... 3 more fields]
         +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities#6331, Innovation Opportunities#6452, Company Reputation#6569, Employee Recognition Indx#2632, ... 3 more fields]
            +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities#6331, Innovation Opportunities#6452, Company Reputation Indx#2587 AS Company Reputation#6569, ... 4 more fields]
               +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities#6331, Innovation Opportunities#6452, Company Reputation Indx#2587, ... 4 more fields]
                  +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities#6331, Innovation Opportunities Indx#2543 AS Innovation Opportunities#6452, ... 5 more fields]
                     +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities#6331, Innovation Opportunities Indx#2543, ... 5 more fields]
                        +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities Indx#2500 AS Leadership Opportunities#6331, ... 6 more fields]
                           +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work#6206, Leadership Opportunities Indx#2500, ... 6 more fields]
                              +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work Indx#2458 AS Remote Work#6206, ... 7 more fields]
                                 +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size#6077, Remote Work Indx#2458, ... 7 more fields]
                                    +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size Indx#2417 AS Company Size#6077, ... 8 more fields]
                                       +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level#5944, Company Size Indx#2417, ... 8 more fields]
                                          +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level Indx#2377 AS Job Level#5944, ... 9 more fields]
                                             +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status#5807, Job Level Indx#2377, ... 9 more fields]
                                                +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status Indx#2338 AS Marital Status#5807, ... 10 more fields]
                                                   +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level#5666, Marital Status Indx#2338, ... 10 more fields]
                                                      +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level Indx#2300 AS Education Level#5666, ... 11 more fields]
                                                         +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime#5521, Education Level Indx#2300, ... 11 more fields]
                                                            +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime Indx#2263 AS Overtime#5521, ... 12 more fields]
                                                               +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating#5372, Overtime Indx#2263, ... 12 more fields]
                                                                  +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating Indx#2227 AS Performance Rating#5372, ... 13 more fields]
                                                                     +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction#5219, Performance Rating Indx#2227, ... 13 more fields]
                                                                        +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction Indx#2192 AS Job Satisfaction#5219, ... 14 more fields]
                                                                           +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance#5062, Job Satisfaction Indx#2192, ... 14 more fields]
                                                                              +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance Indx#2158 AS Work-Life Balance#5062, ... 15 more fields]
                                                                                 +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role#4901, Work-Life Balance Indx#2158, ... 15 more fields]
                                                                                    +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role Indx#2125 AS Job Role#4901, ... 16 more fields]
                                                                                       +- Project [Employee ID#17, Age#18, Years at Company#20, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender#4736, Job Role Indx#2125, ... 16 more fields]
                                                                                          +- Project [Employee ID#17, Age#18, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender Indx#2093 AS Gender#4736, ... 17 more fields]
                                                                                             +- Project [Employee ID#17, Age#18, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, Gender Indx#2093, ... 17 more fields]
                                                                                                +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 18 more fields]
                                                                                                   +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 17 more fields]
                                                                                                      +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 16 more fields]
                                                                                                         +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 15 more fields]
                                                                                                            +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 14 more fields]
                                                                                                               +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 13 more fields]
                                                                                                                  +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 12 more fields]
                                                                                                                     +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 11 more fields]
                                                                                                                        +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 10 more fields]
                                                                                                                           +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 9 more fields]
                                                                                                                              +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 8 more fields]
                                                                                                                                 +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 7 more fields]
                                                                                                                                    +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 6 more fields]
                                                                                                                                       +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 5 more fields]
                                                                                                                                          +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 4 more fields]
                                                                                                                                             +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 3 more fields]
                                                                                                                                                +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, ... 2 more fields]
                                                                                                                                                   +- Project [Employee ID#17, Age#18, Gender#19, Years at Company#20, Job Role#21, Monthly Income#22, Work-Life Balance#23, Job Satisfaction#24, Performance Rating#25, Number of Promotions#26, Overtime#27, Distance from Home#28, Education Level#29, Marital Status#30, Number of Dependents#31, Job Level#32, Company Size#33, Company Tenure#34, Remote Work#35, Leadership Opportunities#36, Innovation Opportunities#37, Company Reputation#38, Employee Recognition#39, Attrition#40, UDF(cast(Gender#19 as string)) AS Gender Indx#2093]
                                                                                                                                                      +- Relation [Employee ID#17,Age#18,Gender#19,Years at Company#20,Job Role#21,Monthly Income#22,Work-Life Balance#23,Job Satisfaction#24,Performance Rating#25,Number of Promotions#26,Overtime#27,Distance from Home#28,Education Level#29,Marital Status#30,Number of Dependents#31,Job Level#32,Company Size#33,Company Tenure#34,Remote Work#35,Leadership Opportunities#36,Innovation Opportunities#37,Company Reputation#38,Employee Recognition#39,Attrition#40] csv


In [None]:
train_df.show(10)

In [None]:
columns_to_drop = ['Employee ID','Age', 'Years at Company', 'Monthly Income', 'Number of Promotions', 
                      'Distance from Home', 'Number of Dependents', 'Company Tenure', 'numerical_features', 'scaled_numerical_features']

for column in columns_to_drop:
    train_df = train_df.drop(column)

train_df.show(20)

In [None]:
columns_to_keep = [
    'Gender', 'Job Role', 'Work-Life Balance', 'Job Satisfaction', 'Performance Rating', 'Overtime',
    'Education Level', 'Marital Status', 'Job Level', 'Company Size', 'Remote Work', 'Leadership Opportunities',
    'Innovation Opportunities', 'Company Reputation', 'Employee Recognition', 'Attrition', 'Age Scaled', 
    'Years at Company Scaled', 'Monthly Income Scaled', 'Number of Promotions Scaled', 
    'Distance from Home Scaled', 'Number of Dependents Scaled', 'Company Tenure Scaled'
    ]

# Convert the final PySpark DataFrames (train/test) to Pandas
pandas_train_df = train_df.select(columns_to_keep)
pandas_train_df.toPandas()
#pandas_test_df = test_df.select(columns_to_keep).toPandas()

# Extract features and labels for train/test split
#X_train = pandas_train_df.drop('Attrition', axis=1)  # Assuming 'Attrition' is the target column
#y_train = pandas_train_df['Attrition']  # Target variable (assuming binary classification)

#X_test = pandas_test_df.drop('Attrition', axis=1)
#y_test = pandas_test_df['Attrition']

#print(X_train)

### Build and Train the Models

K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Initialize and train the KNN model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Make predictions
#y_pred = knn.predict(X_test)

# Evaluate the model
#accuracy = accuracy_score(y_test, y_pred)
#print(f'Accuracy: {accuracy:.4f}')

Logistic Regression

In [None]:
# Initialize and train the Logistic Regression model

Decision Trees

In [None]:
# Initialize and train the Decision Trees model