In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [2]:
# Initialize a Spark session
spark = SparkSession.builder \
    .appName("HR Analytics") \
    .getOrCreate()

# File location and type
file_location = r"C:\Users\HP\Downloads\Big Data Analytics\Data Science Projects\Data Science Internship\Project 3 - HR Analytics-20231111T083906Z-001\Project 3 - HR Analytics\Data P3 MeriSKILL\HR-Employee-Attrition.csv"
file_type = "csv"

# Read the CSV file into a Spark DataFrame
df = spark.read.format(file_type) \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .load(file_location)

# Select the relevant columns
columns = ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome',
           'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction',
           'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
           'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime',
           'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
           'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
           'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

df = df.select(columns)

# Convert the Spark DataFrame to a Pandas DataFrame
pdf = df.toPandas()

In [6]:
# Encode categorical variables
le = LabelEncoder()
for column in ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']:
    pdf[column] = le.fit_transform(pdf[column])

# Split the data into features (X) and target (y)
X = pdf.drop(['Attrition'], axis=1)
y = pdf['Attrition']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Logistic Regression model
log_reg = LogisticRegression(max_iter=115000)
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
report = classification_report(y_test, y_pred, output_dict=True)
conf_matrix = confusion_matrix(y_test, y_pred)
# Convert classification report to DataFrame
report_df = pd.DataFrame(report).transpose()

# Create a DataFrame to store overall metrics
overall_metrics = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Value': [accuracy, precision, recall, f1]
})

# Display the DataFrames
print("Overall Metrics:")
print(overall_metrics)

print("\nClassification Report:")
print(report_df)

print("\nConfusion Matrix:")
print(conf_matrix)

Overall Metrics:
      Metric     Value
0   Accuracy  0.866213
1  Precision  0.839805
2     Recall  0.866213
3   F1 Score  0.844194

Classification Report:
              precision    recall  f1-score     support
0              0.888620  0.965789  0.925599  380.000000
1              0.535714  0.245902  0.337079   61.000000
accuracy       0.866213  0.866213  0.866213    0.866213
macro avg      0.712167  0.605846  0.631339  441.000000
weighted avg   0.839805  0.866213  0.844194  441.000000

Confusion Matrix:
[[367  13]
 [ 46  15]]
