# Linear Regression model of employee monthly income (HR attrition dataset)

In [2]:
import os

import pandas as pd
from pyspark.sql import SparkSession
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Initialize a Spark session.
# getOrCreate() returns the already-running session if there is one, so this
# cell can be re-run without starting a second session.
spark = SparkSession.builder \
    .appName("HR Analytics") \
    .getOrCreate()

# File location and type.
# The dataset path can be overridden with the HR_ATTRITION_CSV environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original local path is used unchanged.
file_location = os.environ.get(
    "HR_ATTRITION_CSV",
    r"C:\Users\HP\Downloads\Big Data Analytics\Data Science Projects\Data Science Internship\Project 3 - HR Analytics-20231111T083906Z-001\Project 3 - HR Analytics\Data P3 MeriSKILL\HR-Employee-Attrition.csv",
)
file_type = "csv"

# Read the CSV file into a Spark DataFrame.
# header=true    -> the first row supplies the column names.
# inferSchema=true -> column types are inferred from the data instead of
#                     every column being read as a string.
df = spark.read.format(file_type) \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .load(file_location)

# Select the relevant columns (explicit list documents exactly which fields
# the rest of the notebook relies on; select() fails if one is missing).
columns = ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome',
           'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction',
           'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
           'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime',
           'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
           'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
           'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

df = df.select(columns)

In [4]:
# Convert the Spark DataFrame to a Pandas DataFrame so it can be fed to
# scikit-learn (this collects all rows into local memory).
pdf = df.toPandas()

# Encode categorical variables as integer codes.
# A separate LabelEncoder is fitted for each column and stored in `encoders`,
# so every column's code -> label mapping stays available afterwards
# (encoders[col].classes_ / encoders[col].inverse_transform). Re-fitting a
# single shared encoder would keep only the mapping of the last column.
# The encoded values themselves are the same as with a shared encoder.
# NOTE(review): the integer codes impose an ordering on nominal columns such
# as Department or JobRole, which a linear model treats as numeric magnitude;
# consider one-hot encoding those columns — confirm before interpreting their
# coefficients.
categorical_columns = ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
                       'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    pdf[column] = le.fit_transform(pdf[column])
    encoders[column] = le

# Split the data into features (X) and target (y).
# The regression target is MonthlyIncome; every other selected column,
# including the encoded Attrition flag, is used as a feature.
X = pdf.drop(['MonthlyIncome'], axis=1)
y = pdf['MonthlyIncome']

# Split the data into training and testing sets (70% / 30%).
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
# Fit a linear regression on the training split (fit() returns the estimator).
lr_model = LinearRegression().fit(X_train, y_train)

# Score the held-out test split.
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Collect both metrics into a two-row summary table.
metrics_df = pd.DataFrame(
    [('Mean Squared Error', mse), ('R-squared', r2)],
    columns=['Metric', 'Value'],
)

In [6]:
# Show the evaluation metrics table.
print(metrics_df)

# Pair each feature name with its fitted coefficient; coef_ follows the
# column order of the training features, which is the column order of X.
coefficients_df = pd.DataFrame(
    list(zip(X.columns, lr_model.coef_)),
    columns=['Feature', 'Coefficient'],
)

# Print the coefficient table under its heading.
print("\nCoefficients:")
print(coefficients_df)

               Metric         Value
0  Mean Squared Error  2.146123e+06
1           R-squared  8.938893e-01

Coefficients:
                     Feature   Coefficient
0                        Age -8.034155e+00
1                  Attrition -8.223573e+00
2             BusinessTravel  9.690321e+01
3                  DailyRate  3.177521e-02
4                 Department -5.986722e+02
5           DistanceFromHome -1.170793e+01
6                  Education  2.762346e+00
7             EducationField -7.600703e-01
8              EmployeeCount -5.684342e-14
9             EmployeeNumber  6.847862e-02
10   EnvironmentSatisfaction -5.831251e+01
11                    Gender -7.688857e+01
12                HourlyRate  2.141767e+00
13            JobInvolvement -1.056614e+01
14                  JobLevel  3.950741e+03
15                   JobRole  7.482872e+01
16           JobSatisfaction -3.157529e+01
17             MaritalStatus  2.048638e+01
18               MonthlyRate -1.652952e-03
19        NumComp