In [13]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Load the dataset
df = pd.read_csv('salary_dataset.csv')

In [27]:
# 1. Handle missing values
# Step 1: Identify Outliers using the IQR Method

Q1 = df['salary'].quantile(0.25)
Q3 = df['salary'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds (1.5 * IQR is a common threshold for detecting outliers)
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Step 2: Remove rows with salary values outside the IQR bounds (outliers)
df = df[(df['salary'] >= lower_bound) & (df['salary'] <= upper_bound)]

# Impute numeric columns with the mean
numeric_cols = ['age', 'experience', 'salary']
imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Impute categorical columns with the most frequent value
categorical_cols = ['sex', 'education', 'designation']
imputer_cat = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

               age   experience         salary
count  7392.000000  7392.000000    7392.000000
mean     33.624442     7.726444  111600.452279
std       7.248476     5.947425   56104.726040
min      21.000000     0.000000   20000.000000
25%      28.000000     3.000000   60000.000000
50%      33.000000     6.000000  110000.000000
75%      37.000000    11.000000  160000.000000
max      62.000000    34.000000  300000.000000


In [None]:
# 2. One-Hot Encoding for categorical variables
# We use OneHotEncoder to create binary columns for categorical features
encoder = OneHotEncoder(drop='first', sparse=False)  # drop='first' avoids multicollinearity
encoded_cols = pd.DataFrame(encoder.fit_transform(df[categorical_cols]), 
                            columns=encoder.get_feature_names_out(categorical_cols))

# Drop original categorical columns and concatenate encoded columns
df = df.drop(categorical_cols, axis=1)
df = pd.concat([df, encoded_cols], axis=1)

In [None]:
# 3. Scaling numeric columns (Standardization)
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [31]:
# Inspect the preprocessed data
df = df[df['salary'] > 10000]
print(df.describe())
df.to_csv("preprocessed_salary_dataset.csv", index=False)

               age   experience         salary
count  7392.000000  7392.000000    7392.000000
mean     33.624442     7.726444  111600.452279
std       7.248476     5.947425   56104.726040
min      21.000000     0.000000   20000.000000
25%      28.000000     3.000000   60000.000000
50%      33.000000     6.000000  110000.000000
75%      37.000000    11.000000  160000.000000
max      62.000000    34.000000  300000.000000


In [69]:
# # Step 2: Splitting the Dataset

# # Define X (features) and y (target)
# X = df.drop('salary', axis=1)  # Features: all columns except salary
# y = df['salary']  # Target: salary

# # Split the data into training and testing sets (80-20 split)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Check the shapes to ensure the split is correct
# X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# # Step 3: Linear Regression Model

# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score

# # 1. Initialize the Linear Regression model
# model = LinearRegression()

# # 2. Train the model
# model.fit(X_train, y_train)

# # 3. Make predictions on the test set
# y_pred = model.predict(X_test)

# # 4. Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f"Mean Squared Error: {mse}")
# print(f"R-squared: {r2}")
