In [None]:
import io

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from google.colab import files

USE_LOG_TARGET = False   # set True if salaries span many orders of magnitude (helps sometimes)
RANDOM_STATE = 42        # seed passed to train_test_split
DEFAULT_CSV = "Salary_Data.csv"   # path read from disk when nothing usable was uploaded

# files.upload() returns a dict of {file name: file bytes} for this upload.
# Read the bytes that were just uploaded instead of a hard-coded path, so a file
# with a different name (or a re-upload saved under a suffixed name) is not
# silently replaced by an older Salary_Data.csv already on disk.
uploaded = files.upload()
uploaded_csvs = [name for name in uploaded if name.lower().endswith('.csv')]

if DEFAULT_CSV in uploaded:
    chosen_name = DEFAULT_CSV
elif uploaded_csvs:
    chosen_name = uploaded_csvs[0]
else:
    chosen_name = None

if chosen_name is not None:
    df = pd.read_csv(io.BytesIO(uploaded[chosen_name]))
else:
    # Nothing (or no CSV) was uploaded: fall back to the file on disk
    df = pd.read_csv(DEFAULT_CSV)

# -----------------------
# BASIC CLEANING
# -----------------------
# Excel exports can append empty trailing columns whose names start with
# "Unnamed"; keep every column that does not match
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]

# Defensive: discard columns with no data at all, then every row that still
# has a missing value (imputation would be the alternative to dropping rows)
df = df.dropna(axis=1, how='all').dropna()

# Defensive: salaries may arrive as text with thousands separators
# (e.g. "90,000"); remove the commas and store the column as float
salary_text = df['Salary'].astype(str)
df['Salary'] = salary_text.str.replace(',', '').astype(float)

# Defensive: strip surrounding whitespace in every object-dtype column
text_columns = df.select_dtypes(include=['object']).columns
for c in text_columns:
    df[c] = df[c].str.strip()


# Columns that get one-hot encoded; every remaining column other than the
# 'Salary' target is treated as a numeric feature
cat_cols = ['Gender', 'Education Level', 'Job Title']
non_numeric = set(cat_cols) | {'Salary'}
num_cols = [name for name in df.columns if name not in non_numeric]

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)

# -----------------------
# OPTIONAL: log-transform target to stabilize variance
# -----------------------
y = df['Salary'].copy()
if USE_LOG_TARGET:
    y = np.log1p(y)   # log(1 + salary)

X = df.drop('Salary', axis=1)

# -----------------------
# TRAIN/TEST SPLIT
# -----------------------
# Split the raw rows BEFORE fitting any preprocessing. Fitting the scaler and
# encoder on the full dataset would let test-set statistics (means, standard
# deviations, category lists) leak into training and inflate the test metrics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# -----------------------
# PREPROCESSING (fitted on the training rows only)
# -----------------------
# handle_unknown='ignore': a category absent from the training rows is encoded
# as all zeros instead of raising at transform time
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
scaler = StandardScaler()

preprocessor = ColumnTransformer(transformers=[
    ('cat', ohe, cat_cols),
    ('num', scaler, num_cols)
], remainder='drop', sparse_threshold=0)

# Learn encoder/scaler parameters from the training rows, then apply the same
# fitted transformation to the held-out rows
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full feature matrix through the train-fitted preprocessor; kept so any code
# that still refers to X_pre keeps working
X_pre = preprocessor.transform(X)

# Width of the preprocessed feature matrix (one-hot columns + scaled numerics)
input_dim = X_train.shape[1]

# Fully connected regression network: 128 -> 64 -> 32 -> 1 with dropout after
# the first two hidden layers. The input shape is declared with an explicit
# Input object; passing input_dim= to the first Dense layer is the legacy form
# and triggers a warning on current Keras versions.
model = Sequential([
    Input(shape=(input_dim,)),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.15),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')   # single linear unit: unbounded regression output
])

# Optimize mean squared error; also track mean absolute error during training
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Stop once val_loss has not improved for 15 epochs and roll back to the
# weights from the best epoch
es = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# validation_split=0.1 holds out 10% of the training data for the val_loss
# that early stopping monitors
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=100,
    batch_size=8,
    callbacks=[es],
    verbose=1
)

# -----------------------
# PLOT TRAINING LOSS
# -----------------------
# One curve per recorded series in the fit history: (legend label, history key)
loss_curves = (('train_loss', 'loss'), ('val_loss', 'val_loss'))

plt.figure(figsize=(8, 5))
for curve_label, history_key in loss_curves:
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('MSE Loss')
plt.grid(True)
plt.legend()
plt.show()

# -----------------------
# EVALUATION ON TEST SET
# -----------------------
y_pred_test = model.predict(X_test).flatten()

# Report on the salary scale: when the target was log1p-transformed, undo it
# with expm1 for both the true values and the predictions
y_test_disp = np.expm1(y_test) if USE_LOG_TARGET else y_test
y_pred_disp = np.expm1(y_pred_test) if USE_LOG_TARGET else y_pred_test

mae = mean_absolute_error(y_test_disp, y_pred_disp)
mse = mean_squared_error(y_test_disp, y_pred_disp)
r2 = r2_score(y_test_disp, y_pred_disp)

print(f"\nTest MAE: {mae:.2f}")
print(f"Test MSE: {mse:.2f}")
print(f"Test R2 : {r2:.4f}")

# Side-by-side table of the first (up to 10) test rows: actual vs rounded prediction
sample_n = min(10, len(y_test_disp))
actual_head = y_test_disp[:sample_n].values
predicted_head = np.round(y_pred_disp[:sample_n])
sample_results = pd.DataFrame({
    'Actual': actual_head,
    'Predicted': predicted_head
})
print("\n--- Sample Test Predictions ---")
print(sample_results.reset_index(drop=True))

# Scatter of actual vs predicted; the dashed red diagonal spans the range of
# actual values and marks where prediction equals actual
diagonal = [y_test_disp.min(), y_test_disp.max()]
plt.figure(figsize=(6, 6))
plt.scatter(y_test_disp, y_pred_disp, alpha=0.6)
plt.plot(diagonal, diagonal, 'r--')
plt.title('Actual vs Predicted (Test)')
plt.xlabel('Actual Salary')
plt.ylabel('Predicted Salary')
plt.grid(True)
plt.show()




In [None]:
# -----------------------
# USER INPUT PREDICTION (robust to unseen categories)
# -----------------------
print("\nEnter Employee Details for Prediction (type exactly as dataset categories):")
# read values - you can also replace input() with fixed test values for testing
age = float(input("Age: ").strip())
gender = input("Gender (e.g. Male/Female): ").strip()
edu = input("Education Level (e.g. Bachelor's/Master's/PhD): ").strip()
job = input("Job Title: ").strip()
exp = float(input("Years of Experience: ").strip())

# One row holding every column the preprocessor selects; numeric columns start
# at 0 and are overwritten below where the user supplied a value
user_row = {'Gender': gender, 'Education Level': edu, 'Job Title': job}
user_row.update({c: 0 for c in num_cols})

# Place the two numeric answers by column name when that column exists;
# otherwise use the next numeric column not reserved for a named input.
# Each answer gets its own column, so one value can never overwrite the other,
# and an empty num_cols produces a warning instead of an IndexError.
named_inputs = {'Age': age, 'Years of Experience': exp}
spare_cols = [c for c in num_cols if c not in named_inputs]
for input_name, value in named_inputs.items():
    if input_name in num_cols:
        user_row[input_name] = value
    elif spare_cols:
        user_row[spare_cols.pop(0)] = value  # positional fallback
    else:
        print(f"Warning: no numeric column available for '{input_name}'; value ignored.")

user_df = pd.DataFrame([user_row])

# The encoder ignores categories it was not fitted on (all-zero encoding), so a
# typo would otherwise silently give a meaningless prediction: tell the user
fitted_encoder = preprocessor.named_transformers_['cat']
for cat_col, known_values in zip(cat_cols, fitted_encoder.categories_):
    if user_row[cat_col] not in set(known_values):
        print(f"Warning: {cat_col} '{user_row[cat_col]}' was not seen during training; "
              "it is encoded as all zeros and the prediction may be unreliable.")

# The preprocessor selects columns by name, using the same fitted encoder/scaler
user_pre = preprocessor.transform(user_df)
user_pred = model.predict(user_pre).flatten()[0]

# Undo the log1p target transform so the result is on the salary scale
if USE_LOG_TARGET:
    user_pred = np.expm1(user_pred)

print(f"\nPredicted Salary for the entered employee: {round(user_pred)}")