In [None]:
import pandas as pd

# Read adult.csv from the working directory into a DataFrame, then preview
# the first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer="adult.csv")
data.head(n=5)

In [None]:
# Clean and preprocess
# Relabel the '?' placeholder as its own 'Others' category. The result is
# assigned back to the column rather than calling replace(..., inplace=True)
# on a column selection: that chained in-place form warns on pandas 2.x and
# leaves `data` unchanged once Copy-on-Write is enabled.
for column in ('workclass', 'occupation'):
    data[column] = data[column].replace({'?': 'Others'})

# Drop rows belonging to the two excluded workclass categories.
data = data[~data['workclass'].isin(['Without-pay', 'Never-worked'])]

# Outlier removal: keep ages 17..75 and educational-num 5..16 (both bounds
# inclusive, which is the default for Series.between).
data = data[data['age'].between(17, 75)]
data = data[data['educational-num'].between(5, 16)]
data = data.drop(columns=['education'])  # Drop redundant feature


In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical feature column.
# Each column gets its own LabelEncoder, stored in `encoders`, so the
# category -> code mapping stays available after this cell (for example
# encoders['workclass'].classes_, or .transform() on new input). Refitting a
# single shared encoder would discard every mapping except the last one.
categorical_columns = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
]

encoders = {}
for column in categorical_columns:
    # `encoder` ends up bound to the native-country encoder, the same state
    # the name had when one shared instance was fitted on that column last.
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    encoders[column] = encoder


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Features and target
x = data.drop(columns=['income'])
y = data['income']

# Hold out 20% of the rows for evaluation; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train model
# The forest is seeded as well as the split: without random_state the
# bootstrap samples and feature subsets differ on every run, so the reported
# accuracy and the model that gets saved would not be reproducible.
# (RandomForestClassifier is already imported at the top of this cell.)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42))
])
pipe.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = pipe.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


In [None]:
import joblib

# Serialize the fitted pipeline (scaler + classifier) to disk with joblib.
joblib.dump(value=pipe, filename='salary_model.pkl')


In [None]:
import gradio as gr
import joblib
import numpy as np

# Restore the pipeline from the joblib file in the working directory.
# joblib files are pickle-based: only load files from a trusted source.
model = joblib.load(filename="salary_model.pkl")

def predict_salary(age, workclass, fnlwgt, marital_status, occupation,
                   relationship, race, gender, capital_gain, capital_loss,
                   hours_per_week, native_country, educational_num):
    """Predict the income bracket for one person using the loaded `model`.

    All arguments are numeric; categorical fields are expected as the
    integer codes used when the model was trained.

    Returns:
        "Income >50K" or "Income <=50K".

    Raises:
        ValueError: if any input is None (for example an empty form field).
    """
    # Key every value by its training column name so the row can be put in
    # the order the model was fitted with.
    # NOTE(review): 'fnlwgt', 'capital-gain', 'capital-loss' and
    # 'hours-per-week' are assumed column names; confirm against the CSV.
    features = {
        "age": age,
        "workclass": workclass,
        "fnlwgt": fnlwgt,
        "marital-status": marital_status,
        "occupation": occupation,
        "relationship": relationship,
        "race": race,
        "gender": gender,
        "capital-gain": capital_gain,
        "capital-loss": capital_loss,
        "hours-per-week": hours_per_week,
        "native-country": native_country,
        "educational-num": educational_num,
    }

    missing = [name for name, value in features.items() if value is None]
    if missing:
        raise ValueError("Missing value for: " + ", ".join(missing))

    # The function signature lists educational_num last, but the training
    # frame keeps the CSV column order. Follow the order recorded by the
    # fitted model when its names match; otherwise fall back to the
    # positional order of the signature.
    trained_columns = getattr(model, "feature_names_in_", None)
    if trained_columns is not None and set(trained_columns) == set(features):
        ordered = [features[name] for name in trained_columns]
    else:
        ordered = list(features.values())

    input_data = np.array([ordered])
    prediction = model.predict(input_data)[0]

    # The target may be numeric (1 = high income) or the raw text label
    # (the notebook does not encode `income`), so accept both forms.
    is_high_income = prediction == 1 or str(prediction).strip().startswith(">50K")
    return "Income >50K" if is_high_income else "Income <=50K"

# One numeric field per model feature. The labels are listed in the same
# order as predict_salary's parameters, because Gradio passes the component
# values to the function positionally.
inputs = [
    gr.Number(label=caption)
    for caption in (
        "Age",
        "Workclass (Encoded)",
        "FNLWGT",
        "Marital Status (Encoded)",
        "Occupation (Encoded)",
        "Relationship (Encoded)",
        "Race (Encoded)",
        "Gender (0=Female, 1=Male)",
        "Capital Gain",
        "Capital Loss",
        "Hours Per Week",
        "Native Country (Encoded)",
        "Educational Number",
    )
]

# Build the web form around predict_salary, then start serving it.
demo = gr.Interface(
    fn=predict_salary,
    inputs=inputs,
    outputs=gr.Text(label="Predicted Income"),
    title="Employee Salary Prediction",
)
demo.launch()
