<a href="https://colab.research.google.com/github/md-shadab2955/SHADAB-DEMO/blob/main/Salary_Prediction_Intern_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Basic libraries for handling data
import pandas as pd
import numpy as np

# Libraries for ML model and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [3]:
# Read the raw applicant/salary data from the CSV next to the notebook
csv_path = "expected_ctc.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows to get a feel for the columns
df.head(n=5)


Unnamed: 0,IDX,Applicant_ID,Total_Experience,Total_Experience_in_field_applied,Department,Role,Industry,Organization,Designation,Education,...,Curent_Location,Preferred_location,Current_CTC,Inhand_Offer,Last_Appraisal_Rating,No_Of_Companies_worked,Number_of_Publications,Certifications,International_degree_any,Expected_CTC
0,1,22753,0,0,,,,,,PG,...,Guwahati,Pune,0,N,,0,0,0,0,384551
1,2,51087,23,14,HR,Consultant,Analytics,H,HR,Doctorate,...,Bangalore,Nagpur,2702664,Y,Key_Performer,2,4,0,0,3783729
2,3,38413,21,12,Top Management,Consultant,Training,J,,Doctorate,...,Ahmedabad,Jaipur,2236661,Y,Key_Performer,5,3,0,0,3131325
3,4,11501,15,8,Banking,Financial Analyst,Aviation,F,HR,Doctorate,...,Kanpur,Kolkata,2100510,N,C,5,3,0,0,2608833
4,5,58941,10,5,Sales,Project Manager,Insurance,E,Medical Officer,Grad,...,Ahmedabad,Ahmedabad,1931644,N,C,2,3,0,0,2221390


In [4]:
# Dataset size as (number of rows, number of columns)
n_rows, n_cols = df.shape
(n_rows, n_cols)


(25000, 29)

In [5]:
# Checking column types and missing values:
# df.info() prints, per column, its dtype and non-null count. Any column whose
# non-null count is lower than the total number of entries has missing values.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 29 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   IDX                                25000 non-null  int64  
 1   Applicant_ID                       25000 non-null  int64  
 2   Total_Experience                   25000 non-null  int64  
 3   Total_Experience_in_field_applied  25000 non-null  int64  
 4   Department                         22222 non-null  object 
 5   Role                               24037 non-null  object 
 6   Industry                           24092 non-null  object 
 7   Organization                       24092 non-null  object 
 8   Designation                        21871 non-null  object 
 9   Education                          25000 non-null  object 
 10  Graduation_Specialization          18820 non-null  object 
 11  University_Grad                    18820 non-null  obj

In [9]:
# Dropping the identifier columns; they are row labels, not features.
# errors='ignore' keeps the cell safe to re-run once the columns are gone.
# The result is rebound to `df` instead of using inplace=True, so the frame
# object displayed by earlier cells is not mutated behind the reader's back.
df = df.drop(columns=['IDX', 'Applicant_ID'], errors='ignore')


In [11]:
# Handling missing values in a safe way
# Non-numeric (text) columns -> 'Unknown'
# Numeric columns            -> median of that column
#
# The branch is chosen with is_numeric_dtype rather than `dtype == 'object'`:
# pandas can hold text in a dedicated string dtype that does not compare equal
# to 'object', and such a column would otherwise fall through to .median()
# and raise.
# NOTE(review): the medians are computed over the whole dataset (this cell runs
# before the train/test split, and the loop also visits the target column).
# Consider imputing after the split using training-set statistics only.

for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].median())
    else:
        df[col] = df[col].fillna("Unknown")


In [12]:
# Machine learning models cannot work with text values
# Converting every non-numeric (categorical) column into integer codes
#
# A separate LabelEncoder is fitted for each column and stored in `encoders`
# (column name -> fitted encoder). Re-using one encoder for all columns would
# overwrite its learned classes on every iteration, leaving no way to map the
# codes back to labels or to encode new data consistently.
# After the loop, `le` refers to the encoder of the last encoded column.
# NOTE(review): label encoding assigns arbitrary integer order to categories;
# for the linear model a one-hot encoding may be more appropriate.

encoders = {}

for col in df.columns:
    # Numeric columns are already model-ready; skip them
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le


In [13]:
# Separate the target (expected salary) from the input features
target_col = 'Expected_CTC'

# y: the salary column the models learn to predict
y = df[target_col]
# X: every remaining column, used as model input
X = df.drop(columns=[target_col])


In [14]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# A fixed random_state keeps the split reproducible between runs.

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [15]:
# Baseline: ordinary least-squares regression fitted on the training split
# (fit() returns the estimator itself, so creation and training are chained)
lr = LinearRegression().fit(X_train, y_train)

# Salary predictions for the held-out test rows
y_pred_lr = lr.predict(X_test)

# Report MAE, RMSE (square root of the MSE) and R2 on the test split
lr_mse = mean_squared_error(y_test, y_pred_lr)
print("Linear Regression Results")
for label, score in (
    ("MAE:", mean_absolute_error(y_test, y_pred_lr)),
    ("RMSE:", np.sqrt(lr_mse)),
    ("R2 Score:", r2_score(y_test, y_pred_lr)),
):
    print(label, score)


Linear Regression Results
MAE: 120847.95376777586
RMSE: 157698.22676937762
R2 Score: 0.9816124460089043


In [16]:
# Random Forest: chosen because it copes with non-linear relationships better
# than the linear baseline. Hyper-parameters are collected in one place.
rf_params = {
    "n_estimators": 100,   # number of trees in the forest
    "random_state": 42,    # fixed seed for reproducible results
    "n_jobs": -1,          # use all available CPU cores
}
rf = RandomForestRegressor(**rf_params)

# Fit on the training split
rf.fit(X_train, y_train)

# Predict salaries for the held-out test rows
y_pred_rf = rf.predict(X_test)

# Report the same three metrics as for the baseline model
rf_scores = {
    "MAE:": mean_absolute_error(y_test, y_pred_rf),
    "RMSE:": np.sqrt(mean_squared_error(y_test, y_pred_rf)),
    "R2 Score:": r2_score(y_test, y_pred_rf),
}
print("Random Forest Results")
for metric_name, metric_value in rf_scores.items():
    print(metric_name, metric_value)


Random Forest Results
MAE: 13055.92999
RMSE: 25327.38513397093
R2 Score: 0.9995257022659284
