<a href="https://colab.research.google.com/github/msafwanktl/Project2025/blob/main/project2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

AI-Driven Job Market & Skill Demand Analyzer

In [1]:
import pandas as pd

df = pd.read_csv("/ds_salaries.csv")
df.job_title.value_counts().head(5)

Unnamed: 0_level_0,count
job_title,Unnamed: 1_level_1
Data Scientist,143
Data Engineer,132
Data Analyst,97
Machine Learning Engineer,41
Research Scientist,16


Pandas is used for Table contents

In [2]:
df = df.drop(columns=["Unnamed: 0"])
df.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

unwanted columns deleting

In [3]:
df.salary_in_usd.min(), df.salary_in_usd.mean(), df.salary_in_usd.max()

(2859, np.float64(112297.86985172982), 600000)

In [4]:
df.company_location.value_counts().head(10)

Unnamed: 0_level_0,count
company_location,Unnamed: 1_level_1
US,355
GB,47
CA,30
DE,28
IN,24
FR,15
ES,14
GR,11
JP,6
NL,4


In [5]:
df.groupby("job_title")["salary_in_usd"].mean().round(0).sort_values(ascending=False).head(10)

Unnamed: 0_level_0,salary_in_usd
job_title,Unnamed: 1_level_1
Data Analytics Lead,405000.0
Principal Data Engineer,328333.0
Financial Data Analyst,275000.0
Principal Data Scientist,215242.0
Director of Data Science,195074.0
Data Architect,177874.0
Applied Data Scientist,175655.0
Analytics Engineer,175000.0
Data Specialist,165000.0
Head of Data,160163.0


Average Salary Based on Top 10 Different Jobs in Ascending Order

In [6]:
df.groupby("experience_level")["salary_in_usd"].mean()

Unnamed: 0_level_0,salary_in_usd
experience_level,Unnamed: 1_level_1
EN,61643.318182
EX,199392.038462
MI,87996.056338
SE,138617.292857


Average Salary based on Experience Level

In [7]:
df.salary_in_usd.describe()

Unnamed: 0,salary_in_usd
count,607.0
mean,112297.869852
std,70957.259411
min,2859.0
25%,62726.0
50%,101570.0
75%,150000.0
max,600000.0


In [8]:
df.isnull().sum()

Unnamed: 0,0
work_year,0
experience_level,0
employment_type,0
job_title,0
salary,0
salary_currency,0
salary_in_usd,0
employee_residence,0
remote_ratio,0
company_location,0


Checking how many Null Values in every columns

In [9]:
df.select_dtypes(include="object").columns

Index(['experience_level', 'employment_type', 'job_title', 'salary_currency',
       'employee_residence', 'company_location', 'company_size'],
      dtype='object')

Select out the columns that have the catogery type or object type

In [10]:
experience_map = {
    "EN": "Entry",
    "MI": "Mid",
    "SE": "Senior",
    "EX": "Executive"
}

df["experience_level"] = df["experience_level"].map(experience_map)
df.experience_level

Unnamed: 0,experience_level
0,Mid
1,Senior
2,Senior
3,Mid
4,Senior
...,...
602,Senior
603,Senior
604,Senior
605,Senior


Cleaning Experience level Datas

In [11]:
df.experience_level.value_counts()

Unnamed: 0_level_0,count
experience_level,Unnamed: 1_level_1
Senior,280
Mid,213
Entry,88
Executive,26


In [12]:
employment_map = {
    "FT": "Full-Time",
    "PT": "Part-Time",
    "CT": "Contract",
    "FL": "Freelance"
}

df["employment_type"] = df["employment_type"].map(employment_map)

Cleaning Employment Type datas

In [13]:
df.employment_type.value_counts()

Unnamed: 0_level_0,count
employment_type,Unnamed: 1_level_1
Full-Time,588
Part-Time,10
Contract,5
Freelance,4


In [14]:
df["work_year"] = df["work_year"].astype(str)

In [15]:
type("work_year")

str

Converting the type of the column

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           607 non-null    object
 1   experience_level    607 non-null    object
 2   employment_type     607 non-null    object
 3   job_title           607 non-null    object
 4   salary              607 non-null    int64 
 5   salary_currency     607 non-null    object
 6   salary_in_usd       607 non-null    int64 
 7   employee_residence  607 non-null    object
 8   remote_ratio        607 non-null    int64 
 9   company_location    607 non-null    object
 10  company_size        607 non-null    object
dtypes: int64(3), object(8)
memory usage: 52.3+ KB


In [17]:
x = df[
    [
        "experience_level",
        "employment_type",
        "job_title",
        "company_location",
        "company_size",
        "remote_ratio",
        "work_year"
    ]
]

y = df["salary_in_usd"]

In [18]:
x.dtypes

Unnamed: 0,0
experience_level,object
employment_type,object
job_title,object
company_location,object
company_size,object
remote_ratio,int64
work_year,object


In [19]:
x_encoded = pd.get_dummies(x, drop_first = True)

In [20]:
x_encoded.shape

(607, 109)

In [21]:
x_encoded.dtypes

Unnamed: 0,0
remote_ratio,int64
experience_level_Executive,bool
experience_level_Mid,bool
experience_level_Senior,bool
employment_type_Freelance,bool
...,...
company_location_VN,bool
company_size_M,bool
company_size_S,bool
work_year_2021,bool


In [22]:
x_encoded.head()

Unnamed: 0,remote_ratio,experience_level_Executive,experience_level_Mid,experience_level_Senior,employment_type_Freelance,employment_type_Full-Time,employment_type_Part-Time,job_title_AI Scientist,job_title_Analytics Engineer,job_title_Applied Data Scientist,...,company_location_SG,company_location_SI,company_location_TR,company_location_UA,company_location_US,company_location_VN,company_size_M,company_size_S,work_year_2021,work_year_2022
0,0,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,False,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,50,False,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,0,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,50,False,False,True,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [23]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_encoded,
    y,
    test_size=0.2,
    random_state=42
)

In [24]:
x_train.shape, x_test.shape

((485, 109), (122, 109))

In [25]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x_train, y_train)

In [26]:
y_pred = model.predict(x_test)
y_pred[:5]

array([145637.7631664 , 145637.7631664 ,  73188.01969571, 186577.08659242,
       129974.46628183])

In [27]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

rmse, r2

(np.float64(46258.0565083062), 0.4416779728329029)

In [28]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(x_train, y_train)

In [29]:
y_pred_rf = rf_model.predict(x_test)
y_pred_rf[:5]

array([141523.82597463, 141523.82597463, 106718.44893939, 126607.80632035,
        97931.50569444])

In [30]:
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

rmse_rf, r2_rf

(np.float64(49345.347922591216), 0.3646655156332662)

In [31]:
import pandas as pd

feature_importance = pd.Series(
    rf_model.feature_importances_,
    index=x_encoded.columns
).sort_values(ascending=False)

feature_importance.head(10)

Unnamed: 0,0
company_location_US,0.295973
experience_level_Executive,0.082901
job_title_Principal Data Engineer,0.045562
experience_level_Senior,0.044371
job_title_Data Analyst,0.040474
remote_ratio,0.040255
company_size_S,0.03576
job_title_Machine Learning Scientist,0.030781
job_title_Financial Data Analyst,0.030507
job_title_Research Scientist,0.028568
