In [1]:
# Step 1: Import Libraries & Load Data
import os
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
# The CSV location can be overridden with the AI_JOBS_CSV environment variable so the
# notebook is not tied to one machine. When the variable is unset or empty, the
# original local path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get('AI_JOBS_CSV') or (
    r'C:\Users\Ashita SHARMA\Desktop\Datascience Project\datasets\adoption 3.2.csv'
)
df = pd.read_csv(DATA_PATH)

In [3]:
# 2. Initial Data Inspection
# Preview the first five rows.
print(df.head())
# Display data types and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called once and not wrapped in print() (which would repeat the report and then
# print a stray "None").
df.info()


    job_id              job_title  salary_usd salary_currency  \
0  AI00001  AI Research Scientist       90376             USD   
1  AI00002   AI Software Engineer       61895             USD   
2  AI00003          AI Specialist      152626             USD   
3  AI00004           NLP Engineer       80215             USD   
4  AI00005          AI Consultant       54624             EUR   

  experience_level employment_type company_location company_size  \
0               SE              CT            China            M   
1               EN              CT           Canada            M   
2               MI              FL      Switzerland            L   
3               SE              FL            India            M   
4               EN              PT           France            S   

  employee_residence  remote_ratio  \
0              China            50   
1            Ireland           100   
2        South Korea             0   
3              India            50   
4         

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
numeric_summary = df.describe()
print(numeric_summary)

          salary_usd  remote_ratio  years_experience  job_description_length  \
count   15000.000000  15000.000000      15000.000000            15000.000000   
mean   115348.965133     49.483333          6.253200             1503.314733   
std     60260.940438     40.812712          5.545768              576.127083   
min     32519.000000      0.000000          0.000000              500.000000   
25%     70179.750000      0.000000          2.000000             1003.750000   
50%     99705.000000     50.000000          5.000000             1512.000000   
75%    146408.500000    100.000000         10.000000             2000.000000   
max    399095.000000    100.000000         19.000000             2499.000000   

       benefits_score  
count    15000.000000  
mean         7.504273  
std          1.450870  
min          5.000000  
25%          6.200000  
50%          7.500000  
75%          8.800000  
max         10.000000  


In [5]:
# Count the missing entries in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

job_id                    0
job_title                 0
salary_usd                0
salary_currency           0
experience_level          0
employment_type           0
company_location          0
company_size              0
employee_residence        0
remote_ratio              0
required_skills           0
education_required        0
years_experience          0
industry                  0
posting_date              0
application_deadline      0
job_description_length    0
benefits_score            0
company_name              0
dtype: int64


In [6]:
# Count the rows that exactly repeat an earlier row
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

0


In [7]:

# Drop every row that contains at least one missing value.
# The cleaned frame is bound back to `df`, so later cells keep using the same name.
df = df.dropna()

In [8]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
# The de-duplicated frame is bound back to `df`, so later cells keep using the same name.
df = df.drop_duplicates()

In [9]:
# Step 4 & 5: Feature Engineering & Creating Features
print("\n--- Feature Engineering ---")
# Each posting keeps its skills in one comma-separated string, so the strings are split
# and the pieces flattened into a single list before individual skills can be tallied.
all_skills = []
for posting_skills in df['required_skills'].str.split(','):
    # Trim any whitespace surrounding each skill name so variants count as one skill.
    all_skills.extend(name.strip() for name in posting_skills)
skill_counts = Counter(all_skills)
# Keep the 15 most frequent skills as a two-column table for display and plotting.
top_skills = pd.DataFrame(skill_counts.most_common(15), columns=['Skill', 'Count'])
print("Top 15 most frequent skills engineered from 'required_skills' column:")
print(top_skills)



--- Feature Engineering ---
Top 15 most frequent skills engineered from 'required_skills' column:
                 Skill  Count
0               Python   4450
1                  SQL   3407
2           TensorFlow   3022
3           Kubernetes   3009
4                Scala   2794
5              PyTorch   2777
6                Linux   2705
7                  Git   2631
8                 Java   2578
9                  GCP   2442
10              Hadoop   2419
11             Tableau   2341
12                   R   2311
13     Computer Vision   2284
14  Data Visualization   2270


In [10]:
# Step 7: EDA - Statistics Summary
print("\n--- Statistics Summary ---")
# Transposed so each numeric column becomes one row, which is easier to scan.
print(df.describe().T)
# In the run shown below, the mean salary is about $115k USD with a standard deviation
# of about $60k, indicating high salary variance. The median (~$100k) sits below the
# mean, which suggests a right-skewed salary distribution.
# The mean years of experience required is ~6 years (median 5).


--- Statistics Summary ---
                          count           mean           std      min  \
salary_usd              15000.0  115348.965133  60260.940438  32519.0   
remote_ratio            15000.0      49.483333     40.812712      0.0   
years_experience        15000.0       6.253200      5.545768      0.0   
job_description_length  15000.0    1503.314733    576.127083    500.0   
benefits_score          15000.0       7.504273      1.450870      5.0   

                             25%      50%       75%       max  
salary_usd              70179.75  99705.0  146408.5  399095.0  
remote_ratio                0.00     50.0     100.0     100.0  
years_experience            2.00      5.0      10.0      19.0  
job_description_length   1003.75   1512.0    2000.0    2499.0  
benefits_score              6.20      7.5       8.8      10.0  


In [11]:
# Step 9: EDA - Bivariate Analysis
print("\n--- Bivariate Analysis ---")

# Chart 1: Top 15 Most In-Demand AI Skills (Interactive)
# A horizontal bar chart keeps the skill names readable next to their counts.
fig1 = px.bar(top_skills,
              x='Count',
              y='Skill',
              orientation='h',
              title='Top 15 Most In-Demand AI Skills in Job Postings',
              # Plotly Express looks up `labels` by DataFrame column name, not by the
              # axis keyword, so the key must be 'Count' for the x-axis title to change.
              labels={'Count': 'Number of Job Postings', 'Skill': 'Skill'})
# Sort the bars by value so the most frequent skill is drawn at the top.
fig1.update_layout(yaxis={'categoryorder':'total ascending'})
fig1.show()


--- Bivariate Analysis ---


In [12]:
# Step 10: EDA - Multivariate Analysis
print("\n--- Multivariate Analysis ---")

# Chart 3: Geographic Distribution of AI Jobs (Interactive Map)
# Tally the postings per company location, then shade each country by its tally.
location_counts = df['company_location'].value_counts()
top_locations = location_counts.reset_index()
# Columns are renamed by position so the labels below do not depend on the names
# that reset_index() happens to assign.
top_locations.columns = ['Country', 'Job Count']
fig3 = px.choropleth(
    top_locations,
    locations="Country",
    locationmode="country names",  # rows are matched to map regions by country name
    color="Job Count",
    hover_name="Country",
    color_continuous_scale=px.colors.sequential.PuBu,
    title="Global Distribution of AI Job Postings",
)
fig3.show()


--- Multivariate Analysis ---
