# DAY-2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
path = "day1_cleaned.csv"
df = pd.read_csv(path)
print(df.head())
print(df.info())

                                      company  \
0                            MM Media Pvt Ltd   
1                          find live infotech   
2         Softtech Career Infosystem Pvt. Ltd   
3                      Onboard HRServices LLP   
4  Spire Technologies and Solutions Pvt. Ltd.   

                                           education   experience  \
0  UG: B.Tech/B.E. - Any Specialization PG:Any Po...    0 - 1 yrs   
1  UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...    0 - 0 yrs   
2  UG: Any Graduate - Any Specialization PG:Any P...    4 - 8 yrs   
3  UG: Any Graduate - Any Specialization PG:CA Do...  11 - 15 yrs   
4  UG: B.Tech/B.E. - Any Specialization PG:Any Po...    6 - 8 yrs   

                                   industry  \
0          Media / Entertainment / Internet   
1  Advertising / PR / MR / Event Management   
2           IT-Software / Software Services   
3    Banking / Financial Services / Broking   
4           IT-Software / Software Services   

       

In [3]:
df.rename(columns = {'joblocation_address': 'location',
                    'jobtitle': 'job_title',
                    'jobdescription': 'description',
                    'numberofpositions': 'num_positions',
                    'payrate': 'salary',
                    'postdate': 'date_posted',
                    'site_name': 'source',
                    'uniq_id': 'id'}, inplace=True)

## Let’s clean job titles step by step so they’re normalized and grouped properly.

In [4]:
df["job_title_cleaned"] = df["job_title"].str.lower()
df["job_title_Cleaned"] = df["job_title_cleaned"].str.replace(r'[^a-z0-9\s]', '', regex=True)
df["job_title_cleaned"] = df["job_title_cleaned"].str.strip()

In [6]:
data_scientist_jobs = df['job_title_cleaned'][df['job_title_cleaned'].str.startswith('data scientist', na=False)]
print(data_scientist_jobs.unique())


['data scientist / bangalore'
 'data scientist/vba programming/r programming/fmcg-cpg/bangalore'
 'data scientist - machine learning'
 'data scientist partners for start-up' 'data scientist'
 'data scientist-machine learning' 'data scientist big data ml'
 'data scientist (machine learning)' 'data scientist machine learning'
 'data scientist - bangalore' 'data scientist - telecom domain'
 'data scientist senior data scientist'
 'data scientist/algorithm model developer'
 'data scientist - machine learning/nlp'
 'data scientist - data modelling - forecasting'
 'data scientist (big data)' 'data scientist (bangalore)']


## Let’s automatically normalize all “Data Scientist” variants in your dataset. 
### We’ll do this in a systematic way

In [7]:
ds_titles_mask  = df["job_title_cleaned"].str.contains("data scientist", na=False)
ds_varitaions = df.loc[ds_titles_mask, "job_title_cleaned"].unique()
print("All 'Data scientist' variations: \n", ds_varitaions)

All 'Data scientist' variations: 
 ['lead data scientist' 'data scientist / bangalore'
 'data scientist/vba programming/r programming/fmcg-cpg/bangalore'
 'data scientist - machine learning'
 'data scientist partners for start-up' 'data scientist'
 'director - data scientist - nlp/r/sas' 'senior data scientist'
 'data scientist-machine learning' 'big data scientist'
 'data scientist big data ml' 'excellent opportunity for data scientist'
 'nlp data scientist' 'data scientist (machine learning)'
 'principal data scientist' 'data scientist machine learning'
 'data scientist - bangalore' 'data scientist - telecom domain'
 'senior data scientist / algorithms specialist'
 'data scientist senior data scientist' 'senior engineer data scientist'
 'chief data scientist' 'data scientist/algorithm model developer'
 'data scientist - machine learning/nlp'
 'data scientist - data modelling - forecasting'
 'principle statistician / data scientist' 'data scientist (big data)'
 'principle statistician

In [8]:
ds_mapping = {title: "data scientist" for title in ds_varitaions}
df['job_title_cleaned'] = df['job_title_cleaned'].replace(ds_mapping)

In [9]:
# Check unique job titles again
print(df['job_title_cleaned'].value_counts().head(20))

job_title_cleaned
business development manager                                              101
business development executive                                             96
software engineer                                                          88
android developer                                                          77
php developer                                                              72
project manager                                                            71
web designer                                                               69
content writer                                                             68
java developer                                                             61
sales executive                                                            60
senior software engineer                                                   60
dot net developer                                                          55
marketing executive                           

In [10]:

# Check only Data Scientist now
print(df[df['job_title_cleaned'] == 'data scientist'].shape[0])


40


In [11]:
data_scientist_jobs = df['job_title_cleaned'][df['job_title_cleaned'].str.startswith('data scientist', na=False)]
print(data_scientist_jobs.unique())

['data scientist']


# Result
All senior, junior, lead, or other variations of Data Scientist are now grouped under data scientist.
Makes aggregation and salary analysis much cleaner.
we can repeat this for other roles like Software Engineer, Frontend Engineer, Backend Engineer, etc., by changing the keyword in str.contains().

In [13]:
print(df.head())

                                      company  \
0                            MM Media Pvt Ltd   
1                          find live infotech   
2         Softtech Career Infosystem Pvt. Ltd   
3                      Onboard HRServices LLP   
4  Spire Technologies and Solutions Pvt. Ltd.   

                                           education   experience  \
0  UG: B.Tech/B.E. - Any Specialization PG:Any Po...    0 - 1 yrs   
1  UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...    0 - 0 yrs   
2  UG: Any Graduate - Any Specialization PG:Any P...    4 - 8 yrs   
3  UG: Any Graduate - Any Specialization PG:CA Do...  11 - 15 yrs   
4  UG: B.Tech/B.E. - Any Specialization PG:Any Po...    6 - 8 yrs   

                                   industry  \
0          Media / Entertainment / Internet   
1  Advertising / PR / MR / Event Management   
2           IT-Software / Software Services   
3    Banking / Financial Services / Broking   
4           IT-Software / Software Services   

       

In [14]:
df.to_csv("day2_cleaned.csv", index=False)