#### Imports

In [None]:
import pandas as pd
# Lift the column cap so wide DataFrames are displayed with every column.
pd.options.display.max_columns = None

#### Load datasets

In [4]:
# Load cleaned_ds_jobs.csv and preview a single random row.
# No seed is passed to sample(), so the row shown differs between runs.
ds_jobs_glassdor = pd.read_csv("../data/cleaned_ds_jobs.csv")
ds_jobs_glassdor.sample(n=1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue,min_salary,max_salary,avg_salary,job_state,same_state,company_age,python,excel,hadoop,spark,aws,tableau,big_data,job_simp,seniority
501,Data Scientist,212-331,Position: Data Scientist\nLocation: Denver\nSt...,3.6,Creative Circle,United States,"Los Angeles, CA",201 to 500 employees,Company - Public,Staffing & Outsourcing,Business Services,Unknown / Non-Applicable,212,331,271,US,0,18,1,0,0,0,0,0,0,data scientist,na


In [10]:
# Load the first salary table and preview a single random row
# (unseeded, so the row shown differs between runs).
ds_jobs_1 = pd.read_csv("../data/ds_salaries_1.csv")
ds_jobs_1.sample(n=1)

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
271,271,2021,SE,FT,Computer Vision Engineer,102000,BRL,18907,BR,0,BR,M


In [11]:
# Load the second salary table and preview a single random row
# (unseeded, so the row shown differs between runs).
ds_jobs_2 = pd.read_csv("../data/ds_salaries_2.csv")
ds_jobs_2.sample(n=1)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
3370,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M


#### Exploring data

    Shape

In [13]:
# Report (rows, columns) for both salary tables side by side.
print(f"DS Jobs 1 shape: {ds_jobs_1.shape}")
print(f"DS Jobs 2 shape: {ds_jobs_2.shape}")

DS Jobs 1 shape: (607, 12)
DS Jobs 2 shape: (3755, 11)


DS Jobs 1 has an extra column, 'Unnamed: 0': the old index, which gets written out as a column when a dataset is saved to CSV without passing `index=False`. Let's remove it.

In [14]:
# Drop the leftover CSV index column.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
ds_jobs_1 = ds_jobs_1.drop(columns=['Unnamed: 0'], errors='ignore')
ds_jobs_1.shape

(607, 11)

    'work_year' column

In [16]:
# Frequency of each work_year value in ds_jobs_1.
ds_jobs_1["work_year"].value_counts()

work_year
2022    318
2021    217
2020     72
Name: count, dtype: int64

In [17]:
# Frequency of each work_year value in ds_jobs_2.
ds_jobs_2["work_year"].value_counts()

work_year
2023    1785
2022    1664
2021     230
2020      76
Name: count, dtype: int64

    'employment_type' column

In [18]:
# Frequency of each employment_type code in ds_jobs_1.
ds_jobs_1["employment_type"].value_counts()

employment_type
FT    588
PT     10
CT      5
FL      4
Name: count, dtype: int64

In [20]:
# Frequency of each employment_type code in ds_jobs_2.
ds_jobs_2["employment_type"].value_counts()

employment_type
FT    3718
PT      17
CT      10
FL      10
Name: count, dtype: int64

    'job_title' column

In [25]:
# Frequency of each job_title in ds_jobs_1.
ds_jobs_1["job_title"].value_counts()

job_title
Data Scientist                              143
Data Engineer                               132
Data Analyst                                 97
Machine Learning Engineer                    41
Research Scientist                           16
Data Science Manager                         12
Data Architect                               11
Big Data Engineer                             8
Machine Learning Scientist                    8
Principal Data Scientist                      7
AI Scientist                                  7
Data Science Consultant                       7
Director of Data Science                      7
Data Analytics Manager                        7
ML Engineer                                   6
Computer Vision Engineer                      6
BI Data Analyst                               6
Lead Data Engineer                            6
Data Engineering Manager                      5
Business Data Analyst                         5
Head of Data                  

In [22]:
# Frequency of each job_title in ds_jobs_2.
ds_jobs_2["job_title"].value_counts()

job_title
Data Engineer                          1040
Data Scientist                          840
Data Analyst                            612
Machine Learning Engineer               289
Analytics Engineer                      103
                                       ... 
Principal Machine Learning Engineer       1
Azure Data Engineer                       1
Manager Data Management                   1
Marketing Data Engineer                   1
Finance Data Analyst                      1
Name: count, Length: 93, dtype: int64

    'company_location' column

In [23]:
# Frequency of each company_location code in ds_jobs_1.
ds_jobs_1["company_location"].value_counts()

company_location
US    355
GB     47
CA     30
DE     28
IN     24
FR     15
ES     14
GR     11
JP      6
NL      4
AT      4
PT      4
PL      4
LU      3
PK      3
BR      3
AE      3
MX      3
AU      3
TR      3
DK      3
IT      2
CZ      2
SI      2
RU      2
CH      2
NG      2
CN      2
BE      2
VN      1
EE      1
AS      1
DZ      1
MY      1
MD      1
KE      1
SG      1
CO      1
IR      1
CL      1
MT      1
IL      1
UA      1
IQ      1
RO      1
HR      1
NZ      1
HU      1
HN      1
IE      1
Name: count, dtype: int64

In [24]:
# Frequency of each company_location code in ds_jobs_2.
ds_jobs_2["company_location"].value_counts()

company_location
US    3040
GB     172
CA      87
ES      77
IN      58
      ... 
MK       1
BS       1
IR       1
CR       1
MT       1
Name: count, Length: 72, dtype: int64

    'company_size' column

In [26]:
# Frequency of each company_size code in ds_jobs_1.
ds_jobs_1["company_size"].value_counts()

company_size
M    326
L    198
S     83
Name: count, dtype: int64

In [27]:
# Frequency of each company_size code in ds_jobs_2.
ds_jobs_2["company_size"].value_counts()

company_size
M    3153
L     454
S     148
Name: count, dtype: int64

As both datasets have the same columns and encode the categorical ones the same way, let's concatenate them into a single DataFrame.

In [28]:
# Stack the rows of both salary tables into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# frame keeps its own row labels, so the combined frame contains repeated
# index values and label-based lookups (.loc) can return several rows.
concatenated_jobs = pd.concat([ds_jobs_1, ds_jobs_2], ignore_index=True)
concatenated_jobs.shape

(4362, 11)

Let's drop duplicate rows and see how much of the data was repeated.

In [29]:
# Remove rows that are identical across every column; the resulting shape
# shows how many rows survive.
# ignore_index=True relabels the remaining rows 0..n-1, so the index has no
# gaps and no repeated labels carried over from the concatenation.
# NOTE(review): identical rows could also be distinct entries that happen to
# share every value - confirm that treating them as repeats is intended.
concatenated_jobs = concatenated_jobs.drop_duplicates(ignore_index=True)
concatenated_jobs.shape

(2668, 11)

In [30]:
# Frequency of each company_location code in the de-duplicated combined frame.
concatenated_jobs["company_location"].value_counts()

company_location
US    1936
GB     184
CA      90
DE      61
IN      61
      ... 
DZ       1
MD       1
CL       1
MT       1
EG       1
Name: count, Length: 72, dtype: int64