In [1]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
# Load the 'lukebarousse/data_jobs' dataset, then convert its 'train' split
# into a pandas DataFrame for the rest of the notebook.
dataset = load_dataset('lukebarousse/data_jobs')
train_split = dataset['train']
df = train_split.to_pandas()

# Data Cleanup
# Run the posting-date column through pd.to_datetime so it holds datetime values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [2]:
# Preview the salary columns for the first rows of the frame.
# Both .loc slices are label-based: rows up to label 10, and every column
# from 'salary_year_avg' through 'salary_hour_avg'.
df.loc[:10, 'salary_year_avg':'salary_hour_avg']

Unnamed: 0,salary_year_avg,salary_hour_avg
0,,
1,,
2,,
3,,
4,,
5,,
6,,
7,,
8,,
9,,


# Fill NaN values with Median

In [6]:
# Median of the yearly salary column, computed over the non-missing entries.
# This is the replacement value used for that column's NaN salaries.
median_salary_year = df['salary_year_avg'].median(skipna=True)

In [7]:
# Median of the hourly salary column, computed over the non-missing entries.
# This is the replacement value used for that column's NaN salaries.
median_salary_hour = df['salary_hour_avg'].median(skipna=True)

In [10]:
# The NaN replacement should not touch the original dataframe (best practice),
# so work on an independent copy. A plain `df_filled = df` would only bind a
# second name to the same object, and every later column assignment on
# df_filled would silently overwrite df as well.
df_filled = df.copy()

# fillna returns a NEW Series with the NaN values replaced by the median; it
# does not modify df_filled. This line therefore only previews the result.
df_filled['salary_year_avg'].fillna(median_salary_year)

0         115000.0
1         115000.0
2         115000.0
3         115000.0
4         115000.0
            ...   
785736    115000.0
785737    115000.0
785738    115000.0
785739    115000.0
785740    115000.0
Name: salary_year_avg, Length: 785741, dtype: float64

In [11]:
# fillna returns a new Series rather than editing the column, so each filled
# result has to be assigned back onto its column for the change to persist.
salary_fill_values = {
    'salary_year_avg': median_salary_year,
    'salary_hour_avg': median_salary_hour,
}
for salary_column, fill_value in salary_fill_values.items():
    df_filled[salary_column] = df_filled[salary_column].fillna(fill_value)

In [12]:
# Show the same leading slice of the salary columns from df_filled to check
# that the NaN values have been replaced.
df_filled.loc[:10, 'salary_year_avg':'salary_hour_avg']

Unnamed: 0,salary_year_avg,salary_hour_avg
0,115000.0,45.98
1,115000.0,45.98
2,115000.0,45.98
3,115000.0,45.98
4,115000.0,45.98
5,115000.0,45.98
6,115000.0,45.98
7,115000.0,45.98
8,115000.0,45.98
9,115000.0,45.98


# Dropping Duplicates

In [15]:
# Remove rows that are exact duplicates across every column.
# drop_duplicates returns a new DataFrame, so df_filled keeps all of its rows.
df_unique = df_filled.drop_duplicates()

# Report how many rows the de-duplication removed.
rows_before = len(df_filled)
rows_after = len(df_unique)
print('Length of original df:', rows_before)
print('Length of drop duplicates df:', rows_after)
print('Rows Dropped:', rows_before - rows_after)

Length of original df: 785741
Length of drop duplicates df: 785640
Rows Dropped: 101


In [16]:
# Narrow df_unique further: rows sharing the same job_title and company_name
# count as duplicates, and only the first row of each such pair is kept.
dedupe_keys = ['job_title', 'company_name']
df_unique = df_unique.drop_duplicates(subset=dedupe_keys)

# Counts are measured against df_filled, so 'Rows Dropped' is the total number
# of rows removed relative to df_filled, not just by this step.
rows_before = len(df_filled)
rows_after = len(df_unique)
print('Length of original df:', rows_before)
print('Length of drop duplicates df:', rows_after)
print('Rows Dropped:', rows_before - rows_after)

Length of original df: 785741
Length of drop duplicates df: 508042
Rows Dropped: 277699
