<a href="https://colab.research.google.com/github/pallavi-vishnu/Uncleaned_data/blob/main/glassdoor_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Import necessary packages.

from google.colab import files
from os import environ
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import re
sns.set()

In [10]:
# Import the dataset from kaggle

!pip install -q kaggle


In [11]:
# upload kaggle API key
uploaded = files.upload()

Saving Uncleaned_DS_jobs.csv to Uncleaned_DS_jobs (1).csv


In [12]:
# Convert the csv file into a dataframe to make it more readable.
raw_data = pd.read_csv('Uncleaned_DS_jobs.csv', index_col = 'index')
raw_data.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors'],
      dtype='object')

In [13]:
raw_data.describe() ## Take a look at the descriptive statistics.

Unnamed: 0,Rating,Founded
count,672.0,672.0
mean,3.518601,1635.529762
std,1.410329,756.74664
min,-1.0,-1.0
25%,3.3,1917.75
50%,3.8,1995.0
75%,4.3,2009.0
max,5.0,2019.0


In [14]:
# prints information about a DataFrame including the index dtype and columns, non-null values and memory usage.

raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 672 entries, 0 to 671
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          672 non-null    object 
 1   Salary Estimate    672 non-null    object 
 2   Job Description    672 non-null    object 
 3   Rating             672 non-null    float64
 4   Company Name       672 non-null    object 
 5   Location           672 non-null    object 
 6   Headquarters       672 non-null    object 
 7   Size               672 non-null    object 
 8   Founded            672 non-null    int64  
 9   Type of ownership  672 non-null    object 
 10  Industry           672 non-null    object 
 11  Sector             672 non-null    object 
 12  Revenue            672 non-null    object 
 13  Competitors        672 non-null    object 
dtypes: float64(1), int64(1), object(12)
memory usage: 78.8+ KB


In [15]:
# Look for missing data

raw_data.isna().sum()

Job Title            0
Salary Estimate      0
Job Description      0
Rating               0
Company Name         0
Location             0
Headquarters         0
Size                 0
Founded              0
Type of ownership    0
Industry             0
Sector               0
Revenue              0
Competitors          0
dtype: int64

In [16]:
# Take a look at the variables with object data type.

raw_data.select_dtypes('object').head()



Unnamed: 0_level_0,Job Title,Salary Estimate,Job Description,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue,Competitors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description\n\nThe Senior Data Scientist is re...,Healthfirst\n3.1,"New York, NY","New York, NY",1001 to 5000 employees,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna"
1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future\n\nJoin ...",ManTech\n4.2,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
2,Data Scientist,$137K-$171K (Glassdoor est.),Overview\n\n\nAnalysis Group is one of the lar...,Analysis Group\n3.8,"Boston, MA","Boston, MA",1001 to 5000 employees,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1
3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION:\n\nDo you have a passion for ...,INFICON\n3.5,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"


# We can see that the data type of the column salary estimate is object.It would be convenient to have the salary estimate in numbers.

In [17]:
# Extract numeric values from the salary estimate column and convert it into list of integers


salaries = []
for salary in raw_data['Salary Estimate']:
  numeric = re.findall(r'\d+', salary)
  salaries.append(list(map(int, numeric)))
# Create two lists of minimum salary and maximum salary
minimum_sal = []
maximum_sal = []
for salary in salaries:
  minimum_sal.append(min(salary)*1000)
  maximum_sal.append(max(salary)*1000)

# create new columns for minimum salary and maximum salary in the raw data
raw_data['Minimum Salary'] = minimum_sal
raw_data['Maximum Salary'] = maximum_sal



In [18]:
# Drop the original salary estimate column from the dataset

data1 = raw_data.drop('Salary Estimate', axis = 1)

In [19]:
data1

Unnamed: 0_level_0,Job Title,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Minimum Salary,Maximum Salary
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,Sr Data Scientist,Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst\n3.1,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna",137000,171000
1,Data Scientist,"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech\n4.2,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1,137000,171000
2,Data Scientist,Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group\n3.8,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1,137000,171000
3,Data Scientist,JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON\n3.5,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech...",137000,171000
4,Data Scientist,Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",137000,171000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,Data Scientist,Summary\n\nWe’re looking for a data scientist ...,3.6,TRANZACT\n3.6,"Fort Lee, NJ","Fort Lee, NJ",1001 to 5000 employees,1989,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,-1,105000,167000
668,Data Scientist,Job Description\nBecome a thought leader withi...,-1.0,JKGT,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1,105000,167000
669,Data Scientist,Join a thriving company that is changing the w...,-1.0,AccessHope,"Irwindale, CA",-1,-1,-1,-1,-1,-1,-1,-1,105000,167000
670,Data Scientist,100 Remote Opportunity As an AINLP Data Scient...,5.0,ChaTeck Incorporated\n5.0,"San Francisco, CA","Santa Clara, CA",1 to 50 employees,-1,Company - Private,Advertising & Marketing,Business Services,$1 to $5 million (USD),-1,105000,167000


In [21]:
# Lets take a look at the Size column.

data1['Size'].unique()

array(['1001 to 5000 employees', '5001 to 10000 employees',
       '501 to 1000 employees', '51 to 200 employees', '10000+ employees',
       '201 to 500 employees', '1 to 50 employees', '-1', 'Unknown'],
      dtype=object)

In [40]:
#  Make it more readable by removing the word 'employees'
data1['Size'] = data1['Size'].str.replace(' employees', '')
# Replace the word to with '-'
data1['Size'] = data1['Size'].str.replace('to', '-')


In [37]:
# Now lets take a look at the size column again
data1['Size'].value_counts()

Size
51 - 200        135
1001 - 5000     104
1 - 50           86
201 - 500        85
10000+           80
501 - 1000       77
5001 - 10000     61
-1               27
Unknown          17
Name: count, dtype: int64

**Looks like some of the organizations has a size of -1 and Unkown, which does not make any sense. Since the number of inconsistent values is 44, which is less than 10% of the size of data, we can drop the inconsistent values. Drop the inconsistent data.**

In [41]:
# Find the inconsistent data and drop the rows with inconsistent data
invalid_entries = data1[(data1['Size']=='-1') | (data1['Size']=='Unknown')].index
data2 = data1.drop(invalid_entries)
data2['Size'].value_counts()
# reset the index.
data2.reset_index(drop = True)

Unnamed: 0,Job Title,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Minimum Salary,Maximum Salary
0,Sr Data Scientist,Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst\n3.1,"New York, NY","New York, NY",1001 - 5000,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna",137000,171000
1,Data Scientist,"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech\n4.2,"Chantilly, VA","Herndon, VA",5001 - 10000,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1,137000,171000
2,Data Scientist,Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group\n3.8,"Boston, MA","Boston, MA",1001 - 5000,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1,137000,171000
3,Data Scientist,JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON\n3.5,"Newton, MA","Bad Ragaz, Switzerland",501 - 1000,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech...",137000,171000
4,Data Scientist,Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 - 200,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",137000,171000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,Data Scientist,Criterion Systems seeks a Data Scientist to su...,3.8,"Criterion Systems, Inc.\n3.8","Vienna, VA","Vienna, VA",201 - 500,2005,Company - Private,IT Services,Information Technology,$50 to $100 million (USD),-1,105000,167000
624,Data Scientist,About Foundation Medicine:\n\nFoundation Medic...,4.0,Foundation Medicine\n4.0,"Boston, MA","Cambridge, MA",1001 - 5000,2010,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$100 to $500 million (USD),"Genomic Health, Myriad Genetics, The Broad Ins...",105000,167000
625,Data Scientist,Summary\n\nWe’re looking for a data scientist ...,3.6,TRANZACT\n3.6,"Fort Lee, NJ","Fort Lee, NJ",1001 - 5000,1989,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,-1,105000,167000
626,Data Scientist,100 Remote Opportunity As an AINLP Data Scient...,5.0,ChaTeck Incorporated\n5.0,"San Francisco, CA","Santa Clara, CA",1 - 50,-1,Company - Private,Advertising & Marketing,Business Services,$1 to $5 million (USD),-1,105000,167000


In [42]:
# create a column for average salary for the different job titles
data2['Mean Salary'] = (data2['Maximum Salary']+data2['Minimum Salary'])/2
data2.head()

Unnamed: 0_level_0,Job Title,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Minimum Salary,Maximum Salary,Mean Salary
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,Sr Data Scientist,Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst\n3.1,"New York, NY","New York, NY",1001 - 5000,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna",137000,171000,154000.0
1,Data Scientist,"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech\n4.2,"Chantilly, VA","Herndon, VA",5001 - 10000,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1,137000,171000,154000.0
2,Data Scientist,Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group\n3.8,"Boston, MA","Boston, MA",1001 - 5000,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1,137000,171000,154000.0
3,Data Scientist,JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON\n3.5,"Newton, MA","Bad Ragaz, Switzerland",501 - 1000,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech...",137000,171000,154000.0
4,Data Scientist,Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 - 200,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",137000,171000,154000.0


In [25]:
data2['Company Name'] = data2['Company Name'].str.rstrip('\n1234567890.')
data2['Company Name']

index
0                 Healthfirst
1                     ManTech
2              Analysis Group
3                     INFICON
4          Affinity Solutions
                ...          
665    Criterion Systems, Inc
666       Foundation Medicine
667                  TRANZACT
670      ChaTeck Incorporated
671             1-800-Flowers
Name: Company Name, Length: 645, dtype: object

In [43]:
data2['Job Title'].value_counts()

Job Title
Data Scientist                                            299
Data Engineer                                              26
Senior Data Scientist                                      19
Machine Learning Engineer                                  15
Data Analyst                                               12
                                                         ... 
Data Engineer (Remote)                                      1
Data Science Instructor                                     1
Business Data Analyst                                       1
Purification Scientist                                      1
AI/ML - Machine Learning Scientist, Siri Understanding      1
Name: count, Length: 169, dtype: int64