# Importing necessary packages and data

In [1]:
import pandas as pd

In [117]:
job_data = pd.read_csv('Uncleaned_DS_jobs.csv')

In [118]:
job_data.head()

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst\n3.1,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna"
1,1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech\n4.2,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
2,2,Data Scientist,$137K-$171K (Glassdoor est.),Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group\n3.8,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1
3,3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON\n3.5,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"


### Right off the bat, we can see some mismatching data types, newline characters and random numbers. Let's fix that

In [119]:
job_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672 entries, 0 to 671
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              672 non-null    int64  
 1   Job Title          672 non-null    object 
 2   Salary Estimate    672 non-null    object 
 3   Job Description    672 non-null    object 
 4   Rating             672 non-null    float64
 5   Company Name       672 non-null    object 
 6   Location           672 non-null    object 
 7   Headquarters       672 non-null    object 
 8   Size               672 non-null    object 
 9   Founded            672 non-null    int64  
 10  Type of ownership  672 non-null    object 
 11  Industry           672 non-null    object 
 12  Sector             672 non-null    object 
 13  Revenue            672 non-null    object 
 14  Competitors        672 non-null    object 
dtypes: float64(1), int64(2), object(12)
memory usage: 78.9+ KB


The salary column is an object type but it would be better (and smarter) to have it as an int value

In [120]:
job_data['Salary Estimate'].unique()

array(['$137K-$171K (Glassdoor est.)', '$75K-$131K (Glassdoor est.)',
       '$79K-$131K (Glassdoor est.)', '$99K-$132K (Glassdoor est.)',
       '$90K-$109K (Glassdoor est.)', '$101K-$165K (Glassdoor est.)',
       '$56K-$97K (Glassdoor est.)', '$79K-$106K (Glassdoor est.)',
       '$71K-$123K (Glassdoor est.)', '$90K-$124K (Glassdoor est.)',
       '$91K-$150K (Glassdoor est.)', '$141K-$225K (Glassdoor est.)',
       '$145K-$225K(Employer est.)', '$79K-$147K (Glassdoor est.)',
       '$122K-$146K (Glassdoor est.)', '$112K-$116K (Glassdoor est.)',
       '$110K-$163K (Glassdoor est.)', '$124K-$198K (Glassdoor est.)',
       '$79K-$133K (Glassdoor est.)', '$69K-$116K (Glassdoor est.)',
       '$31K-$56K (Glassdoor est.)', '$95K-$119K (Glassdoor est.)',
       '$212K-$331K (Glassdoor est.)', '$66K-$112K (Glassdoor est.)',
       '$128K-$201K (Glassdoor est.)', '$138K-$158K (Glassdoor est.)',
       '$80K-$132K (Glassdoor est.)', '$87K-$141K (Glassdoor est.)',
       '$92K-$155K (Glassdo

In [121]:
def extract_salary(val):
    values = val.replace('$', '').replace('-', '').split('K', 2)[:2]
    min_sal = int(values[0]) * 1000
    max_sal = int(values[1]) * 1000

    return f"{min_sal} - {max_sal}"

In [122]:
job_data['Salary Estimate'] = job_data['Salary Estimate'].apply(extract_salary)

In [123]:
job_data['Salary Estimate']

0      137000 - 171000
1      137000 - 171000
2      137000 - 171000
3      137000 - 171000
4      137000 - 171000
            ...       
667    105000 - 167000
668    105000 - 167000
669    105000 - 167000
670    105000 - 167000
671    105000 - 167000
Name: Salary Estimate, Length: 672, dtype: object

I'm making an average salary column just so we can have a solid estimate of the base pay for these jobs. If we were to use this model for further analysis, this would help make better predictions as well

In [124]:
def avg_salary(val):
    values = val.split(' - ')
    min_sal = int(values[0])
    max_sal = int(values[1])

    return (min_sal + max_sal) // 2

In [126]:
job_data['Average Salary Estimate'] = job_data['Salary Estimate'].apply(avg_salary)

In [128]:
column_to_move = job_data.pop('Average Salary Estimate')
job_data.insert(3, 'Average Salary Estimate', column_to_move)

In [195]:
job_data.head()

Unnamed: 0,index,Job Title,Salary Estimate,Average Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,0,Sr Data Scientist,137000 - 171000,154000,The Senior Data Scientist is responsible for d...,3.1,Healthfirst,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna"
1,1,Data Scientist,137000 - 171000,154000,Secure our Nation Ignite your FutureJoin the t...,4.2,ManTech,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
2,2,Data Scientist,137000 - 171000,154000,Analysis Group is one of the largest internati...,3.8,Analysis Group,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1
3,3,Data Scientist,137000 - 171000,154000,Do you have a passion for Data and Machine Le...,3.5,INFICON,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,4,Data Scientist,137000 - 171000,154000,Data ScientistAffinity Solutions Marketing Cl...,2.9,Affinity Solutions,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"


"Job Description" and "Company Name" have new line characters and numbers that aren't really important. In order to extract any useful information from those columns, we have to clean them first

In [133]:
import re

In [219]:
def clean_col(val):
    clean_val = re.sub(r'[^\w\s\n0-9.,]', '', val)
    return clean_val

In [220]:
job_data['Job Description'] = job_data['Job Description'].apply(clean_col)

In [221]:
job_data['Company Name'] = job_data['Company Name'].apply(clean_col)

### Let's see what information we can extract from the job description that can benefit us, if we were to do further analysis

In [None]:
!pip install nltk

In [199]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\natba\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [201]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natba\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [207]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\natba\AppData\Roaming\nltk_data...


True

In [223]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

def tokenize(val):
    tokens = word_tokenize(val)

    tokens = [token.lower() for token in tokens]

    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    return filtered_tokens

In [224]:
job_data['Tokens'] = job_data['Job Description'].apply(tokenize)

In [None]:
column_to_move_2 = job_data.pop('Tokens')

In [None]:
job_data.insert(5, 'Tokens', column_to_move_2)

In [235]:
job_data.head()

Unnamed: 0,index,Job Title,Salary Estimate,Average Salary Estimate,Job Description,Tokens,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,0,Sr Data Scientist,137000 - 171000,154000,The Senior Data Scientist is responsible for d...,"[senior, data, scientist, responsible, definin...",3.1,Healthfirst,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna"
1,1,Data Scientist,137000 - 171000,154000,Secure our Nation Ignite your FutureJoin the t...,"[secure, nation, ignite, futurejoin, top, info...",4.2,ManTech,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
2,2,Data Scientist,137000 - 171000,154000,Analysis Group is one of the largest internati...,"[analysis, group, one, largest, international,...",3.8,Analysis Group,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1
3,3,Data Scientist,137000 - 171000,154000,Do you have a passion for Data and Machine Le...,"[passion, data, machine, learning, dream, work...",3.5,INFICON,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,4,Data Scientist,137000 - 171000,154000,Data ScientistAffinity Solutions Marketing Cl...,"[data, scientistaffinity, solutions, marketing...",2.9,Affinity Solutions,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"
