In [2]:
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df=pd.read_csv('fake_job_postings.csv')

In [4]:
import torch 
torch.cuda.get_device_name(0)

'Tesla T4'

In [5]:
#--------1. split the columns to country, state, and city

# to use country only?
df['country']=df.location.str.split(',',expand=True)[:][0]

df['state']=df.location.str.split(',',expand=True)[:][1]
df['city']=df.location.str.split(',',expand=True)[:][2]

In [6]:

#----------2. replacing different kinds of missing value to np.nan
df.state = df.state.str.strip()
df.state.fillna(value=np.nan, inplace=True)
df.state.replace('', np.nan, inplace=True)
df.state.replace(' ', np.nan, inplace=True)


#df['state'].sort_values().unique()

In [7]:
#----------3. clear the whitespaces and signs at the start/end

df.city = df.city.str.strip(' /:\\')

#---------4. replacing different kinds of missing value to np.nan
df.city.fillna(value=np.nan, inplace=True)
df.city.replace('', np.nan, inplace=True)
df.city.replace(' ', np.nan, inplace=True)
df['city']=df.city.str.lower()
#------

#print(df['city'].sort_values().unique().tolist())

In [8]:
df.salary_range.fillna(value=np.nan, inplace=True)
df.salary_range.replace('', np.nan, inplace=True)
df.salary_range.replace(' ', np.nan, inplace=True)

#---------4. spilt salary range into min and max

df['min_salary']=df.salary_range.str.split('-',expand=True)[:][0]
df['max_salary']=df.salary_range.str.split('-',expand=True)[:][1]

df.max_salary.fillna(value=np.nan, inplace=True)

In [9]:
#----------5. for entry of salary_range as date, max and min salary are grouped as null value
df.loc[df['max_salary'].isin(['Apr', 'Dec', 'Jun', 'Nov', 'Oct', 'Sep']),['max_salary', 'min_salary']]=np.nan
df.loc[df['min_salary'].isin(['Dec', 'Jun', 'Oct']),['max_salary', 'min_salary']]=np.nan

#convert them into numerical value
df[['min_salary','max_salary']] = df[['min_salary','max_salary']].astype(float)

#for regression model, need to impute NaN values to median/mean
# df['max_salary'].fillna(value=df['max_salary'].mean(), inplace=True)
# df['min_salary'].fillna(value=df['min_salary'].mean(), inplace=True)

In [None]:
#-------------------end of processing for column salary_range and location

In [None]:
#-------------------start cleaning for text columns

In [10]:
# this function for preprocessing text is used linked from the next function

def preprocess_text(text):
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'(?u)\b\w\w+\b')
    tokens = tokeniser.tokenize(text)
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
    
    # Remove stopwords
    keywords= [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
    return keywords



In [11]:
def vectorise_column(df, column):
    # Fill empty columns with "Unspecified"
    df[column] = df[column].fillna("Unspecified")
    
    # Create an instance of TfidfVectorizer
    vectoriser = TfidfVectorizer(analyzer=preprocess_text, ngram_range = (1,2))

    # Fit to the data and transform to feature matrix
    text_column = vectoriser.fit_transform(df[column])

    # Convert sparse matrix to dataframe
    text_column = pd.DataFrame.sparse.from_spmatrix(text_column)

    # Save mapping on which index refers to which words
    col_map = {v:k for k, v in vectoriser.vocabulary_.items()}

    # Rename each column using the mapping
    for col in text_column.columns:
        text_column.rename(columns={col: col_map[col]}, inplace=True)
    
    # Combined to dataframe
    combined = [df, text_column]
    df = pd.concat(combined, axis =1)
    
    return df

In [16]:
# Create new table for text which can be merged to main table later

df_text = df.loc[:, ['company_profile', 'description', 'requirements', 'benefits']]
df_text['text'] = df_text.apply(lambda row: (str(row['company_profile']) + ' ' + str(row['description'])  
                                            + ' ' + str(row['requirements']) + ' ' + str(row['benefits'])), axis = 1)
to_merge = vectorise_column(df_text, 'text')

# Only the following columns need to be merged
# to_merge.iloc[:,4:]

In [25]:
# saving merged df_text as csv
#df_text.to_csv( "df_text.csv", index=False, encoding='utf-8-sig')

Unnamed: 0,company_profile,description,requirements,benefits,text
0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,"We're Food52, and we've created a groundbreaki..."
1,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,"90 Seconds, the worlds Cloud Video Production ..."
2,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,Valor Services provides Workforce Solutions th...
3,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,Our passion for improving quality of life thro...
4,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,SpotSource Solutions LLC is a Global Human Cap...
...,...,...,...,...,...
17875,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,Vend is looking for some awesome new talent to...
17876,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,WebLinc is the e-commerce platform and service...
17877,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,We Provide Full Time Permanent Positions for m...
17878,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,nan Nemsia Studios is looking for an experienc...


In [28]:
#df_final = df.merge(df_text, on = ['company_profile', 'description', 'requirements', 'benefits'],
                    #how = 'inner')
#df_final

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,required_education,industry,function,fraudulent,country,state,city,min_salary,max_salary,text
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,...,,,Marketing,0,US,NY,new york,,,"We're Food52, and we've created a groundbreaki..."
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,...,,Marketing and Advertising,Customer Service,0,NZ,,auckland,,,"90 Seconds, the worlds Cloud Video Production ..."
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,...,,,,0,US,IA,wever,,,Valor Services provides Workforce Solutions th...
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,...,Bachelor's Degree,Computer Software,Sales,0,US,DC,washington,,,Our passion for improving quality of life thro...
4,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,...,Bachelor's Degree,Computer Software,Sales,0,US,DC,washington,,,Our passion for improving quality of life thro...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177801,17874,Recruiting Coordinator,"US, NC, Charlotte",,,,RESPONSIBILITIES:Will facilitate the recruitin...,REQUIRED SKILLS:Associates Degree or a combina...,,0,...,,Utilities,,0,US,NC,charlotte,,,nan RESPONSIBILITIES:Will facilitate the recru...
177802,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,...,,Computer Software,Sales,0,CA,ON,toronto,,,Vend is looking for some awesome new talent to...
177803,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,...,Bachelor's Degree,Internet,Accounting/Auditing,0,US,PA,philadelphia,,,WebLinc is the e-commerce platform and service...
177804,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,...,,,,0,US,TX,houston,,,We Provide Full Time Permanent Positions for m...


In [None]:
#-------------------end cleaning for text columns