In [71]:
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [100]:
df=pd.read_csv('sample_data/fake_job_postings.csv')

In [1]:
import torch 
torch.cuda.get_device_name(0)

'Tesla T4'

In [101]:
#--------1. split the columns to country, state, and city

# to use country only?
df['country']=df.location.str.split(',',expand=True)[:][0]

df['state']=df.location.str.split(',',expand=True)[:][1]
df['city']=df.location.str.split(',',expand=True)[:][2]

In [102]:

#----------2. replacing different kinds of missing value to np.nan
df.state = df.state.str.strip()
df.state.fillna(value=np.nan, inplace=True)
df.state.replace('', np.nan, inplace=True)
df.state.replace(' ', np.nan, inplace=True)


#df['state'].sort_values().unique()

In [103]:
#----------3. clear the whitespaces and signs at the start/end

df.city = df.city.str.strip(' /:\\')

#---------4. replacing different kinds of missing value to np.nan
df.city.fillna(value=np.nan, inplace=True)
df.city.replace('', np.nan, inplace=True)
df.city.replace(' ', np.nan, inplace=True)
df['city']=df.city.str.lower()
#------

#print(df['city'].sort_values().unique().tolist())

In [104]:
df.salary_range.fillna(value=np.nan, inplace=True)
df.salary_range.replace('', np.nan, inplace=True)
df.salary_range.replace(' ', np.nan, inplace=True)

#---------4. spilt salary range into min and max

df['min_salary']=df.salary_range.str.split('-',expand=True)[:][0]
df['max_salary']=df.salary_range.str.split('-',expand=True)[:][1]

df.max_salary.fillna(value=np.nan, inplace=True)

In [105]:
#----------5. for entry of salary_range as date, max and min salary are grouped as null value
df.loc[df['max_salary'].isin(['Apr', 'Dec', 'Jun', 'Nov', 'Oct', 'Sep']),['max_salary', 'min_salary']]=np.nan
df.loc[df['min_salary'].isin(['Dec', 'Jun', 'Oct']),['max_salary', 'min_salary']]=np.nan

#convert them into numerical value
df[['min_salary','max_salary']] = df[['min_salary','max_salary']].astype(float)

#for regression model, need to impute NaN values to median/mean
# df['max_salary'].fillna(value=df['max_salary'].mean(), inplace=True)
# df['min_salary'].fillna(value=df['min_salary'].mean(), inplace=True)

In [None]:
#-------------------end of processing for column salary_range and location

In [None]:
#-------------------start cleaning for text columns

In [106]:
# this function for preprocessing text is used linked from the next function

def preprocess_text(text):
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'(?u)\b\w\w+\b')
    tokens = tokeniser.tokenize(text)
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
    
    # Remove stopwords
    keywords= [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
    return keywords



In [79]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [136]:
df['text'] = df.apply(lambda row: (str(row['company_profile']) + ' ' + str(row['description'])  
                                            + ' ' + str(row['requirements']) + ' ' 
                                             + str(row['benefits'])), axis = 1)
# Fill empty columns with "Unspecified"
df['text'] = df['text'].fillna("Unspecified")
    
# Create an instance of TfidfVectorizer
vectoriser = TfidfVectorizer(analyzer=preprocess_text, ngram_range = (1,2),
                                min_df= 0.01, max_df= 0.6)

# Fit to the data and transform to feature matrix
text_column = vectoriser.fit_transform(df['text'])

# Convert sparse matrix to dataframe
text_column = pd.DataFrame.sparse.from_spmatrix(text_column)

# Save mapping on which index refers to which words
col_map = {v:k for k, v in vectoriser.vocabulary_.items()}

# Rename each column using the mapping
for col in text_column.columns:
    text_column.rename(columns={col: col_map[col]}, inplace=True)
    

KeyboardInterrupt: ignored

In [99]:
from sklearn import datasets, feature_extraction, decomposition
n, m = text_column.shape
k = 10
tSVD = decomposition.TruncatedSVD(n_components = k, random_state = 2022)
xtr = tSVD.fit_transform(text_column)


In [139]:
tSVD.feature_names_in_

array(['00', '000', '10', ..., 'young', 'zealand', 'zone'], dtype=object)

In [140]:
features = tSVD.feature_names_in_
for i, comp in enumerate(tSVD.components_):
  termsincomp = zip(features, comp)
  sorteditem = sorted(termsincomp, key = lambda x: x[1], reverse = True)[:10]
  print("Concept %d:") 
  for item in sorteditem:
    print(item[0])


Concept %d:
service
business
sales
market
customer
job
design
project
skills
development
Concept %d:
job
get
celta
tesol
tefl
passport
teachers
kid
loan
usd
Concept %d:
customer
mail
service
document
productivity
perform
process
file
increase
communications
Concept %d:
job
apprenticeship
website
wish
career
course
education
search
relevant
apply
Concept %d:
website
job
search
technical
manufacture
engineer
redirect
apply
1500
click
Concept %d:
design
web
mail
apprenticeship
development
software
course
document
enterprise
wish
Concept %d:
market
sales
media
social
website
digital
brand
search
campaign
content
Concept %d:
home
care
people
want
like
look
hours
great
us
get
Concept %d:
shall
digital
media
social
content
conflict
fail
video
per
news
Concept %d:
finance
employment
credit
months
financial
holiday
vacation
disability
status
account


In [119]:
text_column

Unnamed: 0,00,000,10,100,1000,11,12,13,14,15,...,writtenlifting,xml,year,years,yes,yet,york,young,zealand,zone
0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.168078,0.0,0.0,0.000000
1,0.0,0.094592,0.000000,0.037801,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
2,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
3,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.028790,0.018414,0.0,0.0,0.000000,0.0,0.0,0.000000
4,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.022219,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,0.0,0.035881,0.033893,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
17876,0.0,0.000000,0.000000,0.034963,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.033737,0.0,0.0,0.000000,0.0,0.0,0.000000
17877,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.051933,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.049244,0.0,0.0,0.000000,0.0,0.0,0.000000
17878,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000


In [111]:
text_columns = ['company_profile', 'description', 'requirements', 'benefits']

for i in text_columns:
  df[i] = df[i].fillna("unspecified")

df['requirements'] = df['requirements'].fillna("Unspecified")
df['benefits'] = df['benefits'].fillna("Unspecified")
df['company_profile'] = df['company_profile'].fillna("Unspecified")
df['description'] = df['description'].fillna("Unspecified")

df['has_requirements'] = df['company_profile'].apply(lambda x: 0 if x == "unspecified" else 1)
df['has_comp_profile'] = df['description'].apply(lambda x: 0 if x == "unspecified" else 1)
df['has_description'] = df['requirements'].apply(lambda x: 0 if x == "unspecified" else 1)
df['has_benefits'] = df['benefits'].apply(lambda x: 0 if x == "unspecified" else 1)

In [118]:
vectoriser

array([ 0.2351722 , -0.03533241, -0.04471793, -0.01997062, -0.00996489,
       -0.02901655,  0.10870633,  0.12349406,  0.06791446, -0.04269961])

In [114]:
df = pd.concat([df, xtr], axis =1, ignore_index=False)

TypeError: ignored

In [113]:
# x, y split
X = df.drop(columns= ['text', 'job_id', 'fraudulent', 'company_profile', 'description', 'requirements', 'benefits'])
y = df['fraudulent']

KeyError: ignored

In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   job_id               17880 non-null  int64  
 1   title                17880 non-null  object 
 2   location             17534 non-null  object 
 3   department           6333 non-null   object 
 4   salary_range         2868 non-null   object 
 5   company_profile      17880 non-null  object 
 6   description          17880 non-null  object 
 7   requirements         17880 non-null  object 
 8   benefits             17880 non-null  object 
 9   telecommuting        17880 non-null  int64  
 10  has_company_logo     17880 non-null  int64  
 11  has_questions        17880 non-null  int64  
 12  employment_type      14409 non-null  object 
 13  required_experience  10830 non-null  object 
 14  required_education   9775 non-null   object 
 15  industry             12977 non-null 

In [19]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [97]:
X.head()

Unnamed: 0,title,location,department,salary_range,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,...,writtenlifting,xml,year,years,yes,yet,york,young,zealand,zone
0,Marketing Intern,"US, NY, New York",Marketing,,0,1,0,Other,Internship,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.168078,0.0,0.0,0.0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,0,1,0,Full-time,Not Applicable,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,0,1,0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Account Executive - Washington DC,"US, DC, Washington",Sales,,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,...,0.0,0.0,0.02879,0.018414,0.0,0.0,0.0,0.0,0.0,0.0
4,Bill Review Manager,"US, FL, Fort Worth",,,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,...,0.0,0.0,0.0,0.022219,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
#df_final = df.merge(df_text, on = ['company_profile', 'description', 'requirements', 'benefits'],
                    #how = 'inner')
#df_final

In [None]:
#-------------------end cleaning for text columns

In [None]:
#-------------------------start word2Vec----------------------------------------

In [34]:
from gensim.models import KeyedVectors
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)


FileNotFoundError: ignored