## Load packages

In [197]:
import numpy as np
import pandas as pd
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModel

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


## Load Data

In [198]:
# Read in data and set index_col to 'job_id'
fake_job_data = pd.read_csv('../dataset/fake_job_postings.csv', index_col = 'job_id')

In [199]:
# print a few lines to preview the dataset
fake_job_data.head(2)

Unnamed: 0_level_0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [4]:
# check dataset shape
fake_job_data.shape

(17880, 17)

In [5]:
# check column types
fake_job_data.dtypes

title                  object
location               object
department             object
salary_range           object
company_profile        object
description            object
requirements           object
benefits               object
telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
function               object
fraudulent              int64
dtype: object

In [8]:
# Check missingness
## Check missing in target variables
target_missing = fake_job_data.fraudulent.isnull().sum()
print(f'# of target missing: {target_missing}')

## Check feature columns missing
fake_job_data.isnull().sum()

# of target missing: 0


title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [89]:
fake_job_data.salary_range[fake_job_data.salary_range.notnull()]

job_id
7          20000-28000
11       100000-120000
16       120000-150000
24       100000-120000
32         50000-65000
             ...      
17845              0-0
17850     80000-100000
17866      18000-20000
17868      18000-19000
17875     80000-100000
Name: salary_range, Length: 2868, dtype: object

In [95]:
fake_job_data['salary_range'].loc[7].split('-')

['20000', '28000']

In [109]:
fake_job_data['salary_range'] = fake_job_data['salary_range'].astype(str)

In [99]:
fake_job_data.salary_mean.head(2)

job_id
1    <class 'float'>
2    <class 'float'>
Name: salary_mean, dtype: object

In [19]:
# Check unbalanceness
## Check target value unbalanceness
target_count = fake_job_data.fraudulent.value_counts()
print(f'Fraudulent proportion: {target_count[1]/fake_job_data.shape[0]*100:.2f}%. The target is highly unbalanced!')

Fraudulent proportion: 4.84%. The target is highly unbalanced!


## Data Preprocessing

### Missing Value Imputation

In [None]:
# Missing value imputation
## Note here missing is not MAR at all since missing might be a great indicator of fraudulent occupation

In [77]:
# Missing value - industry
print(fake_job_data.industry.nunique())
print(fake_job_data.industry.isnull().sum())


131
4903


In [86]:
fake_job_data.groupby('industry')['title'].count().sort_values(ascending = False)

industry
Information Technology and Services    1734
Computer Software                      1376
Internet                               1062
Marketing and Advertising               828
Education Management                    822
                                       ... 
Shipbuilding                              1
Alternative Dispute Resolution            1
Ranching                                  1
Wine and Spirits                          1
Sporting Goods                            1
Name: title, Length: 131, dtype: int64

### Feature Engineering

In [62]:
# Split location feature into three
fake_job_data[['country', 'state', 'city']] = fake_job_data.location.str.split(', ', 2,expand = True)
fake_job_data.groupby('country').size()
fake_job_data[['country', 'state', 'city']] = fake_job_data[['country', 'state', 'city']].astype(str)
fake_job_data[['country', 'state', 'city']] = fake_job_data[['country', 'state', 'city']].replace('', 'Unknown')

## Encoding

### Encoding should be done after the train test split

In [67]:
# Target encoding for country by ratio of fradulent (need to do this after the train)
fake_job_data['country_fraudulent_ratio'] = fake_job_data.groupby('country')['fraudulent'].transform(lambda x: (x == 1).sum()/x.count())

In [69]:
fake_job_data.sort_values('country_fraudulent_ratio', ascending=False)

Unnamed: 0_level_0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,...,employment_type,required_experience,required_education,industry,function,fraudulent,country,state,city,country_fraudulent_ratio
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
860,Executive/Head Chef,"MY, ,",,65000-80000,Le Meridien is situated in the heart of kuala ...,Responsible for all food production including ...,Skills and Specifications: * Must have a pa...,,0,1,...,Contract,Executive,,Hospitality,,1,MY,Unknown,Unknown,0.571429
8035,Otak2 2014 Cohort (Round 1),"MY, 10,",,,,The Otak-Otak Program will be built around a f...,,,0,0,...,,,,,,0,MY,10,Unknown,0.571429
14979,Business Analyst / Solutions Consultant (Malay...,"MY, , Kuala Lumpur - Sentral",Software Products - Solutions,70000-100000,Want to be part of a NZ success story that’s g...,Evaluate and document business needs and techn...,To be successful in this role you will need to...,"We are in an exciting growth phase, if you wou...",0,1,...,Full-time,Mid-Senior level,Associate Degree,Information Technology and Services,Business Analyst,0,MY,Unknown,Kuala Lumpur - Sentral,0.571429
9104,Rooms Division Manager,"MY, ,",,,Awarded by Expatriate Lifestyle Magazine with ...,The Rooms Division Manager is responsible for ...,High school or equivalent education required. ...,,0,1,...,,,,,,1,MY,Unknown,Unknown,0.571429
6636,Executive Chef,"MY, ,",,,,Responsible for all food production including ...,Skills and Specifications: * Must have a pa...,,0,0,...,,,,,,1,MY,Unknown,Unknown,0.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15237,Developer,"GR, I, Athens",Engineering,,Workable is a venture-backed startup making cl...,Workable is a product-driven software company....,"To be considered seriously, you will also have...",Our goal is to create a company where employee...,0,1,...,Full-time,Not Applicable,Bachelor's Degree,Computer Software,Engineering,0,GR,I,Athens,0.000000
1828,Senior Java Developer,"GR, ,",,,Accepted Ltd. was founded in 2004 with the aim...,"As a Senior Java engineer, you will participat...",Minimum 4-6 years of professional development ...,"Competitive salary, and incentives according t...",0,1,...,Full-time,Associate,Bachelor's Degree,Information Technology and Services,Information Technology,0,GR,Unknown,Unknown,0.000000
4438,User Interface Designer,"DE, BE, Berlin",Product,,Babbel enables anyone to learn languages in an...,We are looking for an experienced Interaction ...,A strong and diverse portfolio that demonstrat...,We offer you:Strong impact in designing a prod...,0,1,...,Full-time,,,E-Learning,Product Management,0,DE,BE,Berlin,0.000000
10953,Dutch Market Developer for Uniplaces.com,"PT, 11, Lisbon",,,"We are an international, venture backed team m...",UniPlacesEntrepreneur in Residence for the Dut...,Market knowledge about the Dutch educational s...,What you getDirect contact with the senior tea...,0,1,...,Full-time,Internship,,Internet,,0,PT,11,Lisbon,0.000000


In [184]:
data = {'Group': ['A', 'A', 'B', 'A', 'B', 'B', 'A', 'B'],
        'Value': [10, np.nan, 5, 20, np.nan, 15, 30, 0],
        'Value2': [3,2,np.nan,3,2,2,3,1]}
df = pd.DataFrame(data)

In [185]:
df

Unnamed: 0,Group,Value,Value2
0,A,10.0,3.0
1,A,,2.0
2,B,5.0,
3,A,20.0,3.0
4,B,,2.0
5,B,15.0,2.0
6,A,30.0,3.0
7,B,0.0,1.0


In [186]:
df['ratio'] = df['Value2'] / df['Value']

In [187]:
df

Unnamed: 0,Group,Value,Value2,ratio
0,A,10.0,3.0,0.3
1,A,,2.0,
2,B,5.0,,
3,A,20.0,3.0,0.15
4,B,,2.0,
5,B,15.0,2.0,0.133333
6,A,30.0,3.0,0.1
7,B,0.0,1.0,inf


In [196]:
df.groupby('Group')['ratio'].apply(lambda x: x)

0    0.300000
1         NaN
2         NaN
3    0.150000
4         NaN
5    0.133333
6    0.100000
7         inf
Name: ratio, dtype: float64

In [182]:
data = {
    "Category": ["A", "B", "A", "B", "A", "B"],
    "Value1": [1, 5, 3, 7, 5, 9],
    "Value2": [6, 10, 8, 12, 10, 14]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Custom function to normalize columns within each group
def normalize_within_group(group):
    return (group - group.mean()) / group.std()

# Using groupby and apply to normalize multiple columns
result = df.groupby("Category")[['Value1', 'Value2']].apply(normalize_within_group)

# Display the result
print(result)

   Value1  Value2
0    -1.0    -1.0
1    -1.0    -1.0
2     0.0     0.0
3     0.0     0.0
4     1.0     1.0
5     1.0     1.0


## Text Preprocessing

### Tokenization

In [200]:
MODEL_NAME = "xlm-roberta-large"
roberta_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json: 100%|██████████| 616/616 [00:00<00:00, 79.9kB/s]
Downloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 13.6MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 9.10M/9.10M [00:00<00:00, 15.9MB/s]


In [201]:
row = fake_job_data.iloc[0]

In [205]:
row.company_profile

"We're Food52, and we've created a groundbreaking and award-winning cooking site. We support, connect, and celebrate home cooks, and give them everything they need in one place.We have a top editorial, business, and engineering team. We're focused on using technology to find new and better ways to connect people around their specific food interests, and to offer them superb, highly curated information about food and cooking. We attract the most talented home cooks and contributors in the country; we also publish well-known professionals like Mario Batali, Gwyneth Paltrow, and Danny Meyer. And we have partnerships with Whole Foods Market and Random House.Food52 has been named the best food website by the James Beard Foundation and IACP, and has been featured in the New York Times, NPR, Pando Daily, TechCrunch, and on the Today Show.We're located in Chelsea, in New York City."

In [206]:
encoding = roberta_tokenizer(str(row.company_profile), padding='max_length', 
                            truncation=True, max_length=512)

encoding = {key: val for key, val in encoding.items()}

In [212]:
encoding['input_ids']

[0,
 1401,
 25,
 107,
 34562,
 12744,
 4,
 136,
 642,
 25,
 272,
 75935,
 10,
 61585,
 70751,
 214,
 136,
 70318,
 9,
 69986,
 179065,
 1764,
 5,
 1401,
 8060,
 4,
 37067,
 4,
 136,
 176016,
 5368,
 110309,
 7,
 4,
 136,
 8337,
 2856,
 26818,
 1836,
 3871,
 23,
 1632,
 3687,
 5,
 12137,
 765,
 10,
 2663,
 68669,
 4,
 8063,
 4,
 136,
 177907,
 7175,
 5,
 1401,
 25,
 107,
 162393,
 98,
 17368,
 55556,
 47,
 7413,
 3525,
 136,
 11522,
 48322,
 47,
 37067,
 3395,
 10932,
 2363,
 29458,
 15381,
 33946,
 7,
 4,
 136,
 47,
 18645,
 2856,
 133924,
 4,
 103210,
 84553,
 297,
 4677,
 1672,
 15381,
 136,
 179065,
 5,
 1401,
 110281,
 70,
 2684,
 12348,
 297,
 5368,
 110309,
 7,
 136,
 22231,
 22230,
 23,
 70,
 23295,
 74,
 642,
 2843,
 80299,
 5299,
 9,
 69723,
 19,
 50582,
 1884,
 30883,
 199437,
 14,
 4,
 195511,
 12421,
 12530,
 18,
 15555,
 4,
 136,
 139020,
 134816,
 5,
 3493,
 642,
 765,
 165410,
 7,
 678,
 40469,
 133,
 34562,
 7,
 30318,
 136,
 39643,
 306,
 13038,
 5,
 204246,
 12744,
 1

In [213]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [214]:
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(4, 4), min_df=2)

In [216]:
fake_job_data['company_profile'] = fake_job_data['company_profile'].fillna('')

In [217]:
tfidf.fit(fake_job_data['company_profile'])

TfidfVectorizer(analyzer='char', min_df=2, ngram_range=(4, 4))

In [218]:
tfidf.vocabulary_

{"we'r": 64491,
 "e're": 33467,
 "'re ": 4289,
 're f': 55829,
 'e fo': 33203,
 ' foo': 1611,
 'food': 39659,
 'ood5': 52593,
 'od52': 51640,
 'd52,': 31171,
 '52, ': 14630,
 '2, a': 10366,
 ', an': 4959,
 ' and': 940,
 'and ': 24362,
 'nd w': 49360,
 'd we': 30565,
 " we'": 3149,
 "we'v": 64492,
 "e've": 33470,
 "'ve ": 4345,
 've c': 63864,
 'e cr': 33159,
 ' cre': 1248,
 'crea': 29921,
 'reat': 55928,
 'eate': 34832,
 'ated': 25218,
 'ted ': 60858,
 'ed a': 35200,
 'd a ': 30338,
 ' a g': 807,
 'a gr': 21772,
 ' gro': 1709,
 'grou': 40896,
 'roun': 56840,
 'ound': 53390,
 'undb': 63030,
 'ndbr': 49416,
 'dbre': 31830,
 'brea': 27457,
 'reak': 55922,
 'eaki': 34730,
 'akin': 23870,
 'king': 45099,
 'ing ': 43517,
 'ng a': 49823,
 'g an': 39940,
 'nd a': 49338,
 'd aw': 30359,
 ' awa': 1024,
 'awar': 25424,
 'ward': 64415,
 'ard-': 24749,
 'rd-w': 55756,
 'd-wi': 30703,
 '-win': 5949,
 'winn': 64633,
 'inni': 43591,
 'nnin': 50241,
 'ning': 50079,
 'ng c': 49825,
 'g co': 39964,
 ' co

In [219]:
v_trans = tfidf.transform(fake_job_data['company_profile'])

In [221]:
fake_job_data.shape

(17880, 17)

In [231]:
v_trans[0]

<1x70578 sparse matrix of type '<class 'numpy.float64'>'
	with 700 stored elements in Compressed Sparse Row format>

In [234]:
np.where(np.array([-1,0,1]) != 0)[0]

array([0, 2])