<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Project | ML: Job offers' Fraud-Detection with NLP

### Imports

In [19]:
import pandas as pd
import numpy as np


import re

In [2]:
# Read the job-postings CSV into `data`, the raw frame every later cell selects from.
# The path is relative to the notebook's working directory.
data = pd.read_csv('data/fake_job_postings.csv')

## 2. Data preprocessing

### Categorical

"Categorical" refers to columns that hold short categorical labels rather than whole sentences or paragraphs.

In [3]:
# Label-like features. `job_id` comes first on purpose: later cells skip it
# by slicing `columns[1:]`.
categorical_feature_names = [
    "job_id",
    "employment_type",
    "required_experience",
    "required_education",
    "industry",
    "function",
]

# Work on a copy so the cleaning steps below do not touch the raw `data` frame.
categorical_columns = data.loc[:, categorical_feature_names].copy()

In [4]:
# Missing labels get an explicit "other" category so that no NaN remains in the frame.
categorical_columns = categorical_columns.fillna(value="other")


In [5]:
# Lower-case every feature except the leading job_id column, so that labels
# differing only in casing end up in the same category.
feature_names = categorical_columns.columns[1:]
categorical_columns[feature_names] = categorical_columns[feature_names].apply(
    lambda values: values.str.lower()
)

In [6]:
# Share (in %) of postings per industry.
unique_industries_percentage = (
    categorical_columns["industry"].value_counts() / categorical_columns.shape[0] * 100
)

# Industries that account for at least 1% of the postings are kept as they are.
unique_industries_indexes = list(
    unique_industries_percentage[unique_industries_percentage >= 1].index
)

# Every other industry is collapsed into a single "other" bucket.
is_frequent_industry = categorical_columns["industry"].isin(unique_industries_indexes)
other_industry = categorical_columns.loc[~is_frequent_industry, "industry"].unique()

# Assign the result back instead of calling `replace(..., inplace=True)` on the
# selected column: the chained in-place form operates on a temporary Series and
# does not update the frame when pandas copy-on-write is active.
categorical_columns["industry"] = categorical_columns["industry"].where(
    is_frequent_industry, "other"
)

In [7]:
# Per-column mapping of raw labels to the coarser category they are merged into.
# Labels that are not listed are left unchanged.
replacements_by_column = {
    "required_experience": {
        "not applicable": "other",
        "mid-senior level": "associate",
        "director": "executive",
    },
    "industry": {
        "information technology and services": "information technologies",
        "internet": "information technologies",
        "computer software": "information technologies",
        "telecommunications": "information technologies",
    },
    "required_education": {
        "unspecified": "other",
        "some high school coursework": "high school or equivalent",
        "vocational - hs diploma": "high school or equivalent",
        # NOTE(review): "mid-senior level" looks like an experience level rather than
        # an education level; presumably a copy-paste leftover. Kept as-is; confirm.
        "mid-senior level": "associate",
        "some college coursework completed": "associate degree",
        "professional": "vocational",
        "vocational - degree": "vocational",
        "certification": "vocational",
    },
    "function": {
        "design": "marketing",
        "public relations": "marketing",
        "advertising": "marketing",
        "art/creative": "marketing",
        # The original dict listed "financial analyst" twice; one entry is enough.
        "financial analyst": "finance",
        "accounting/auditing": "finance",
        "training": "human resources",
        "data analyst": "information technology",
        "supply chain": "production",
        "manufacturing": "production",
        "quality assurance": "production",
        "project management": "management",
        "product management": "management",
        "strategy/planning": "management",
        "sales": "business development",
        "general business": "business development",
        "business analyst": "business development",
        "purchasing": "business development",
        "research": "research and development",
        "science": "research and development",
    },
}

# Apply each mapping to its own column; the columns are independent of each other.
for column, mapping in replacements_by_column.items():
    categorical_columns[column] = categorical_columns[column].replace(mapping)


In [8]:
# Split the location column into its comma-separated parts.
# At most two splits are made, so anything after the second comma stays in the third part.
location_parts = data["location"].str.split(",", n=2, expand=True)

# Store the parts as separate columns on the raw frame.
data["Country"] = location_parts[0]
data["Region"] = location_parts[1]
data["City"] = location_parts[2]

# Original author's note: the third country's count is already below a thousand,
# and the counts drop sharply after that. Only the four most frequent countries
# keep their own label.
top_countries = data["Country"].value_counts().head(4).index

# Every other country (including a missing location) becomes 'OTHER'.
# Assign the result back instead of calling `replace(..., inplace=True)` on the
# selected column: the chained in-place form operates on a temporary Series and
# does not update the frame when pandas copy-on-write is active.
data["Country"] = data["Country"].where(data["Country"].isin(top_countries), "OTHER")

categorical_columns["Country"] = data["Country"]

In [9]:
# One-hot encode every categorical feature; the leading job_id column is left untouched.
# The dtype is pinned so the indicators are 0/1 integers regardless of the pandas
# version (recent releases default to bool).
categorical_columns_dummies = pd.get_dummies(
    categorical_columns,
    columns=categorical_columns.columns[1:],
    dtype="uint8",
)

In [10]:
# Preview the first rows of the one-hot encoded categorical features.
categorical_columns_dummies.head()

Unnamed: 0,job_id,employment_type_contract,employment_type_full-time,employment_type_other,employment_type_part-time,employment_type_temporary,required_experience_associate,required_experience_entry level,required_experience_executive,required_experience_internship,...,function_marketing,function_other,function_production,function_research and development,function_writing/editing,Country_CA,Country_GB,Country_GR,Country_OTHER,Country_US
0,1,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
1,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,4,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Numerical

In [11]:
# Columns taken over from the raw frame as-is, plus job_id.
numeric_feature_names = [
    "job_id",
    "telecommuting",
    "has_company_logo",
    "has_questions",
    "fraudulent",
]

# Copy so that columns added later do not write into the raw `data` frame.
numerical_columns = data.loc[:, numeric_feature_names].copy()

In [12]:
# A dollar sign followed by at least two digits, optionally with one thousands comma
# (e.g. "$50,000" or "$15").
salary_pattern = r'\$\d+\,?\d+'

# A posting counts as disclosing a salary when the salary_range field is filled in
# or when a dollar amount appears in its benefits or description text.
mentions_salary = (
    data['salary_range'].notna()
    | data['benefits'].str.contains(salary_pattern, na=False)
    | data['description'].str.contains(salary_pattern, na=False)
)

# The raw `data['salary_range']` column is deliberately left untouched: the previous
# version overwrote it with 0/1, so running the cell a second time treated the 0s as
# "has a salary" and turned every row into 1.
# Stored as float (1.0 / 0.0) to match the values the original cell produced.
numerical_columns['salary_range'] = mentions_salary.astype(float)

In [13]:
# Preview the numerical features, including the derived salary_range flag.
numerical_columns.head()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent,salary_range
0,1,0,1,0,0,0.0
1,2,0,1,0,0,1.0
2,3,0,1,0,0,0.0
3,4,0,1,0,0,0.0
4,5,0,1,1,0,0.0


### Text


"Text" refers to columns whose values are whole sentences or paragraphs. These columns had to be tokenized and lemmatized with spaCy, and iterating over them was computationally demanding, so we preprocessed them separately in another notebook and import the resulting pickle directly here.

### Merge

In [None]:
# Put the three feature groups side by side, aligned on the row index.
# job_id is present in both the categorical and the numerical frame, so it
# appears twice in the merged result.
# NOTE(review): `text_colums_umap` is not defined in any visible cell; presumably
# it is the pickle produced by the text-preprocessing notebook. Confirm it is
# loaded before this cell runs.
feature_groups = [categorical_columns_dummies, numerical_columns, text_colums_umap]
result = pd.concat(feature_groups, axis=1, sort=False)

In [None]:
# Preview the merged frame; the job_id columns are still present at this point.
result.head()

In [None]:
# job_id is only an identifier, not a feature; dropping by label removes every
# column with that name. The result is reassigned instead of dropped in place,
# and a missing column is tolerated, so re-running this cell does not raise KeyError.
result = result.drop(columns='job_id', errors='ignore')

In [None]:
# Preview the merged frame again to check the columns after the drop above.
result.head()

In [None]:
# Optional: uncomment to save the merged feature table to disk.
# result.to_csv('result.csv')