# Libraries

In [177]:
import pandas as pd
import numpy as np


import re 

## Spacy ##
import spacy
import spacy_lookups_data
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

## sklearn ##
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer


import umap

import matplotlib.pyplot as plt
# ML Models


from imblearn.under_sampling import NearMiss, RandomUnderSampler, InstanceHardnessThreshold,EditedNearestNeighbours
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
data = pd.read_csv('data/fake_job_postings.csv')

# Categorical

In [3]:
categorical_columns = data[["job_id","employment_type", "required_experience", 
                            "required_education", "industry", "function"]].copy()

In [4]:
categorical_columns = categorical_columns.fillna("other")


In [5]:
for column in categorical_columns.columns[1:]:
    categorical_columns[column] = categorical_columns[column].str.lower()

In [6]:
unique_industries_percentage = categorical_columns['industry'].value_counts()/categorical_columns.shape[0]*100

unique_industries_indexes = list(unique_industries_percentage[unique_industries_percentage >= 1].index)

other_industry = categorical_columns["industry"][~categorical_columns["industry"].isin(unique_industries_indexes)].unique()

categorical_columns["industry"].replace(other_industry,"other", inplace=True)

In [7]:
vals_to_replace = {"not applicable":"other", "mid-senior level":"associate", "director":"executive"}
categorical_columns["required_experience"] = categorical_columns["required_experience"].replace(vals_to_replace)


vals_to_replace = {"information technology and services":"information technologies","internet":"information technologies",
                   "computer software":"information technologies", "telecommunications":"information technologies"}

categorical_columns["industry"] = categorical_columns["industry"].replace(vals_to_replace)


vals_to_replace = {"unspecified":"other","some high school coursework":"high school or equivalent",
                   "vocational - hs diploma":"high school or equivalent", "mid-senior level":"associate",
                   "some college coursework completed":"associate degree", "professional":"vocational",
                   "vocational - degree":"vocational","certification":"vocational"}

categorical_columns["required_education"] = categorical_columns["required_education"].replace(vals_to_replace)


vals_to_replace = {"design":"marketing", "public relations":"marketing", "advertising":"marketing", "art/creative":"marketing", 
                   "financial analyst":"finance", "accounting/auditing":"finance","financial analyst":"finance",
                   "training":"human resources","data analyst":"information technology","supply chain":"production",
                   "manufacturing":"production", "quality assurance":"production","project management":"management",
                   "product management":"management", "strategy/planning": "management","sales":"business development",
                   "general business":"business development", "business analyst":"business development",
                   "purchasing":"business development","research":"research and development",
                   "science":"research and development"}

categorical_columns["function"] = categorical_columns["function"].replace(vals_to_replace)


In [8]:
# Split of localitation column

# new data frame with split value columns 
new = data["location"].str.split(",", n = 2, expand = True) 
  
# making separate columns from new data frame 
data["Country"]= new[0] 
data["Region"]= new[1]
data["City"] = new[2]


data['Country'].value_counts() # La distribución del 3r contry es menor a mil, y luego baja en picado.
top_countries = data['Country'].value_counts().head(4).index
other_countries = data['Country'][~data['Country'].isin(top_countries)].unique()

data['Country'].replace(other_countries, 'OTHER', inplace=True)

categorical_columns.loc[:,'Country'] = data.loc[:,'Country']

In [9]:
categorical_columns_dummies = pd.get_dummies(categorical_columns, columns = categorical_columns.columns[1:])

In [10]:
categorical_columns_dummies.head()

Unnamed: 0,job_id,employment_type_contract,employment_type_full-time,employment_type_other,employment_type_part-time,employment_type_temporary,required_experience_associate,required_experience_entry level,required_experience_executive,required_experience_internship,...,function_marketing,function_other,function_production,function_research and development,function_writing/editing,Country_CA,Country_GB,Country_GR,Country_OTHER,Country_US
0,1,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
1,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,4,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Numerical

In [11]:
numerical_columns = data[['job_id', 'telecommuting', 'has_company_logo', 'has_questions', 'fraudulent']].copy()

In [12]:
salary_rows_in_benefits = data['benefits'].str.extractall(r'(\$\d+\,?\d+)').reset_index()['level_0'].unique()
salary_rows_in_description = data['description'].str.extractall(r'(\$\d+\,?\d+)').reset_index()['level_0'].unique()


data.loc[salary_rows_in_benefits, 'salary_range'] = 1
data.loc[salary_rows_in_description, 'salary_range'] = 1

nan_salaries = data['salary_range'].isnull()
with_salaries = list(data['salary_range'][~data['salary_range'].isnull()].unique())

data['salary_range'] = data['salary_range'].replace(with_salaries, 1)
data['salary_range'] = data['salary_range'].fillna(0)

numerical_columns['salary_range'] = data['salary_range']

In [13]:
numerical_columns.head()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent,salary_range
0,1,0,1,0,0,0.0
1,2,0,1,0,0,1.0
2,3,0,1,0,0,0.0
3,4,0,1,0,0,0.0
4,5,0,1,1,0,0.0


# Text


In [14]:
data_text = data[['job_id', 'title','department', 'company_profile', 'description', 'requirements', 'benefits']].copy()

In [15]:
data_text = data_text.fillna('')
text_columns = data_text[['job_id']]
text_columns['text'] = data_text.iloc[:,1:].apply(lambda row: ''.join(row.values.astype(str)), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
text_columns.head()

Unnamed: 0,job_id,text
0,1,"Marketing InternMarketingWe're Food52, and we'..."
1,2,Customer Service - Cloud Video ProductionSucce...
2,3,Commissioning Machinery Assistant (CMA)Valor S...
3,4,Account Executive - Washington DCSalesOur pass...
4,5,Bill Review ManagerSpotSource Solutions LLC is...


In [17]:
def clean_urls(column):
    '''
    This functions takes an string and returns an string 
    with the url removed and the words in lower case.
    '''
    return column.apply(lambda x: x.lower()).apply(lambda x: re.sub('http[s]?://\S+', '', x))



text_columns['text'] = clean_urls(text_columns['text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [18]:
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser']) 
nlp.max_length = 15000000

def string_to_lemma(string):
    '''
    This function takes a sentence and returns the list of all lemma
    '''
    doc = nlp(string)
    l_token = [token.lemma_ for token in doc if not token.is_punct 
               | token.is_space | token.is_stop | token.is_digit & token.is_oov] 
    #singular_token = [token for token in nlp(l_token) if token.tag_ == 'NNPS' | token.tag_ == 'NNS']
    return ' '.join(l_token)


text_columns['text'] = text_columns['text'].apply(lambda row: string_to_lemma(row))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [19]:
text_columns.head()

Unnamed: 0,job_id,text
0,1,marketing internmarketingwe're food52 create g...
1,2,customer service cloud video productionsuccess...
2,3,commission machinery assistant cma)valor servi...
3,4,account executive washington dcsalesour passio...
4,5,bill review managerspotsource solution llc glo...


In [20]:
text_columns['text'].isnull().sum()

0

In [21]:
tfidf = TfidfVectorizer().fit_transform(text_columns['text'])


In [22]:
tfidf

<17880x112575 sparse matrix of type '<class 'numpy.float64'>'
	with 2919660 stored elements in Compressed Sparse Row format>

In [121]:
emmbedding = umap.UMAP(metric='hellinger', random_state=42).fit_transform(tfidf)

In [139]:
embedding

array([[-0.02905885, -5.728115  ],
       [ 3.247583  ,  1.4440734 ],
       [-0.7158655 ,  4.203626  ],
       ...,
       [-1.2177056 ,  2.3436766 ],
       [ 1.8744551 , -1.7942669 ],
       [ 4.5303855 , -1.8023908 ]], dtype=float32)

In [197]:
embedding_positive = embedding+20
"""Ponerlos positivos"""

'Ponerlos positivos'

In [143]:
umap_df = pd.DataFrame(embedding_positive)

In [144]:
umap_df

Unnamed: 0,0,1
0,19.970942,14.271885
1,23.247583,21.444073
2,19.284134,24.203627
3,32.890713,20.208000
4,24.334963,25.121124
...,...,...
17875,24.375195,18.551647
17876,20.846592,22.466179
17877,18.782295,22.343678
17878,21.874454,18.205732


In [145]:
text_colums_umap = pd.concat([text_columns['job_id'], umap_df], axis =1)

# Merge

In [146]:
result = pd.concat([categorical_columns_dummies, numerical_columns, text_colums_umap], axis=1,levels=None ,sort=False)

In [147]:
result.head()

Unnamed: 0,job_id,employment_type_contract,employment_type_full-time,employment_type_other,employment_type_part-time,employment_type_temporary,required_experience_associate,required_experience_entry level,required_experience_executive,required_experience_internship,...,Country_US,job_id.1,telecommuting,has_company_logo,has_questions,fraudulent,salary_range,job_id.2,0,1
0,1,0,0,1,0,0,0,0,0,1,...,1,1,0,1,0,0,0.0,1,19.970942,14.271885
1,2,0,1,0,0,0,0,0,0,0,...,0,2,0,1,0,0,1.0,2,23.247583,21.444073
2,3,0,0,1,0,0,0,0,0,0,...,1,3,0,1,0,0,0.0,3,19.284134,24.203627
3,4,0,1,0,0,0,1,0,0,0,...,1,4,0,1,0,0,0.0,4,32.890713,20.208
4,5,0,1,0,0,0,1,0,0,0,...,1,5,0,1,1,0,0.0,5,24.334963,25.121124


In [148]:
result.drop('job_id',axis =1 ,inplace =True)

In [149]:
result.head()

Unnamed: 0,employment_type_contract,employment_type_full-time,employment_type_other,employment_type_part-time,employment_type_temporary,required_experience_associate,required_experience_entry level,required_experience_executive,required_experience_internship,required_experience_other,...,Country_GR,Country_OTHER,Country_US,telecommuting,has_company_logo,has_questions,fraudulent,salary_range,0,1
0,0,0,1,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0.0,19.970942,14.271885
1,0,1,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,1.0,23.247583,21.444073
2,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0.0,19.284134,24.203627
3,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0.0,32.890713,20.208
4,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,1,1,0,0.0,24.334963,25.121124


In [150]:
# result.to_csv('result.csv')

In [187]:
Ramdom_sample = InstanceHardnessThreshold(random_state =42)

X_rus, y_rus = Ramdom_sample.fit_resample(result.drop('fraudulent', axis = 1), result['fraudulent'])

In [188]:
X_train, X_test, y_train, y_test = train_test_split(result.drop('fraudulent', axis=1),
                                                    result['fraudulent'], random_state = 42, test_size=0.2)
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_rus, y_rus, 
                                                                    random_state=42, test_size=0.2)

In [189]:
multi_layer_perceptron = MLPClassifier(hidden_layer_sizes=(300, ),random_state =42).fit(X_train, y_train)
y_mlp_pred = multi_layer_perceptron.predict(X_test)

print(confusion_matrix(y_test, y_mlp_pred))

print(classification_report(y_test, y_mlp_pred))

"""Datos sin balancear"""



[[3372   23]
 [  60  121]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3395
           1       0.84      0.67      0.74       181

    accuracy                           0.98      3576
   macro avg       0.91      0.83      0.87      3576
weighted avg       0.98      0.98      0.98      3576



'Datos sin balancear'

In [190]:
multi_layer_perceptron = MLPClassifier(hidden_layer_sizes=(300, ),random_state =42).fit(X_train_rus, y_train_rus)
y_mlp_pred_rus = multi_layer_perceptron.predict(X_test_rus)

print(confusion_matrix(y_test_rus, y_mlp_pred_rus))

print(classification_report(y_test_rus, y_mlp_pred_rus))

"""Datos Balanceados"""



[[2304   15]
 [  17  130]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2319
           1       0.90      0.88      0.89       147

    accuracy                           0.99      2466
   macro avg       0.94      0.94      0.94      2466
weighted avg       0.99      0.99      0.99      2466



'Datos Balanceados'

In [191]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

# Cambiar metodo del CrossValidation para incorporar Shuffle
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
    specificity = tn / (tn+fp)
    #logloss      = log_loss(y_test, y_pred)   # SVC & LinearSVC unable to use cvs

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'specificity': [specificity]
                             #'logloss'      : [logloss]
                            })   # timetaken: to be used for comparison later
    return df_model

In [192]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [193]:


models = {#'gnb': GaussianNB(),
          #'bnb': BernoulliNB(),
          #'mnb': MultinomialNB(),
          #'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          #'svc': SVC(probability=True),
          #'linearsvc': LinearSVC(),
          #'xgboost': GradientBoostingClassifier(),
          #'NN': MLPClassifier(),
           'MLP': MLPClassifier(hidden_layer_sizes=(300, ))
         }

models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])



"""Datos sin balancear"""



'Datos sin balancear'

In [194]:
models_df

Unnamed: 0,model,accuracy,precision,recall,f1score,rocauc,specificity
0,knn,0.977978,0.835406,0.684672,0.757259,0.930308,0.98388
0,decisiontree,0.970637,0.681395,0.69927,0.675016,0.834654,0.984104
0,randomforest,0.978607,0.904695,0.614599,0.72855,0.945131,0.982003
0,MLP,0.974483,0.845215,0.60146,0.7116,0.948421,0.978865


In [195]:
models = {#'gnb': GaussianNB(),
          #'bnb': BernoulliNB(),
          #'mnb': MultinomialNB(),
          #'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          #'svc': SVC(probability=True),
          #'linearsvc': LinearSVC(),
          #'xgboost': GradientBoostingClassifier(),
          #'NN': MLPClassifier(),
           'MLP': MLPClassifier(hidden_layer_sizes=(300, ))
        }

models_df = pd.concat([baseline_report(model, X_train_rus, X_test_rus, y_train_rus, y_test_rus, name) for (name, model) in models.items()])


"""Datos balanceados"""



'Datos balanceados'

In [196]:
models_df

Unnamed: 0,model,accuracy,precision,recall,f1score,rocauc,specificity
0,knn,0.983371,0.95247,0.802506,0.869766,0.960627,0.988894
0,decisiontree,0.981953,0.891829,0.852584,0.877502,0.912698,0.991387
0,randomforest,0.986616,0.984777,0.821989,0.885612,0.989911,0.990175
0,MLP,0.982663,0.938905,0.806537,0.85045,0.977965,0.991405
