In [1]:
# Main libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

# Natural Language Libraries
import re 

## Spacy ##
import spacy
import spacy_lookups_data
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

## sklearn ##
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# ML Models


### Load Dataset

In [2]:
data = pd.read_csv('data/fake_job_postings.csv')
print(data.shape)
data.head()

(17880, 18)


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


### Subsetting the text part of the DataFrame

In [3]:
### Separate the text columns into two

text = data[['title', 'company_profile', 'description', 'requirements', 'benefits', 'fraudulent']].fillna('')

text['company'] = text[['company_profile', 'description']].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
text['offer'] = text[['title', 'requirements', 'benefits']].apply(lambda row: ''.join(row.values.astype(str)), axis=1)

text.drop(['title', 'company_profile', 'description', 'requirements', 'benefits'], axis=1, inplace=True)
text.head()

Unnamed: 0,fraudulent,company,offer
0,0,"We're Food52, and we've created a groundbreaki...",Marketing InternExperience with content manage...
1,0,"90 Seconds, the worlds Cloud Video Production ...",Customer Service - Cloud Video ProductionWhat ...
2,0,Valor Services provides Workforce Solutions th...,Commissioning Machinery Assistant (CMA)Impleme...
3,0,Our passion for improving quality of life thro...,Account Executive - Washington DCEDUCATION: Ba...
4,0,SpotSource Solutions LLC is a Global Human Cap...,Bill Review ManagerQUALIFICATIONS:RN license i...


In [4]:
# Split the dataset (text) into fraudulent or not

text_fraudulent = text[text['fraudulent'] == 1]
text_not_fraudulent = text[text['fraudulent'] == 0]

In [5]:
## FAST EDA of main Words for text-fraudulent / no and column description
'''
columns = [text_fraudulent["company"], text_fraudulent['offer'], text_not_fraudulent['company'], text_not_fraudulent['offer']] 
for column in columns:
    str_bow_raw = ' '.join([' '.join(column.astype(str).tolist())])
    wordcloud = WordCloud(width=480, height=480, margin=0).generate(str_bow_raw)
 
    # Display the generated image:
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()
'''

'\ncolumns = [text_fraudulent["company"], text_fraudulent[\'offer\'], text_not_fraudulent[\'company\'], text_not_fraudulent[\'offer\']] \nfor column in columns:\n    str_bow_raw = \' \'.join([\' \'.join(column.astype(str).tolist())])\n    wordcloud = WordCloud(width=480, height=480, margin=0).generate(str_bow_raw)\n \n    # Display the generated image:\n    plt.imshow(wordcloud, interpolation=\'bilinear\')\n    plt.axis("off")\n    plt.margins(x=0, y=0)\n    plt.show()\n'

## Text Cleaning

In [6]:
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser']) 
nlp.max_length = 15000000

In [7]:
### Clean company (columns: 'company_profile', 'description')
# Lower and Urls clean
def clean_urls(column):
    '''
    This functions takes an string and returns an string 
    with the url removed and the words in lower case.
    '''
    return column.apply(lambda x: x.lower()).apply(lambda x: re.sub('http[s]?://\S+', '', x))
    
text['company'] = text['company'].apply(lambda x: x.lower())
text['company'] = text['company'].apply(lambda x: re.sub('http[s]?://\S+', '', x))
text['company'].head(3)

0    we're food52, and we've created a groundbreaki...
1    90 seconds, the worlds cloud video production ...
2    valor services provides workforce solutions th...
Name: company, dtype: object

In [8]:
def string_to_lemma(string):
    '''
    This function takes a sentence and returns the list of all lemma
    '''
    doc = nlp(string)
    l_token = [token.lemma_ for token in doc if not token.is_punct 
               | token.is_space | token.is_stop | token.is_digit & token.is_oov]
    #singular_token = [token for token in nlp(l_token) if token.tag_ == 'NNPS' | token.tag_ == 'NNS']
    return ' '.join(l_token)
    

In [9]:
### High Memory consumption ###

text['company'] = text['company'].apply(lambda row: string_to_lemma(row))

### Clean offer columns ('title', 'requirements', 'benefits')

In [19]:
text['offer'] = text['offer'].apply(lambda x: x.lower())
text['offer'] = text['offer'].apply(lambda x: re.sub('http[s]?://\S+', '', x))
text['offer'] = text['offer'].apply(lambda row: string_to_lemma(row))

## TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
tfidf_company = TfidfVectorizer().fit_transform(text['company'])
tfidf_offer = TfidfVectorizer().fit_transform(text['offer'])


In [24]:
print(tfidf_company.shape)
print(tfidf_offer.shape)

(17880, 62561)
(17880, 64119)


## Sparce PCA

In order to reduce the amount of words for ML algorithms it is necessary to perform a reduction of dimensions in the matrices. Therefore and according to the data that we have (with lots of 0) a Sparce PCA was performed.

In [40]:
#### MEMORY ERROR ###

from sklearn.decomposition import SparsePCA

transformer = SparsePCA(n_components=50, random_state=42)
transformer.fit(tfidf_company[0:1000].toarray())
company_transformed = transformer.transform(tfidf_company[0:1000].toarray()) ## Sin subset no hay que pasar el argumento .toarray()



In [41]:
company_transformed

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00050939,  0.01610009,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.00762437,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.00041759,  0.00230736,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00037271,  0.00998639,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [42]:
#### MEMORY ERROR ###

transformer.fit(tfidf_offer[0:1000].toarray())
offer_transformed = transformer.transform(tfidf_offer[0:1000].toarray())



## ML Models

In [63]:
from sklearn.model_selection import train_test_split

#X = pd.concat([pd.DataFrame(company_transformed), pd.DataFrame(offer_transformed)], axis=1) 
## Al hacer el split con 1000 me da X must be non negative
X = pd.concat([pd.DataFrame(tfidf_company), pd.DataFrame(tfidf_offer)], axis=1)
y = text['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [64]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    #logloss      = log_loss(y_test, y_pred)   # SVC & LinearSVC unable to use cvs

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             #'logloss'      : [logloss]
                            })   # timetaken: to be used for comparison later
    return df_model

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC

In [65]:
models = {'gnb': GaussianNB(),
          'bnb': BernoulliNB(),
          'mnb': MultinomialNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'svc': SVC(probability=True),
          'linearsvc': LinearSVC()
         }

models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])

ValueError: setting an array element with a sequence.

models_df