# Mini Project

## Assignment 5

Daniel Chang

### Data

Data came from kaggle.

### Research question

Can we predict if a job posting is fraudulent or not.

In [1]:
import numpy as np
import pandas as pd

import re
import string

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
jobs = pd.read_csv('fake_job_postings.csv')
jobs.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [3]:
jobs.isna().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [4]:
jobs_trimmed = pd.DataFrame()
jobs_trimmed['text'] = jobs['title'] + ' ' + jobs['description']
jobs_trimmed['fraud'] = jobs['fraudulent']
jobs_trimmed = jobs_trimmed[jobs_trimmed['text'].str.len() > 60]
jobs_trimmed.shape

(17861, 2)

In [5]:
def clean(text):
    sw = set(stopwords.words('english'))
    text = re.sub(r'<[^<>]*>', ' ', text)
    text_list = text.split()
    text_words = []

    punct = set(string.punctuation)

    for word in text_list:
        # remove punctuation marks at the beginning
        # of each word
        while len(word) > 0 and word[0] in punct:
            word = word[1:]
        
        # remove punctuation marks at the end of each word
        while len(word) > 0 and word [-1] in punct:
            word = word[:-1]
        
        # rule to eliminate most urls
        if len(word) > 0 and '/' not in word:
            # elimate stopwords
            if word.lower() not in sw:
                # append the word to the text_words list
                text_words.append(word.lower())
        
        cleaner_text = ' '.join(text_words)

    return cleaner_text

In [6]:
jobs_trimmed['text'] = jobs_trimmed['text'].apply(clean)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(jobs_trimmed['text'] , jobs_trimmed['fraud'], test_size= 0.30)

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline([('tfidf', TfidfVectorizer(stop_words = 'english')),
                 ('mnb', MultinomialNB())])

param_grid = [{'tfidf__min_df': [5, 10, 20],
               'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
               'tfidf__max_features': [None, 1],
               'mnb__alpha': [0, 0.5, 1]}]

grid = GridSearchCV(estimator = pipe, param_grid= param_grid, cv = 5)
grid.fit(X_train, y_train)
grid.best_params_

{'mnb__alpha': 0,
 'tfidf__max_features': None,
 'tfidf__min_df': 5,
 'tfidf__ngram_range': (1, 3)}

In [21]:
grid.best_estimator_.score(X_train, y_train)

0.9969604863221885

In [22]:
grid.best_estimator_.score(X_test, y_test)

0.9759283448404553