**NLP-Based Fake Job Posting Detection with PySpark**

Task

Use the attached dataset with NLP to create an algorithm that automatically flags suspicious job posts for review.

In [None]:
# First let's create our PySpark instance
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Fakejobpost").getOrCreate()

# Report the parallelism Spark will use by default. The previous expression,
# `spark._jsc.sc().getExecutorMemoryStatus().keySet().size()`, went through the
# private JVM handle and counted executors/block managers rather than cores,
# so the "core(s)" label was misleading.
# NOTE(review): in local mode `defaultParallelism` corresponds to the cores
# available to Spark; on a cluster it depends on the scheduler -- confirm if
# this notebook is ever run outside local mode.
cores = spark.sparkContext.defaultParallelism
print("You are working with", cores, "core(s)")
spark


You are working with 1 core(s)


In [None]:
from pyspark.ml.feature import * #CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover, vectorassembler
from pyspark.sql.functions import * #col,udf,regexp_replace,isnull
from pyspark.sql.types import * #StringType,IntegerType
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# For pipeline development
from pyspark.ml import Pipeline

In [None]:
# Load the raw job-postings CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
# NOTE(review): the label counts printed later in this notebook contain
# free-text fragments in `fraudulent`, which suggests quoted multi-line fields
# are being split into extra rows by this read. Reading with multiLine=True,
# quote='"', escape='"' would likely fix that -- TODO confirm, and re-check the
# downstream null handling before switching.
data_path = "/content/fake_job_postings.csv"
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(data_path)
df.show()

+------+--------------------+--------------------+----------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+
|job_id|               title|            location|department| salary_range|     company_profile|         description|        requirements|            benefits|       telecommuting|has_company_logo|has_questions|employment_type|required_experience|  required_education|            industry|            function|fraudulent|
+------+--------------------+--------------------+----------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+
|     1|    Marketing Intern|    U

In [None]:
# Print every column's name, inferred type and nullability for the raw DataFrame
df.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- location: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary_range: string (nullable = true)
 |-- company_profile: string (nullable = true)
 |-- description: string (nullable = true)
 |-- requirements: string (nullable = true)
 |-- benefits: string (nullable = true)
 |-- telecommuting: string (nullable = true)
 |-- has_company_logo: string (nullable = true)
 |-- has_questions: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- required_experience: string (nullable = true)
 |-- required_education: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- function: string (nullable = true)
 |-- fraudulent: string (nullable = true)



In [None]:
# Shape of the data as a (rows, columns) tuple; count() triggers a Spark job
n_rows = df.count()
n_columns = len(df.columns)
print((n_rows, n_columns))

(17880, 18)


In [None]:
# Number of null-or-NaN entries per column, computed in one aggregation
null_or_nan_counts = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in df.columns
]
df.select(null_or_nan_counts).show()


+------+-----+--------+----------+------------+---------------+-----------+------------+--------+-------------+----------------+-------------+---------------+-------------------+------------------+--------+--------+----------+
|job_id|title|location|department|salary_range|company_profile|description|requirements|benefits|telecommuting|has_company_logo|has_questions|employment_type|required_experience|required_education|industry|function|fraudulent|
+------+-----+--------+----------+------------+---------------+-----------+------------+--------+-------------+----------------+-------------+---------------+-------------------+------------------+--------+--------+----------+
|     0|    0|     346|     11547|       15011|           3308|          1|        2573|    6966|           89|              29|           30|           3292|               6723|              7748|    4831|    6317|       176|
+------+-----+--------+----------+------------+---------------+-----------+------------+----

In [None]:
# describe() only builds a lazy summary DataFrame (the bare expression just
# echoes its schema); show() is needed to actually compute and display the
# summary statistics.
df.describe().show()

DataFrame[summary: string, job_id: string, title: string, location: string, department: string, salary_range: string, company_profile: string, description: string, requirements: string, benefits: string, telecommuting: string, has_company_logo: string, has_questions: string, employment_type: string, required_experience: string, required_education: string, industry: string, function: string, fraudulent: string]

## PreProcessing

In [None]:
# Distribution of raw values in the label column, most frequent first
label_counts = df.groupBy("fraudulent").count()
label_counts.orderBy(desc("count")).show()


+--------------------+-----+
|          fraudulent|count|
+--------------------+-----+
|                   0|16080|
|                   1|  886|
|                NULL|  176|
|           Full-time|   73|
|Hospital & Health...|   55|
|   Bachelor's Degree|   53|
|         Engineering|   26|
| perform quality ...|   17|
|         Unspecified|   15|
|    Mid-Senior level|   15|
|           Associate|   14|
|               Sales|   14|
|Information Techn...|   13|
|           Marketing|   13|
| passionate about...|   13|
|            Internet|   12|
|   Computer Software|   12|
|      Not Applicable|   11|
|We offer an excel...|   11|
| además con el fi...|   10|
+--------------------+-----+
only showing top 20 rows



**We need to drop every row whose `fraudulent` value is not 0 or 1.**

In [None]:
# Keep only the rows whose `fraudulent` value is 0 or 1, then re-display the
# label distribution to confirm that only those two classes remain.
valid_label_condition = "fraudulent IN(0,1)"
df = df.where(valid_label_condition)
df.groupBy("fraudulent").count().show()


+----------+-----+
|fraudulent|count|
+----------+-----+
|         0|16080|
|         1|  886|
+----------+-----+



In [None]:
from pyspark.sql.functions import *

# Fraction of null/NaN values per column, computed in a single aggregation.
# (A per-column loop calling df.filter(...).count() / df.count() launches two
# Spark jobs for every column and duplicates the aggregate below.)
NULL_FRACTION_THRESHOLD = 0.2

null_counts = df.select([
    (count(when(isnan(c) | col(c).isNull(), c)) / count(lit(1))).alias(c)
    for c in df.columns
])
null_counts.show()

# The aggregate has exactly one row; bring it to the driver and keep the
# columns whose null fraction exceeds the threshold. A fraction can come back
# as None (e.g. division by a zero row count), so such entries are skipped.
null_fractions = null_counts.first().asDict()
cols_to_remove = [
    c for c in df.columns
    if null_fractions[c] is not None and null_fractions[c] > NULL_FRACTION_THRESHOLD
]

# Print the columns with more than 20% null values
print("Columns with more than 20% null values:", cols_to_remove)


+------+-----+-------------------+------------------+------------------+------------------+-----------+-------------------+------------------+-------------+----------------+-------------+-------------------+-------------------+------------------+------------------+-------------------+----------+
|job_id|title|           location|        department|      salary_range|   company_profile|description|       requirements|          benefits|telecommuting|has_company_logo|has_questions|    employment_type|required_experience|required_education|          industry|           function|fraudulent|
+------+-----+-------------------+------------------+------------------+------------------+-----------+-------------------+------------------+-------------+----------------+-------------+-------------------+-------------------+------------------+------------------+-------------------+----------+
|   0.0|  0.0|0.01986325592361193|0.6506542496758222|0.8403866556642697|0.1889661676293764|        0.0|0.1515

In [None]:
# Start the cleaned DataFrame by dropping every column listed in cols_to_remove
df_clean=df.drop(*cols_to_remove)

In [None]:
# Preview the remaining columns (show() prints the first 20 rows, truncated)
df_clean.show()

+------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+----------+
|job_id|               title|            location|     company_profile|         description|        requirements|telecommuting|has_company_logo|has_questions|employment_type|fraudulent|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+----------+
|     1|    Marketing Intern|    US, NY, New York|We're Food52, and...|Food52, a fast-gr...|Experience with c...|            0|               1|            0|          Other|         0|
|     2|Customer Service ...|      NZ, , Auckland|90 Seconds, the w...|Organised - Focus...|What we expect fr...|            0|               1|            0|      Full-time|         0|
|     3|Commissioning Mac...|       US, IA, Wever|Valor Services pr...

**We still have columns with missing values that need to be handled.**

In [None]:
# Handle the remaining columns that still contain nulls:
# location, company_profile and requirements get the placeholder "unspecified",
# while a missing employment_type is filled with "other".
null_replacements = {
    "location": "unspecified",
    "company_profile": "unspecified",
    "requirements": "unspecified",
    "employment_type": "other",
}
df_clean = df_clean.fillna(null_replacements)

# show the resulting dataframe
df_clean.show()

+------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+----------+
|job_id|               title|            location|     company_profile|         description|        requirements|telecommuting|has_company_logo|has_questions|employment_type|fraudulent|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+----------+
|     1|    Marketing Intern|    US, NY, New York|We're Food52, and...|Food52, a fast-gr...|Experience with c...|            0|               1|            0|          Other|         0|
|     2|Customer Service ...|      NZ, , Auckland|90 Seconds, the w...|Organised - Focus...|What we expect fr...|            0|               1|            0|      Full-time|         0|
|     3|Commissioning Mac...|       US, IA, Wever|Valor Services pr...

In [None]:
# Re-count null/NaN entries per column to confirm the fill left none behind
remaining_null_counts = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in df_clean.columns
]
df_clean.select(remaining_null_counts).show()

+------+-----+--------+---------------+-----------+------------+-------------+----------------+-------------+---------------+----------+
|job_id|title|location|company_profile|description|requirements|telecommuting|has_company_logo|has_questions|employment_type|fraudulent|
+------+-----+--------+---------------+-----------+------------+-------------+----------------+-------------+---------------+----------+
|     0|    0|       0|              0|          0|           0|            0|               0|            0|              0|         0|
+------+-----+--------+---------------+-----------+------------+-------------+----------------+-------------+---------------+----------+



In [None]:
# Ten most frequent employment_type values, most frequent first
employment_type_counts = df_clean.groupBy("employment_type").count()
employment_type_counts.orderBy(desc("count")).show(10)

+--------------------+-----+
|     employment_type|count|
+--------------------+-----+
|           Full-time|10923|
|               other| 3273|
|            Contract| 1496|
|           Part-time|  709|
|           Temporary|  233|
|               Other|  210|
|              France|   16|
| the London Inter...|   10|
| have full-time a...|    9|
| have full-time a...|    9|
+--------------------+-----+
only showing top 10 rows



In [None]:
# Collapse every employment_type outside the four recognised categories into
# the single bucket "other"; recognised values are kept unchanged.
kept_employment_types = ["Full-time", "Contract", "Part-time", "Temporary"]
is_unrecognised = ~df_clean.employment_type.isin(kept_employment_types)
df_clean = df_clean.withColumn(
    "employment_type",
    when(is_unrecognised, "other").otherwise(df_clean.employment_type),
)

In [None]:
# Re-check the employment_type distribution after collapsing rare values
df_clean.groupBy("employment_type").count().orderBy(desc("count")).show(10)

+---------------+-----+
|employment_type|count|
+---------------+-----+
|      Full-time|10923|
|          other| 3605|
|       Contract| 1496|
|      Part-time|  709|
|      Temporary|  233|
+---------------+-----+



In [None]:
# Inspect the column types of the cleaned DataFrame
df_clean.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- location: string (nullable = true)
 |-- company_profile: string (nullable = true)
 |-- description: string (nullable = true)
 |-- requirements: string (nullable = true)
 |-- telecommuting: string (nullable = true)
 |-- has_company_logo: string (nullable = true)
 |-- has_questions: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- fraudulent: string (nullable = true)



**Validating data types**

In [None]:
# Binary flag columns (and the label) that must be numeric for modelling
integer_columns = ['has_company_logo','fraudulent','telecommuting','has_questions']

# Cast each listed column to integer unless it already has that type.
# The type test uses isinstance against IntegerType: a DataType object never
# compares equal to the string 'integer', so a `!= 'integer'` check is always
# true and never actually skips anything.
for col_name in integer_columns:
    if not isinstance(df_clean.schema[col_name].dataType, IntegerType):
        # NOTE(review): values that do not parse as integers presumably become
        # null after this cast -- consider re-checking nulls afterwards.
        df_clean = df_clean.withColumn(col_name, col(col_name).cast('integer'))

# Confirm the resulting column types
df_clean.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- location: string (nullable = true)
 |-- company_profile: string (nullable = true)
 |-- description: string (nullable = true)
 |-- requirements: string (nullable = true)
 |-- telecommuting: integer (nullable = true)
 |-- has_company_logo: integer (nullable = true)
 |-- has_questions: integer (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- fraudulent: integer (nullable = true)



**Checking for duplicates**

In [None]:

# Group by all columns and count the number of occurrences
# NOTE(review): job_id is part of the grouping key, so postings that are
# identical except for their id are not reported here -- confirm that is intended.
duplicate_counts = df_clean.groupBy(df_clean.columns).count()

# Filter for rows where count is greater than 1, indicating duplicates
duplicates = duplicate_counts.filter(duplicate_counts['count'] > 1)

# count() is a Spark action that re-runs the whole aggregation each time it is
# called, so evaluate it once and reuse the result.
n_duplicate_groups = duplicates.count()

if n_duplicate_groups > 0:
    print(f"There are {n_duplicate_groups} duplicates in the DataFrame.")
    duplicates.show()
else:
    print("No duplicates found in the DataFrame.")

No duplicates found in the DataFrame.


## Text Cleaning

**extract country from the whole location**

In [None]:
# Keep only the text before the first comma of `location`
location_parts = split(col('location'), ',')
df_clean = df_clean.withColumn('location', location_parts.getItem(0))

In [None]:
# Number of postings per extracted location value (first 20 groups, unordered)
df_clean.groupBy("location").count().show()


+--------+-----+
|location|count|
+--------+-----+
|      LT|   23|
|      FI|   29|
|      UA|   13|
|      RO|   44|
|      ZM|    2|
|      NL|  124|
|      PL|   75|
|      AM|    2|
|      MX|    8|
|      EE|   68|
|      CN|   13|
|      AT|   14|
|      RU|   16|
|      IQ|    8|
|      HR|    1|
|      SV|    1|
|      CZ|    6|
|      PT|   14|
|      GH|    1|
|      HK|   77|
+--------+-----+
only showing top 20 rows



In [None]:
# Free-text columns that go through regex cleaning and tokenization below
columns_to_clean = ["title", "company_profile", "description", "requirements"]

In [None]:
# Show five sample rows of the text columns before cleaning.
# truncate=False prints the full text of each cell and vertical=True lists one
# field per line so long values stay readable.
df_clean.select(*columns_to_clean).show(5, truncate=False,vertical=True)


-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Regular expressions used to normalise the free-text columns
# (evaluated by Spark's regexp_replace).
html_tags = r"<.*?>"             # HTML/XML tags
url_pattern = r"#\w+|http\S+"    # hashtag-style tokens and http(s) URLs
symbol_pattern = r"[^A-Za-z ]+"  # anything that is not an ASCII letter or a space
# Fixed: r"\\s+" matched a literal backslash followed by "s", never whitespace.
spaces_pattern = r"\s+"          # runs of whitespace


# Loop through the columns and clean them
for column in columns_to_clean:
    # Strip HTML tags first: once the symbol pattern has removed "<" and ">",
    # the tag pattern can no longer match and tag names would stay in the text.
    cleaned = regexp_replace(col(column), html_tags, " ")

    # Remove URLs and hashtag-style tokens
    cleaned = regexp_replace(cleaned, url_pattern, " ")

    # Replace digits, punctuation and other non-letter characters with a space
    # (not the empty string) so words separated only by punctuation or line
    # breaks are not glued together.
    cleaned = regexp_replace(cleaned, symbol_pattern, " ")

    # Collapse whitespace runs to a single space and trim both ends
    cleaned = trim(regexp_replace(cleaned, spaces_pattern, " "))

    # Convert all text to lowercase
    df_clean = df_clean.withColumn(column, lower(cleaned))



In [None]:
# Show the same five sample rows again to inspect the effect of the cleaning
df_clean.select(*columns_to_clean).show(5, truncate=False,vertical=True)


-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
df_tokenized = df_clean
# Tokenize each cleaned text column into a new "<column>_tokens" array column.
# The pattern is written as a raw string ("\W" in a normal string literal is
# an invalid escape sequence) and uses "\W+" so a run of non-word characters
# counts as a single separator.
for column in columns_to_clean:
    tokenizer = RegexTokenizer(inputCol=column, outputCol=column + "_tokens", pattern=r"\W+")
    df_tokenized = tokenizer.transform(df_tokenized)

# Show the resulting DataFrame with tokenized columns
df_tokenized.select(["title_tokens", "company_profile_tokens", "description_tokens", "requirements_tokens"]).show(5, truncate=False,vertical=True)



-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Confirm the four *_tokens array columns were added
df_tokenized.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- location: string (nullable = true)
 |-- company_profile: string (nullable = true)
 |-- description: string (nullable = true)
 |-- requirements: string (nullable = true)
 |-- telecommuting: integer (nullable = true)
 |-- has_company_logo: integer (nullable = true)
 |-- has_questions: integer (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- fraudulent: integer (nullable = true)
 |-- title_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- company_profile_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- requirements_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [None]:
# Drop the original text columns; only their tokenized versions are kept
df_tokenized=df_tokenized.drop("title", "company_profile", "description", "requirements")

In [None]:
# Schema after removing the raw text columns
df_tokenized.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- telecommuting: integer (nullable = true)
 |-- has_company_logo: integer (nullable = true)
 |-- has_questions: integer (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- fraudulent: integer (nullable = true)
 |-- title_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- company_profile_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- requirements_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [None]:
df_filterd = df_tokenized
# Remove stop words from each token column, writing the result to a new
# "<column>_filtered" column.
cols = ["title_tokens","company_profile_tokens","description_tokens","requirements_tokens"]
# The loop variable must not be called `col`: that would rebind the
# pyspark.sql.functions `col` helper (pulled in by the wildcard import) to a
# plain string and break every cell that calls col(...) when re-run.
for token_column in cols:
    remover = StopWordsRemover(inputCol=token_column, outputCol=token_column + "_filtered")
    df_filterd = remover.transform(df_filterd)

In [None]:
# Confirm the four *_tokens_filtered columns were added
df_filterd.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- telecommuting: integer (nullable = true)
 |-- has_company_logo: integer (nullable = true)
 |-- has_questions: integer (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- fraudulent: integer (nullable = true)
 |-- title_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- company_profile_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- requirements_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- company_profile_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description_tokens_filtered: array (nullable = true)
 |    |-- element: string (con

In [None]:
# Show five full rows (all columns) after stop-word removal, one field per line
df_filterd.show(5, truncate=False,vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Drop the unfiltered token columns; only the stop-word-filtered versions are kept
df_filterd=df_filterd.drop("title_tokens", "company_profile_tokens", "description_tokens", "requirements_tokens")

In [None]:
# Schema after removing the unfiltered token columns
df_filterd.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- telecommuting: integer (nullable = true)
 |-- has_company_logo: integer (nullable = true)
 |-- has_questions: integer (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- fraudulent: integer (nullable = true)
 |-- title_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- company_profile_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- requirements_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)



## Feature vectorization

We will:

Use string indexer to vectorize the employment_type column, location column.

Use:
Hashing TF Vectorizer
TFIDF Vectorizer
Word2Vec Vectorizer
to vectorize the text columns (title, company_profile, description, requirements)

In [None]:
# Categorical string columns to encode as numeric indices
columns_to_vectorize = ["employment_type", "location"]

# One StringIndexer per column, each writing to "<column>_index"
indexers = []
for categorical_name in columns_to_vectorize:
    indexers.append(
        StringIndexer(inputCol=categorical_name, outputCol=categorical_name + "_index")
    )

# Fit every indexer on the filtered DataFrame first ...
indexer_models = list(map(lambda indexer: indexer.fit(df_filterd), indexers))

# ... then apply the fitted models one after another
df_indexed = df_filterd
for indexer_model in indexer_models:
    df_indexed = indexer_model.transform(df_indexed)


In [None]:
# Confirm the two *_index columns were added
df_indexed.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- telecommuting: integer (nullable = true)
 |-- has_company_logo: integer (nullable = true)
 |-- has_questions: integer (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- fraudulent: integer (nullable = true)
 |-- title_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- company_profile_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- requirements_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- employment_type_index: double (nullable = false)
 |-- location_index: double (nullable = false)



In [None]:
# Drop the original string columns (employment_type, location) now that their
# numeric *_index counterparts exist
df_indexed = df_indexed.drop(*columns_to_vectorize)


In [None]:
# Schema after removing the original categorical string columns
df_indexed.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- telecommuting: integer (nullable = true)
 |-- has_company_logo: integer (nullable = true)
 |-- has_questions: integer (nullable = true)
 |-- fraudulent: integer (nullable = true)
 |-- title_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- company_profile_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- requirements_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- employment_type_index: double (nullable = false)
 |-- location_index: double (nullable = false)



In [None]:
# Column names at positions 5-8, which hold the four filtered token columns
df_indexed.columns[5:9]

['title_tokens_filtered',
 'company_profile_tokens_filtered',
 'description_tokens_filtered',
 'requirements_tokens_filtered']

In [None]:
# Select the filtered token columns by name rather than by position
# (columns[5:9]), so the selection keeps working if the column order changes.
text_columns = [c for c in df_indexed.columns if c.endswith("_tokens_filtered")]
df_htf = df_indexed
# Apply hashing term-frequency vectorization to each token column. This is
# only the TF step; the IDF weighting is a separate, later step.
for column in text_columns:
    # Hashing TF: token array -> term-frequency vector
    hashingTF = HashingTF(inputCol=column, outputCol=column+"_raw_features")
    df_htf = hashingTF.transform(df_htf)
    # The token array is no longer needed once it has been vectorized
    df_htf = df_htf.drop(column)




In [None]:
# Confirm the token columns were replaced by *_raw_features vectors
df_htf.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- telecommuting: integer (nullable = true)
 |-- has_company_logo: integer (nullable = true)
 |-- has_questions: integer (nullable = true)
 |-- fraudulent: integer (nullable = true)
 |-- employment_type_index: double (nullable = false)
 |-- location_index: double (nullable = false)
 |-- title_tokens_filtered_raw_features: vector (nullable = true)
 |-- company_profile_tokens_filtered_raw_features: vector (nullable = true)
 |-- description_tokens_filtered_raw_features: vector (nullable = true)
 |-- requirements_tokens_filtered_raw_features: vector (nullable = true)



In [None]:
# The last four columns are the hashing-TF vectors produced above
df_htf.columns[-4:]

['title_tokens_filtered_raw_features',
 'company_profile_tokens_filtered_raw_features',
 'description_tokens_filtered_raw_features',
 'requirements_tokens_filtered_raw_features']

In [None]:
# TF-IDF: rescale each hashing-TF vector by inverse document frequency
df_IDF = df_htf
# Select the term-frequency columns by name rather than by position ([-4:]).
cols = [c for c in df_htf.columns if c.endswith("_raw_features")]
# The loop variable must not be called `col`: that would rebind the
# pyspark.sql.functions `col` helper to a plain string for the rest of the
# session.
# NOTE(review): the IDF weights are fitted on all rows, before the train/test
# split performed later in the notebook, so test rows influence the weights --
# consider fitting on the training split only.
for tf_column in cols:
    idf = IDF(inputCol=tf_column, outputCol=tf_column + "_tfidf")
    idfModel = idf.fit(df_htf)
    df_IDF = idfModel.transform(df_IDF)
    # Keep only the TF-IDF vector, not the raw term frequencies
    df_IDF = df_IDF.drop(tf_column)



In [None]:
# Confirm the raw TF vectors were replaced by *_tfidf vectors
df_IDF.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- telecommuting: integer (nullable = true)
 |-- has_company_logo: integer (nullable = true)
 |-- has_questions: integer (nullable = true)
 |-- fraudulent: integer (nullable = true)
 |-- employment_type_index: double (nullable = false)
 |-- location_index: double (nullable = false)
 |-- title_tokens_filtered_raw_features_tfidf: vector (nullable = true)
 |-- company_profile_tokens_filtered_raw_features_tfidf: vector (nullable = true)
 |-- description_tokens_filtered_raw_features_tfidf: vector (nullable = true)
 |-- requirements_tokens_filtered_raw_features_tfidf: vector (nullable = true)



In [None]:
# Exclude the indexed location column from the model features.
# TODO (original author's note): handle this above, i.e. earlier in the notebook,
# rather than indexing the column and then dropping it here.
df_IDF=df_IDF.drop('location_index')

In [None]:
# Drop job_id so it is not used as a model feature, then show the final schema
df_IDF=df_IDF.drop('job_id')
df_IDF.printSchema()

root
 |-- telecommuting: integer (nullable = true)
 |-- has_company_logo: integer (nullable = true)
 |-- has_questions: integer (nullable = true)
 |-- fraudulent: integer (nullable = true)
 |-- employment_type_index: double (nullable = false)
 |-- title_tokens_filtered_raw_features_tfidf: vector (nullable = true)
 |-- company_profile_tokens_filtered_raw_features_tfidf: vector (nullable = true)
 |-- description_tokens_filtered_raw_features_tfidf: vector (nullable = true)
 |-- requirements_tokens_filtered_raw_features_tfidf: vector (nullable = true)



In [None]:
# Build the modelling dataset from the TF-IDF DataFrame.
# Every column except the label is used as a feature.
features_list = list(df_IDF.columns)
label_position = features_list.index('fraudulent')
del features_list[label_position]

# Combine all feature columns into a single 'features' vector; with
# handleInvalid="skip", rows containing invalid (e.g. null) values are dropped.
assembler = VectorAssembler(
    inputCols=features_list,
    outputCol='features',
    handleInvalid="skip",
)
assembled = assembler.transform(df_IDF)
output_tfidf = assembled.select('features', 'fraudulent')

# Tag the DataFrame object with a plain Python attribute naming the feature set
output_tfidf.name = 'tfidf'


In [None]:
# The modelling dataset has just two columns: the feature vector and the label
output_tfidf.printSchema()

root
 |-- features: vector (nullable = true)
 |-- fraudulent: integer (nullable = true)



## model

In [None]:
# Split the data into training and testing sets (roughly 80% for training,
# 20% for testing); the fixed seed is passed so the split can be reproduced.
train_data, test_data = output_tfidf.randomSplit([0.8, 0.2], seed=42)


In [None]:
label_col = "fraudulent"

# Define the list of classifiers to try
classifiers = [
    RandomForestClassifier(labelCol=label_col, featuresCol="features"),
    LogisticRegression(labelCol=label_col, featuresCol="features"),
    NaiveBayes(labelCol=label_col, featuresCol="features")
]

# The evaluator must be told the label column explicitly: its default is
# "label", which does not exist in this dataset (the label is "fraudulent"),
# so a default-constructed evaluator cannot score the predictions.
evaluator = BinaryClassificationEvaluator(labelCol=label_col, metricName="areaUnderROC")

# Initialize variables to store the best model and its AUC score
best_model = None
best_auc = 0.0

# Loop through classifiers and perform cross-validation
for classifier in classifiers:
    # Define a parameter grid for hyperparameter tuning (customize as needed);
    # an empty builder yields a single parameter map of defaults.
    param_grid = ParamGridBuilder().build()

    # Create a cross-validator; the seed makes the fold assignment repeatable
    crossval = CrossValidator(estimator=classifier,
                              estimatorParamMaps=param_grid,
                              evaluator=evaluator,
                              numFolds=5,  # You can adjust the number of folds
                              seed=42)

    # Fit the model
    cv_model = crossval.fit(train_data)

    # Mean cross-validated AUC for the (single) parameter map
    auc = cv_model.avgMetrics[0]
    print(f"{type(classifier).__name__}: mean CV AUC = {auc}")

    # Check if this model is the best so far
    if auc > best_auc:
        best_auc = auc
        best_model = cv_model.bestModel

# Print the best model and its AUC
print("Best Model:")
print(best_model)
print(f"Best AUC: {best_auc}")

# Score the selected model on the held-out test split, which is otherwise
# never used, to get an estimate that is independent of model selection.
if best_model is not None:
    test_auc = evaluator.evaluate(best_model.transform(test_data))
    print(f"Test AUC: {test_auc}")
