In [14]:
#install dependencies
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
import scipy.sparse as sp

In [15]:
#upload the csv file 
data = pd.read_csv('fake_job_postings.csv', index_col="job_id")


data.head()


Unnamed: 0_level_0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [17]:
# Initialize necessary tools
lemmatizer = WordNetLemmatizer()
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
encoder = OneHotEncoder(sparse_output=False)

In [18]:
# Define relevant text columns and categorical columns
text_columns = ['description', 'requirements', 'company_profile']
categorical_columns = ['employment_type', 'required_experience', 'required_education']


In [19]:
# Text cleaning and lemmatization function
def clean_and_lemmatize(text):
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize words
    return ' '.join(words)

In [20]:
# Apply cleaning function to text columns
for col in text_columns:
    data[col] = data[col].fillna('').apply(clean_and_lemmatize)

In [21]:
# Create combined text field for TF-IDF vectorization
data["combined_text"] = data["description"] + " " + data["requirements"] + " " + data["company_profile"]


In [22]:
# Apply TF-IDF vectorization
tfidf_matrix = tfidf_vectorizer.fit_transform(data["combined_text"])

# Encode categorical columns and concatenate them with the TF-IDF matrix
encoded_columns = encoder.fit_transform(data[categorical_columns])
encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out())

# Combine TF-IDF matrix and encoded categorical columns
X = sp.hstack([tfidf_matrix, sp.csr_matrix(encoded_columns)])


In [23]:
# Check the shapes to confirm processing
print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("Encoded categorical data shape:", encoded_columns.shape)
print("Combined feature matrix shape:", X.shape)

TF-IDF matrix shape: (17880, 500)
Encoded categorical data shape: (17880, 28)
Combined feature matrix shape: (17880, 528)


In [24]:
data.head()

Unnamed: 0_level_0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,combined_text
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,Marketing Intern,"US, NY, New York",Marketing,,were food and weve created a groundbreaking an...,food a fastgrowing james beard awardwinning on...,experience with content management system a ma...,,0,1,0,Other,Internship,,,Marketing,0,food a fastgrowing james beard awardwinning on...
2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,second the world cloud video production servic...,organised focused vibrant awesomedo you have a...,what we expect from youyour key responsibility...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,organised focused vibrant awesomedo you have a...
3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,valor service provides workforce solution that...,our client located in houston is actively seek...,implement precommissioning and commissioning p...,,0,1,0,,,,,,0,our client located in houston is actively seek...
4,Account Executive - Washington DC,"US, DC, Washington",Sales,,our passion for improving quality of life thro...,the company esri environmental system research...,education bachelor or master in gi business ad...,Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,the company esri environmental system research...
5,Bill Review Manager,"US, FL, Fort Worth",,,spotsource solution llc is a global human capi...,job title itemization review managerlocation f...,qualificationsrn license in the state of texas...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,job title itemization review managerlocation f...


In [26]:
# Convert the TF-IDF sparse matrix to a dense format and create a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Save the TF-IDF DataFrame as a CSV file
tfidf_df.to_csv('tfidf_matrix.csv', index=False)  # Save to the desired location