In [11]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

train_data_path = 'train1.csv'  # Replace with your file path
test_data_path = 'test1.csv'    # Replace with your file path
val_data_path = 'test_validation.csv'  # Replace with your file path

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
val_data = pd.read_csv(val_data_path)

# Parameters for tokenization
max_words = 10000  # Consider only the top 10,000 words in the dataset

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data['job description'])

# Convert the text in each dataset to sequences
train_sequences = tokenizer.texts_to_sequences(train_data['job description'])
test_sequences = tokenizer.texts_to_sequences(test_data['job description'])
val_sequences = tokenizer.texts_to_sequences(val_data['job description'])

max_len = 100  # Cut off after 100 words

# Extract labels before overwriting the train and validation data
train_labels = np.asarray(train_data['label'].astype('float32'))
val_labels = np.asarray(val_data['label'].astype('float32'))

# Pad sequences
padded_train_data = pad_sequences(train_sequences, maxlen=max_len)
padded_test_data = pad_sequences(test_sequences, maxlen=max_len)
padded_val_data = pad_sequences(val_sequences, maxlen=max_len)


In [5]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=32, input_length=max_len)) # Embedding layer
model.add(LSTM(32))  # LSTM layer with 32 units
model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid activation for binary classification

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(padded_train_data, train_labels, epochs=100, batch_size=64, validation_data=(padded_val_data, val_labels))



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [6]:
# Load test data
test_data_path = 'test1.csv'  # Replace with your file path
test_data = pd.read_csv(test_data_path)

# Tokenize and pad the test data
test_sequences = tokenizer.texts_to_sequences(test_data['job description'])
padded_test_data = pad_sequences(test_sequences, maxlen=max_len)

predicted_labels = model.predict(padded_test_data)
predicted_labels = (predicted_labels > 0.5).astype('int32')  # Convert probabilities to 0 or 1

val_data_path = 'test_validation.csv'  # Replace with your file path
validation_data = pd.read_csv(val_data_path)
actual_labels = np.asarray(validation_data['label'].astype('int32'))

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate metrics
accuracy = accuracy_score(val_labels, predicted_labels)
precision = precision_score(val_labels, predicted_labels)
recall = recall_score(val_labels, predicted_labels)
f1 = f1_score(val_labels, predicted_labels)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Accuracy: 0.9523809523809523
Precision: 0.9411764705882353
Recall: 1.0
F1 Score: 0.9696969696969697


In [14]:
def predict_job_offer(model, tokenizer, job_offer_text, max_len=100):
    # Tokenize and pad the job offer
    sequence = tokenizer.texts_to_sequences([job_offer_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    # Predict
    prediction = model.predict(padded_sequence)
    predicted_label = (prediction > 0.5).astype('int32')

    # Interpret the prediction
    if predicted_label[0][0] == 1:
        return "Dangerous/Scam"
    else:
        return "Safe"

# job offer from 999
new_job_offer = "Work in Poland at the poultry meat plant *We employ unqualified persons with biometric passport *Primim women, barabti and pairs *The employment is made legally by a 3-month employment contract *It is possible to extend ducoments up to 3 years *Salary is stable every month + weekly advances *The accommodation is free *It is possible to live only one pair in the room *Hrana 3 times a day for free *Work consists of wrapping and cutting meat *Salary from 20-23 zlotys/hour netto for cutting and slicing products *Salary: 0,20zlot/kg for butchers with environmental experience 1000zlot/monthly *Average monthly salary from 1300-1500€ *Temperature at work +10 *It is worked 10-12 hours a day for months until Saturday (saturday at the wish) *To leave only biometric or European passport we will pay and the road. *Moldova-Poland transport paid by the employer *Contract directly to the employer, you pay nothing for employment you pay nothing for departure Zero expenses - 100% guarantee% The thing is certain because it is legal by contract of employment and you do not have to pay anything . For more details call this phone number"

result = predict_job_offer(model, tokenizer, new_job_offer, max_len)
print(f"The job offer is predicted to be: {result}")


The job offer is predicted to be: Dangerous/Scam
