In [5]:
import nltk
# Fetch the VADER lexicon into the local NLTK data directory; the call
# reports "already up-to-date" when the package is present.
# NOTE(review): the analyzer used later in this notebook is imported from the
# standalone `vaderSentiment` package, not `nltk.sentiment` — confirm this
# download is still required.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [6]:
# Imports
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Read the cleaned dataset from the processed-data folder into `df`
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_data.csv")


In [7]:
# Combine multiple text columns into one.
# Missing values are replaced with empty strings before joining: string
# concatenation in pandas propagates NaN, so a single missing field (for
# example an empty department) would otherwise make the whole combined text
# NaN, and a later `.astype(str)` would turn it into the literal "nan".
text_columns = ['title', 'location', 'department', 'description', 'requirements']
text_parts = df[text_columns].fillna('').astype(str)
df['all_text'] = text_parts[text_columns[0]].str.cat(
    [text_parts[col] for col in text_columns[1:]],
    sep=" ",
)


In [9]:
# TextBlob sentiment polarity
def get_textblob_sentiment(text):
    """Return the ``sentiment.polarity`` value TextBlob computes for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Cast to str first so every value handed to TextBlob is a string.
df['textblob_polarity'] = (
    df['all_text']
    .astype(str)
    .apply(get_textblob_sentiment)
)



In [10]:
# One analyzer instance, shared by every call to get_vader_sentiment.
vader = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    """Score ``text`` with the module-level ``vader`` analyzer.

    Returns a pandas Series holding, in this order, the 'neg', 'neu', 'pos'
    and 'compound' entries of ``vader.polarity_scores(text)``.
    """
    scores = vader.polarity_scores(text)
    return pd.Series([scores[key] for key in ('neg', 'neu', 'pos', 'compound')])

# Each row's four scores are spread across four new columns, in the same order.
df[['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']] = (
    df['all_text']
    .astype(str)
    .apply(get_vader_sentiment)
)



In [12]:
df['all_text'] = df['all_text'].astype(str)  # Ensure all values are strings

keywords = ['money', 'earn', 'click', 'investment', 'urgent', 'opportunity', 'work from home']

# Lowercase the text once instead of once per keyword (loop-invariant work).
all_text_lower = df['all_text'].str.lower()

for word in keywords:
    # Literal (regex=False) substring test on the lowercased text, stored as
    # 0/1 integers in a column named `keyword_<word>`.
    # NOTE(review): this is substring matching, so 'earn' also flags texts that
    # only contain 'learn' — confirm whether word-boundary matching is wanted.
    df[f'keyword_{word}'] = all_text_lower.str.contains(word, regex=False).astype('int64')


In [13]:
# Write the frame, including the new feature columns, to CSV without the index
df.to_csv(path_or_buf="../data/processed/featured_data.csv", index=False)
print("✅ Feature-engineered data saved!")


✅ Feature-engineered data saved!
