Load the dataset from a CSV file into a Pandas DataFrame.

In [None]:
import pandas as pd

# Read 'data.csv' (a relative path, resolved against the current working
# directory) into a DataFrame that the rest of the notebook works from.
data = pd.read_csv('data.csv')

Get a quick overview of the dataset's structure and data types.

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts,
# dtypes, and memory usage.
data.info()

Randomly sample 10% of the data for preliminary analysis.

In [None]:
# Draw a random 10% of the rows; random_state pins the seed so the same rows
# are selected on every run.
# Note: the sample is taken from the raw data, before missing values are dropped.
sample_data = data.sample(frac=0.1, random_state=42)

Clean the data by removing rows with missing values.

In [None]:
# Discard every row that has at least one missing value ...
data_cleaned = data.dropna(axis=0, how='any')
# ... then renumber the surviving rows 0..n-1, discarding the old index.
data_cleaned = data_cleaned.reset_index(drop=True)

Tokenize the text data to split it into individual words.

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt sentence tokenizer. Recent NLTK releases
# (3.9+) load it from the 'punkt_tab' resource, while older releases use the
# 'punkt' models, so fetch both to keep this cell working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Join every row of text_column into one space-separated string and split it
# into individual tokens.
tokens = word_tokenize(data_cleaned['text_column'].str.cat(sep=' '))

Tag each token with its corresponding part of speech (POS).

In [None]:
import nltk
from nltk import pos_tag

# pos_tag needs the averaged-perceptron tagger model, which has to be
# downloaded separately; without it the call raises LookupError. Recent NLTK
# releases (3.9+) look for 'averaged_perceptron_tagger_eng', older ones for
# 'averaged_perceptron_tagger', so fetch both.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Each element of pos_tags is a (token, tag) pair.
pos_tags = pos_tag(tokens)

Visualize the frequency of different part of speech tags.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the tag from each (token, tag) pair; the token itself is not
# needed for the frequency chart.
tag_labels = [tag for _, tag in pos_tags]

# One bar per distinct tag, with bar height equal to its number of occurrences.
sns.countplot(x=tag_labels)
plt.show()

Remove common stopwords that don't contribute much to meaning.

In [None]:
from nltk.corpus import stopwords

nltk.download('stopwords')

# A set gives constant-time membership checks in the filter below.
stop_words = {*stopwords.words('english')}

# Lowercase each token only for the comparison; tokens that survive the
# filter keep their original casing.
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

Analyze the word frequency distribution against Zipf's law.

In [None]:
from collections import Counter

# Count how often each remaining token occurs.
zipf_counts = Counter(filtered_tokens)

# Zipf's law relates a word's frequency to its rank, so the frequencies must
# be sorted from most to least common and plotted against a 1-based rank.
# (Counter.values() is in first-seen order, and a 0-based x value cannot be
# shown on a log axis.)
frequencies = sorted(zipf_counts.values(), reverse=True)
ranks = range(1, len(frequencies) + 1)

plt.loglog(ranks, frequencies)
plt.xlabel('Rank')
plt.ylabel('Frequency')
plt.show()

Identify the most common words in the filtered dataset.

In [None]:
# The ten most frequent tokens as (token, count) pairs, most frequent first.
common_words = zipf_counts.most_common(n=10)

Prepare data by separating features and the target variable.

In [None]:
# The 'target' column is the label to predict ...
y = data_cleaned['target']
# ... and every other column is kept as a feature.
X = data_cleaned.drop(columns='target')

Split the dataset into training and testing sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Train a Naive Bayes model using all predictors.

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on every column of X_train.
# NOTE(review): X_train is data_cleaned minus 'target', so it still appears to
# contain the raw text columns ('text_column', 'job_description') read
# elsewhere in this notebook. MultinomialNB needs numeric features; confirm
# these columns are vectorized/encoded or dropped before this cell runs.
model = MultinomialNB()
model.fit(X_train, y_train)

Train a Naive Bayes model specifically for job descriptions.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

# MultinomialNB expects a 2-D matrix of non-negative numeric features and
# cannot be fit on a column of raw strings, so each job description is first
# turned into bag-of-words counts. Bundling the vectorizer with the classifier
# means the fitted object accepts raw text at predict time as well, e.g.
# job_description_model.predict(X_test['job_description']).
job_description_model = make_pipeline(CountVectorizer(), MultinomialNB())
job_description_model.fit(X_train['job_description'], y_train)

Evaluate the model's performance using the accuracy metric.

In [None]:
from sklearn.metrics import accuracy_score

# Score the all-predictor model on the held-out test split: accuracy is the
# fraction of test rows whose predicted label matches the true label.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Accuracy: {accuracy}')

Identify the top indicators for predicting high/low salary.

In [None]:
# feature_log_prob_ holds the fitted model's log P(feature | class), with one
# row per class and one column per feature.
# NOTE(review): this prints the raw matrix only; nothing here ranks the
# features or attaches feature/class names, so the "top indicators" still
# have to be read off manually — consider sorting per class.
importances = model.feature_log_prob_
print(importances)

Summarize the outcomes and suggest further action based on the results.

In [None]:
# Placeholder summary: the message is a fixed string and does not report any
# of the values computed above.
print('Conclusion: Evaluate findings and implications based on the models trained.')