# Load Library




In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report , confusion_matrix
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt


# Load Data from Dataset

In [4]:
gmail_df = pd.read_csv('gmail_addresses.csv')
# naukri_df = pd.read_csv('large_naukri_data.csv')
job_details_df = pd.read_excel('jobsDetails_20240521090611.xlsx')
linkedin_df = pd.read_csv('LinkedIn_Jobs_Data_India.csv')
glassdoor_df = pd.read_csv('glassdoor job posting.csv')
job_final_df = pd.read_csv('job_final.csv')

Machine Learning

Job Role Classification

1) Data Preprocessing: Preprocess the scraped email addresses and job
 listings data, performing tasks like text cleaning, tokenization, and
 feature extraction.

 2) Model Training: Train a machine learning model (e.g., logistic
 regression, decision trees, or neural networks) to classify email
 addresses based on job roles and domains using the preprocessed data.

 3) Model Evaluation: Evaluate the trained model’s performance using
 appropriate metrics (e.g., accuracy, precision, recall) and fine-tune the
 model as needed.

In [5]:
gmail_df.head()  , gmail_df.isnull().any() , len(gmail_df)

(                  Gmail Address    ID First Name Last Name
 0       anthonylee280@gmail.com  1291    Anthony       Lee
 1  matthewsullivan623@gmail.com  3484    Matthew  Sullivan
 2          garyali683@gmail.com  8769       Gary       Ali
 3     danieltaylor226@gmail.com  3948     Daniel    Taylor
 4   collingonzalez669@gmail.com  6530     Collin  Gonzalez,
 Gmail Address    False
 ID               False
 First Name       False
 Last Name        False
 dtype: bool,
 600)

In [6]:
gmail_df.head(1)

Unnamed: 0,Gmail Address,ID,First Name,Last Name
0,anthonylee280@gmail.com,1291,Anthony,Lee


In [7]:
gmail_df['fullname'] = gmail_df['First Name'] + gmail_df['Last Name']

In [8]:
# Extract domain from email
def extract_domain(email):
    return email.split('@')[-1]

gmail_df['Domain'] = gmail_df['Gmail Address'].apply(extract_domain)

In [None]:
job_details_df.head(1)

Unnamed: 0,Title,Description,Primary Description,Detail URL,Location,Skill,Insight,Job State,Poster Id,Company Name,Company Logo,Created At,Scraped At
0,AI / ML Engineer,Project Role : AI / ML Engineer\n\nProject Rol...,"Accenture in India · Pune, Maharashtra, India ...",https://www.linkedin.com/jobs/view/3931026594,"Pune, Maharashtra, India",9 of 10 skills match your profile - you may be...,"10,001+ employees · IT Services and IT Consulting",LISTED,56056912.0,Accenture in India,https://media.licdn.com/dms/image/D560BAQFLXat...,2024-05-20T19:56:58.000Z,2024-05-21T03:26:14.128Z


In [None]:
linkedin_df.head(1)

Unnamed: 0.1,Unnamed: 0,id,publishedAt,title,companyName,postedTime,applicationsCount,description,contractType,experienceLevel,workType,sector,companyId,city,state,recently_posted_jobs
0,0,3800928381,2024-01-09,Front-end Engineer/Web Developer (SDE 1 &amp; 2),BOX8,3 weeks ago,85.0,"Skill: html5 , ajax , oop , git , postgresql ,...",Full-time,Entry level,Engineering and Information Technology,Staffing and Recruiting,3479477,Bengaluru,Karnataka,No


In [None]:
glassdoor_df.head(1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Big Data & Analytics Consultant,-1,Are you passionate about analytics? Would you ...,3.9,Amazon\n3.9,Jakarta,10000+ Employees,1994,Company - Public,Internet,Information Technology,$10+ billion (USD)


In [None]:
job_final_df.head(1)

Unnamed: 0.1,Unnamed: 0,url,Position,Company,Location,Job_Description
0,0,https://www.glassdoor.co.in/partner/jobListing...,Software Testing Internship,Smart Food Safe Solutions Inc,– Bengaluru,About the company:\nSmart Food Safe Solutions ...


In [None]:
# Select relevent columns

naukri_relevant = naukri_df[['role', 'industry', 'company_name', 'skills']]
jobsDetails_relevant = job_details_df[['Title', 'Company Name']]
linkedin_relevant = linkedin_df[['title', 'companyName']]
glassdoor_relevant = glassdoor_df[['Job Title', 'Company Name']]
job_final_relevant = job_final_df[['Position', 'Company']]
gmail_relevant = gmail_df[['First Name',	'Last Name'	,'fullname'	,'Domain']]

In [None]:
# Rename columns for consistency
naukri_relevant.columns = ['Role', 'Industry', 'Company', 'Skills']
jobsDetails_relevant.columns = ['Role', 'Company']
linkedin_relevant.columns = ['Role', 'Company']
glassdoor_relevant.columns = ['Role', 'Company']
job_final_relevant.columns = ['Role', 'Company']
gmail_relevant.columns = ['First Name' , 'Last Name' , 'Fullname' , 'Domain']

In [None]:
# Concatenate all job listings data
all_jobs_df = pd.concat([gmail_relevant , naukri_relevant, jobsDetails_relevant, linkedin_relevant,
                          glassdoor_relevant, job_final_relevant], ignore_index=True)

In [9]:
gmail_df.head(1)

Unnamed: 0,Gmail Address,ID,First Name,Last Name,fullname,Domain
0,anthonylee280@gmail.com,1291,Anthony,Lee,AnthonyLee,gmail.com


In [None]:
naukri_relevant['Industry'].isnull().any()

True

In [10]:
# Clean text data
def clean_text(text):
    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = text.lower()
    return text

In [22]:

linkedin_df['title'] = linkedin_df['title'].apply(clean_text)
linkedin_df['description'] = linkedin_df['description'].apply(clean_text)

job_details_df['title'] = job_details_df['Title'].apply(clean_text)
job_details_df['description'] = job_details_df['Description'].apply(clean_text)

glassdoor_df['title'] = glassdoor_df['Job Title'].apply(clean_text)
glassdoor_df['description'] = glassdoor_df['Job Description'].apply(clean_text)

job_final_df['title'] = job_final_df['Position'].apply(clean_text)
job_final_df['description'] = job_final_df['Job_Description'].apply(clean_text)

gmail_df['First Name'] = gmail_df['First Name'].apply(clean_text)
gmail_df['Last Name'] = gmail_df['Last Name'].apply(clean_text)
gmail_df['fullname'] = gmail_df['fullname'].apply(clean_text)
gmail_df['Domain'] = gmail_df['Domain'].apply(clean_text)



In [24]:
# Concatenate job titles and descriptions from all datasets
job_titles = pd.concat([ linkedin_df['title'], job_details_df['title'],
                        glassdoor_df['title'], job_final_df['title']], ignore_index=True)

job_descriptions = pd.concat([linkedin_df['description'], job_details_df['description'],
                              glassdoor_df['description'], job_final_df['description']], ignore_index=True)

# Concatenate Gmail dataframe columns
first_name = pd.concat([gmail_df['First Name']], ignore_index=True)
last_name = pd.concat([gmail_df['Last Name']], ignore_index=True)
fullname = pd.concat([gmail_df['fullname']], ignore_index=True)
domain = pd.concat([gmail_df['Domain']], ignore_index=True)



In [25]:
# Create a combined dataframe for job titles and descriptions
combined_job_df = pd.DataFrame({'title': job_titles, 'description': job_descriptions , 'First Name':first_name , 'Last Name':last_name , 'fullname':fullname , 'Domain':domain})

In [26]:
# Drop any rows with missing values
combined_job_df.dropna(inplace=True)

# Combine title and description into a single feature for TF-IDF
combined_job_df['text'] = combined_job_df['title'] + " " + combined_job_df['description'] + " " + combined_job_df['First Name'] + " " + combined_job_df['Last Name'] + " " + combined_job_df['fullname'] + " " + combined_job_df['Domain']

In [27]:
if len(combined_job_df) > len(gmail_df):
    combined_job_df = combined_job_df.sample(n=len(gmail_df), random_state=42).reset_index(drop=True)


In [28]:
# Step 2: Feature Extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=100000)
X = tfidf_vectorizer.fit_transform(combined_job_df['text']).toarray()


In [29]:
from sklearn.preprocessing import LabelEncoder

In [30]:
# Encode target labels (email domains) # vectorization need # Embedding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(gmail_df['Gmail Address'][:len(X)])

In [31]:
# Step 3: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [33]:
# Predict on the test set
y_pred = model.predict(X_test)

In [34]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.0
Precision: 0.0
Recall: 0.0


In [36]:

# Train a Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt, average='weighted')
recall_dt = recall_score(y_test, y_pred_dt, average='weighted')
classification_report_dt = classification_report(y_test, y_pred_dt)

print(f"Decision Tree - Accuracy: {accuracy_dt}")
print(f"Decision Tree - Precision: {precision_dt}")
print(f"Decision Tree - Recall: {recall_dt}")
print(f"Decision Tree - Classification Report:\n{classification_report_dt}")

Decision Tree - Accuracy: 0.0
Decision Tree - Precision: 0.0
Decision Tree - Recall: 0.0
Decision Tree - Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       1.0
           2       0.00      0.00      0.00       1.0
           7       0.00      0.00      0.00       1.0
           9       0.00      0.00      0.00       1.0
          12       0.00      0.00      0.00       1.0
          13       0.00      0.00      0.00       0.0
          15       0.00      0.00      0.00       1.0
          20       0.00      0.00      0.00       1.0
          22       0.00      0.00      0.00       1.0
          28       0.00      0.00      0.00       0.0
          33       0.00      0.00      0.00       1.0
          34       0.00      0.00      0.00       0.0
          36       0.00      0.00      0.00       0.0
          38       0.00      0.00      0.00       0.0
          39       0.00      0.00      0.00       1.0
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
classification_report_rf = classification_report(y_test, y_pred_rf)

print(f"Random Forest - Accuracy: {accuracy_rf}")
print(f"Random Forest - Precision: {precision_rf}")
print(f"Random Forest - Recall: {recall_rf}")
print(f"Random Forest - Classification Report:\n{classification_report_rf}")

Random Forest - Accuracy: 0.0
Random Forest - Precision: 0.0
Random Forest - Recall: 0.0
Random Forest - Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       1.0
           2       0.00      0.00      0.00       1.0
           6       0.00      0.00      0.00       0.0
           7       0.00      0.00      0.00       1.0
           9       0.00      0.00      0.00       1.0
          10       0.00      0.00      0.00       0.0
          12       0.00      0.00      0.00       1.0
          14       0.00      0.00      0.00       0.0
          15       0.00      0.00      0.00       1.0
          20       0.00      0.00      0.00       1.0
          22       0.00      0.00      0.00       1.0
          26       0.00      0.00      0.00       0.0
          28       0.00      0.00      0.00       0.0
          33       0.00      0.00      0.00       1.0
          39       0.00      0.00      0.00       1.0
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#  LSTM Model

In [38]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

In [42]:
# Reshape the data for LSTM
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Define LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(128, input_shape=(1, X_train.shape[1]), return_sequences=True))
lstm_model.add(LSTM(64))
lstm_model.add(Dense(len(np.unique(y_labels)), activation='softmax'))

lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-43-95d9cd487402>", line 2, in <cell line: 2>

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1807, in fit

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1151, in train_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1209, in compute_loss

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/compile_utils.py", line 277, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 143, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 270, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 2454, in sparse_categorical_crossentropy

  File "/usr/local/lib/python3.10/dist-packages/keras/src/backend.py", line 5775, in sparse_categorical_crossentropy

Received a label value of 599 which is outside the valid range of [0, 1).  Label values: 63 426 213 405 573 86 126 166 598 458 174 421 439 204 262 336 114 27 382 457 21 333 254 238 475 252 403 356 487 546 302 233 323 291 288 412 502 281 398 508 538 537 287 11 18 148 223 51 135 271 477 40 476 513 364 220 432 599 566 541 312 577 146 261
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_11090]

In [None]:
all_jobs_df['Role'] = all_jobs_df['Role'].apply(lambda x: clean_text(x) if pd.notna(x) else x)
all_jobs_df['Company'] = all_jobs_df['Company'].apply(lambda x: clean_text(x) if pd.notna(x) else x)
all_jobs_df['Industry'] = all_jobs_df['Industry'].apply(lambda x: clean_text(x) if pd.notna(x) else x)
all_jobs_df['Skills'] = all_jobs_df['Skills'].apply(lambda x: clean_text(x) if pd.notna(x) else x)
all_jobs_df['First Name'] = all_jobs_df['First Name'].apply(lambda x: clean_text(x) if pd.notna(x) else x)
all_jobs_df['Last Name'] = all_jobs_df['Last Name'].apply(lambda x: clean_text(x) if pd.notna(x) else x)
all_jobs_df['Fullname'] = all_jobs_df['Fullname'].apply(lambda x: clean_text(x) if pd.notna(x) else x)
all_jobs_df['Domain'] = all_jobs_df['Domain'].apply(lambda x: clean_text(x) if pd.notna(x) else x)


In [12]:
gmail_df.head(1)

Unnamed: 0,Gmail Address,ID,First Name,Last Name,fullname,Domain
0,anthonylee280@gmail.com,1291,nthony,nthony,nthonyee,gmail com


In [None]:
# Fill missing values

all_jobs_df.fillna('', inplace=True) # fillna fill the missinng value by repleacing the NaN value with ' '

In [None]:
len(all_jobs_df)

11457

In [14]:
# Combine relevant text features for model input
gmail_df['text'] = gmail_df['First Name'] + " " + gmail_df['Last Name'] + " " + gmail_df['fullname'] + " " + gmail_df['Domain']


In [15]:
# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=500000)
X_features = tfidf_vectorizer.fit_transform(gmail_df['text'])

# Model Training

In [16]:
# For this example, let's assume we are creating a binary classification based on job roles
gmail_df['label'] = gmail_df['Gmail Address'].apply(lambda x: 1 if 'example_job_role' in x else 0)
y_labels = gmail_df['label']

In [18]:
gmail_df.head()

Unnamed: 0,Gmail Address,ID,First Name,Last Name,fullname,Domain,text,label
0,anthonylee280@gmail.com,1291,nthony,nthony,nthonyee,gmail com,nthony nthony nthonyee gmail com,0
1,matthewsullivan623@gmail.com,3484,atthew,atthew,atthewullivan,gmail com,atthew atthew atthewullivan gmail com,0
2,garyali683@gmail.com,8769,ary,ary,aryli,gmail com,ary ary aryli gmail com,0
3,danieltaylor226@gmail.com,3948,aniel,aniel,anielaylor,gmail com,aniel aniel anielaylor gmail com,0
4,collingonzalez669@gmail.com,6530,ollin,ollin,ollinonzalez,gmail com,ollin ollin ollinonzalez gmail com,0


In [19]:
#  Split the Data
#
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=42)


In [21]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0