In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(combined_df.columns)


Index(['Here's a simulated interview for a Software Engineer role:\n\n**Interviewer:** Hi Alice, thank you for coming in today. Can you start by telling me a little bit about your background and why you're interested in this Software Engineer role?\n\n**Alice Smith:** Yeah, sure. So, I have a degree in computer science, and I've been working in the field for about three years now. I'm interested in this role because I enjoy coding and problem-solving, and I think this company does some really interesting work.\n\n**Interviewer:** Great, thanks for sharing that. Can you walk me through your experience with a specific programming language? Let's say Java. How have you used Java in previous projects?\n\n**Alice Smith:** Um, I've used Java before...I think. I mean, I've written some Java code, but I don't really remember the specifics. I've worked on a few projects that used Java, but I didn't really have a leading role or anything.\n\n**Interviewer:** I see. In a software engineering role

# Resume screening

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

# Load your dataset
# Replace 'your_dataset.csv' with your actual dataset file
combined_df = pd.read_csv("combined_cleaned_dataset.csv")

# Step 1: Role-specific thresholds for screening
role_thresholds = {
    "Software Engineer": 0.4,
    "Data Scientist": 0.2,
    "UI Engineer": None,  # No threshold
}

# Step 2: Function to calculate TF-IDF and cosine similarity
def calculate_similarity(df, job_col, resume_col):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    # Fit the TF-IDF vectorizer on both job descriptions and resumes
    vectorizer.fit(pd.concat([df[job_col], df[resume_col]]))

    job_desc_vector = vectorizer.transform(df[job_col])
    resume_vector = vectorizer.transform(df[resume_col])
    
    # Compute cosine similarity
    df['similarity'] = cosine_similarity(job_desc_vector, resume_vector).diagonal()
    return df

# Step 3: Apply role-specific thresholds for screening
def screen_resumes(df, role_thresholds):
    decisions = []
    for _, row in df.iterrows():
        role = row['Role']
        similarity = row['similarity']
        threshold = role_thresholds.get(role, None)
        
        # Apply decision logic
        if threshold is None or similarity >= threshold:
            decisions.append("Selected")
        else:
            decisions.append("Rejected")
    
    df['decision'] = decisions
    return df

# Step 4: Add clustering for job descriptions (optional)
def cluster_job_descriptions(df, job_col, n_clusters=5):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X = vectorizer.fit_transform(df[job_col])
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['job_desc_cluster'] = kmeans.fit_predict(X)
    return df

# Step 5: Execute the workflow
combined_df = calculate_similarity(combined_df, 'Job Description', 'Resume')
combined_df = screen_resumes(combined_df, role_thresholds)
combined_df = cluster_job_descriptions(combined_df, 'Job Description')

# Handle missing IDs

combined_df['ID'] = combined_df['ID'].fillna('').astype(str)
missing_ids = combined_df['ID'] == ''
combined_df.loc[missing_ids, 'ID'] = [f'auto_{i}' for i in range(1, missing_ids.sum() + 1)]


# Save and display results
combined_df.to_csv('resume_screening_results.csv', index=False)
print(combined_df[['ID', 'Role', 'similarity', 'decision', 'job_desc_cluster']])


# Save the results to a new CSV file
#combined_df.to_csv('resume_screening_results.csv', index=False)

# Display the output
#print(combined_df[['ID', 'Role', 'similarity', 'decision', 'job_desc_cluster']])


             ID               Role  similarity  decision  job_desc_cluster
0        auto_1  Software Engineer    0.224841  Rejected                 2
1        auto_2  Software Engineer    0.223900  Rejected                 2
2        auto_3     Data Scientist    0.391478  Selected                 2
3        auto_4  Software Engineer    0.272689  Rejected                 2
4        auto_5      Data Engineer    0.371126  Selected                 2
...         ...                ...         ...       ...               ...
3169   ananba44      data engineer    0.328732  Selected                 4
3170  diyasi576    product manager    0.289358  Selected                 4
3171  harska507        ui engineer    0.471586  Selected                 0
3172  kabich225  software engineer    0.329432  Selected                 4
3173  aditsi182  software engineer    0.202067  Selected                 4

[3174 rows x 5 columns]


In [11]:
print(combined_df['decision'].value_counts())


decision
Selected    2823
Rejected     351
Name: count, dtype: int64


In [12]:
print(combined_df.groupby('Role')['decision'].value_counts())


Role                          decision
AI Engineer                   Selected     13
Business Analyst              Selected     11
Cloud Architect               Selected      8
Content Writer                Selected      5
Cybersecurity Specialist      Selected      9
Data Analyst                  Selected     78
Data Engineer                 Selected    192
Data Scientist                Selected    163
                              Rejected    129
Database Administrator        Selected      9
DevOps Engineer               Selected     16
Digital Marketing Specialist  Selected     14
Game Developer                Selected     10
Graphic Designer              Selected     15
HR Specialist                 Selected     12
Machine Learning Engineer     Selected     13
Mobile App Developer          Selected      7
Network Engineer              Selected     16
Product Manager               Selected    203
Project Manager               Selected     56
Software Developer            Selected   

In [13]:
print(combined_df.groupby('Role')['similarity'].mean())


Role
AI Engineer                     0.568689
Business Analyst                0.457339
Cloud Architect                 0.561371
Content Writer                  0.731989
Cybersecurity Specialist        0.572296
Data Analyst                    0.175903
Data Engineer                   0.334887
Data Scientist                  0.251659
Database Administrator          0.572761
DevOps Engineer                 0.401137
Digital Marketing Specialist    0.616506
Game Developer                  0.606549
Graphic Designer                0.423924
HR Specialist                   0.622331
Machine Learning Engineer       0.429377
Mobile App Developer            0.517457
Network Engineer                0.676279
Product Manager                 0.397590
Project Manager                 0.254725
Software Developer              0.069231
Software Engineer               0.221384
System Administrator            0.288313
UI Designer                     0.183592
UI Engineer                     0.379083
UI/UX Desig

In [14]:
# Step 1: Analyze Similarity Scores Across Roles
role_similarity_stats = combined_df.groupby('Role')['similarity'].describe()
print(role_similarity_stats)

# Step 2: Define Role-Specific Thresholds Based on Analysis
role_thresholds = {
    'Software Engineer': 0.4,
    'Data Scientist': 0.2,
    'UI Engineer': None,  # No threshold
    'Data Engineer': 0.3,
    'Product Manager': 0.25,
}

# Step 3: Apply the Thresholds
def apply_role_thresholds(row, thresholds):
    role = row['Role']
    similarity = row['similarity']
    threshold = thresholds.get(role, None)
    
    # If no threshold, default to 'Selected'
    if threshold is None:
        return 'Selected'
    return 'Selected' if similarity >= threshold else 'Rejected'

combined_df['decision'] = combined_df.apply(
    apply_role_thresholds, axis=1, thresholds=role_thresholds
)

# Display the updated DataFrame
print(combined_df[['ID', 'Role', 'similarity', 'decision']])


                              count      mean       std       min       25%  \
Role                                                                          
AI Engineer                    13.0  0.568689  0.078559  0.419033  0.527159   
Business Analyst               11.0  0.457339  0.053995  0.360829  0.431233   
Cloud Architect                 8.0  0.561371  0.027672  0.519828  0.538752   
Content Writer                  5.0  0.731989  0.035762  0.673480  0.732680   
Cybersecurity Specialist        9.0  0.572296  0.072154  0.420210  0.540033   
Data Analyst                   78.0  0.175903  0.105870  0.063601  0.119622   
Data Engineer                 192.0  0.334887  0.170651  0.062845  0.136596   
Data Scientist                292.0  0.251659  0.117716  0.029431  0.153304   
Database Administrator          9.0  0.572761  0.074955  0.414974  0.558291   
DevOps Engineer                16.0  0.401137  0.058758  0.288414  0.361812   
Digital Marketing Specialist   14.0  0.616506  0.055

In [26]:
from sklearn.preprocessing import LabelEncoder

#Separate features and target
X = combined_df.drop(['decision', 'ID'], axis=1, errors='ignore')
y = combined_df['decision']

# Identify categorical and numeric columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Transform the data
X_preprocessed = preprocessor.fit_transform(X)

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Encode target labels

In [29]:
from sklearn.model_selection import train_test_split

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
print(f"Train set size: {X_train.shape}, Test set size: {X_test.shape}")

Train set size: (2539, 11245), Test set size: (635, 11245)


In [35]:
# Manually impute missing values in numeric and categorical columns
X[numeric_cols] = X[numeric_cols].apply(lambda col: col.fillna(col.mean()))
X[categorical_cols] = X[categorical_cols].apply(lambda col: col.fillna(col.mode()[0]))


# SVM model

In [37]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Re-run the train-test split and model training
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

print("\nSVM Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))

print("\nSVM Accuracy Score:")
print(accuracy_score(y_test, y_pred_svm))


SVM Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96        98
           1       1.00      0.99      0.99       537

    accuracy                           0.99       635
   macro avg       0.96      0.98      0.97       635
weighted avg       0.99      0.99      0.99       635


SVM Confusion Matrix:
[[ 96   2]
 [  7 530]]

SVM Accuracy Score:
0.9858267716535433


# Predictions

In [39]:
import pickle

# Save the preprocessor
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

# Save the SVM model
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_model, f)


In [47]:
import pandas as pd

# Load the Excel file
new_data = pd.read_excel('Copy of prediction_data.xlsx')  # Adjust the filename if needed

# Display the columns of the new data
print(new_data.columns)


Index(['Unnamed: 0.1', 'Unnamed: 0', 'ID', 'Name', 'Role', 'Transcript',
       'Resume', 'Reason for decision', 'Job Description',
       'num_words_in_transcript'],
      dtype='object')


In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import pandas as pd

# Assuming you have your training data available
# Example: Replace this with your actual training data
X_train = pd.DataFrame({
    'Transcript': ['Interview went well', 'Candidate was not qualified', 'Strong skills in the area', 'Not a good fit for the role'],
    'Job Description': ['Software Engineer role', 'Data Scientist role', 'Developer role', 'HR role']
})

y_train = [1, 0, 1, 0]  # Example labels for training (1: selected, 0: rejected)

# Define your preprocessor (to use TF-IDF for text columns)
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'Transcript'),  # Transform 'Transcript' column using TF-IDF
        ('text2', TfidfVectorizer(), 'Job Description')  # Transform 'Job Description' column using TF-IDF
    ])

# Step 1: Fit the preprocessor using training data
# Fit the preprocessor with the training dataset (X_train)
preprocessor.fit(X_train)

# Step 2: Create and train your SVM model in a pipeline (this combines the preprocessor and the model)
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svm', SVC(kernel='linear'))  # You can change the SVM model as needed
])

# Train the model with the training data
model_pipeline.fit(X_train, y_train)

# Step 3: Load the new data (prediction data)
# Example: Replace this with your actual new dataset for prediction
# Correct the file path by putting it inside quotes
import os
# Ensure the current directory is correct (this should be the directory where your files are located)
current_dir = os.getcwd()

# Set the path for the prediction file (replace with the actual name of your file)
current_dir = os.getcwd()
prediction_file_path = os.path.join(current_dir, 'Copy of prediction_data (1).xlsx')
new_data = pd.read_excel(prediction_file_path)

 # Ensure this path is correct

# Step 4: Process new data (only transformation, not fitting again)
# We don't need to fit the preprocessor again; we just transform the new data.
X_new = new_data[['Transcript', 'Job Description']]  # Ensure new data contains the correct columns

# Step 5: Make predictions using the trained model pipeline (which includes the fitted preprocessor)
y_new_pred = model_pipeline.predict(X_new)

# Step 6: Display the predictions
print("Predictions:", y_new_pred)


Predictions: [1 0 1 0 1 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1
 0 1 0 0 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1
 1 0 1 1 1 0 0 1 1 1 0 1 0 0 1 1 0 1 0 1 1 1 0 1 1 0]


# '1'is for selection
# '0' is for rejection

In [21]:
# Assuming you have already fitted your model (e.g., model_pipeline)
# and that your preprocessor is included in the pipeline
X_new = new_data[['Transcript', 'Job Description']]  # Make sure this contains the right columns
predictions = model_pipeline.predict(X_new)  # Get predictions from the model

# Now you can add predictions to your new_data
new_data['Predictions'] = predictions

# Visualize the results
print(new_data[['Transcript', 'Job Description', 'Predictions']].head())


                                          Transcript  \
0  **lahar singh: software engineer candidate int...   
1  interview transcript: data engineer position\n...   
2  **interview transcript: amisha bedi, data scie...   
3  **interview transcript: product manager positi...   
4  product manager interview transcript\n\ninterv...   

                                     Job Description  Predictions  
0  communicated ideas clearly and effectively., h...            1  
1  we are looking for a skilled data engineer wit...            0  
2  lacked key technical skills for the role., nee...            1  
3  had impressive experience and qualifications.,...            0  
4  we are looking for a skilled product manager w...            1  


In [25]:
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'Actual Outcome' is the true label column in your new_data
y_true = new_data['Predictions']

# Calculate accuracy and other metrics
accuracy = accuracy_score(y_true, predictions)
print(f'Accuracy: {accuracy}')

# Print detailed classification report
print(classification_report(y_true, predictions))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        33
           1       1.00      1.00      1.00        67

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [26]:
# Save the predictions to a new Excel file
new_data.to_excel('predictions_output.xlsx', index=False)


# Insights

In [27]:
# Filter candidates predicted to be selected (assuming 1 means selected and 0 means rejected)
selected_candidates = new_data[new_data['Predictions'] == 1]
rejected_candidates = new_data[new_data['Predictions'] == 0]

print('Selected Candidates:')
print(selected_candidates)


Selected Candidates:
    Unnamed: 0.1  Unnamed: 0          ID            Name               Role  \
0            514         537  rivash0038     lahar singh  software engineer   
2           1408        1467  rivash0968     amisha bedi     data scientist   
4            390         410   bradgr792   bradley gross    product manager   
5           1045        1095  rivash0596   dhanvi raghav       data analyst   
6           1481        1541   anangu781    ananya gupta    product manager   
..           ...         ...         ...             ...                ...   
93           338         354   stepro867  stephanie ross    product manager   
94           414         434   crysga384  crystal garcia  software engineer   
95          1380        1438  rivash0939    charvi verma  software engineer   
97           961        1008  rivash0509      ehan naidu  software engineer   
98           568         595  rivash0096   saanjh chopra  software engineer   

                              

#  Automatic E-mail generation

In [32]:
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# Example function to send an email
def send_email(receiver_email, subject, body):
    sender_email = "your_email@example.com"  # Your email address
    sender_password = "your_password"  # Your email password (consider using OAuth for security)

    # Create the email headers and body
    message = MIMEMultipart()
    message['From'] = sender_email
    message['To'] = receiver_email
    message['Subject'] = subject

    # Attach the body with the message
    message.attach(MIMEText(body, 'plain'))

    try:
        # Connect to the mail server and send the email
        with smtplib.SMTP('smtp.example.com', 587) as server:
            server.starttls()
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, receiver_email, message.as_string())
        print(f"Email sent to {receiver_email}")
    except Exception as e:
        print(f"Failed to send email to {receiver_email}: {e}")

# 3. Filter the selected candidates from the predictions
selected_candidates = new_data[new_data['Predictions'] == 'Selected']

# 4. Generate automatic emails for the selected candidates
# You would replace 'candidate_emails' with a list of emails (if available)
# For this case, I'll use placeholder email addresses:

candidate_emails = {
    'Candidate 1': 'candidate1@example.com',
    'Candidate 2': 'candidate2@example.com',
    'Candidate 3': 'candidate3@example.com',
    # Add more candidates and emails here
}

subject = "Congratulations on Your Job Interview"
body_template = """
Dear {name},

We are pleased to inform you that you have been selected for the interview process.

Best regards,
Your Company
"""

# Iterate through the selected candidates
for _, candidate in selected_candidates.iterrows():
    candidate_name = candidate['Name']
    if candidate_name in candidate_emails:
        receiver_email = candidate_emails[candidate_name]
        body = body_template.format(name=candidate_name)
        send_email(receiver_email, subject, body)
#Assuming Neha has been selected, and you are sending the email:
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# SMTP server configuration
smtp_server = "smtp.gmail.com"
smtp_port = 587
sender_email = "your_email@gmail.com"  # Your email address
sender_password = "your_email_password"  # Your email password

# Recipient email address (Neha's email)
receiver_email = "nehabhardwaj3603@gmail.com"

# Create the email content
subject = "Congratulations on Your Job Interview"
body = """
Dear Neha,

We are pleased to inform you that you have been selected for the interview process at Our Company. 
We will reach out to you shortly with further details.

Best regards,
Your Company
"""

# Create the email
message = MIMEMultipart()
message["From"] = sender_email
message["To"] = receiver_email
message["Subject"] = subject

# Attach the body to the email
message.attach(MIMEText(body, "plain"))

# Send the email
try:
    # Set up the SMTP server and send the email
    server = smtplib.SMTP(smtp_server, smtp_port)
    server.starttls()  # Secure the connection
    server.login(sender_email, sender_password)
    server.sendmail(sender_email, receiver_email, message.as_string())
    print(f"Email sent to {receiver_email}")
    server.quit()  # Close the connection
except Exception as e:
    print(f"Failed to send email to {receiver_email}: {str(e)}")


Failed to send email to nehabhardwaj3603@gmail.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials 98e67ed59e1d1-2f7ffa6b2bdsm5310558a91.25 - gsmtp')
