In [None]:

# Load and prepare the data
data = pd.read_csv('/home/rizwan/Desktop/Llama_model/resume-extractor/cleaned_resume_dataset.csv')

# Check for missing values
if data['pdf_name'].isnull().any() or data['skills'].isnull().any() or data['experience'].isnull().any():
    raise ValueError("Data contains missing values.")

# Prepare features and labels
X = data['pdf_name'].astype(str)
y_skills = data['skills'].astype(str)
y_experience = data['experience'].astype(str)

# Encode labels
label_encoder_skills = LabelEncoder()
label_encoder_experience = LabelEncoder()
y_skills_encoded = label_encoder_skills.fit_transform(y_skills)
y_experience_encoded = label_encoder_experience.fit_transform(y_experience)

# Split the data
X_train, X_val, y_train_skills, y_val_skills, y_train_experience, y_val_experience = train_test_split(
    X, y_skills_encoded, y_experience_encoded, test_size=0.2, random_state=42)

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

maxlen = 100  # Maximum length of sequences
X_train_padded = pad_sequences(X_train_seq, maxlen=maxlen)
X_val_padded = pad_sequences(X_val_seq, maxlen=maxlen)

# Confirm shapes and types
print("X_train_padded shape:", X_train_padded.shape)
print("X_val_padded shape:", X_val_padded.shape)
print(type(X_train_padded), type(X_val_padded))  # Ensure they are numpy arrays

# Define model architecture
input_layer = Input(shape=(maxlen,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128)(input_layer)

# Check embedding output shape
print("Embedding output shape:", embedding_layer.shape)  # Expected: (None, maxlen, 128)

# LSTM layer
lstm_layer = Bidirectional(LSTM(64, return_sequences=False))(embedding_layer)

# Check LSTM output shape
print("LSTM output shape:", lstm_layer.shape)  # Expected: (None, 128)

# Output layers
output_skills = Dense(len(label_encoder_skills.classes_), activation='softmax', name='skills_output')(lstm_layer)
output_experience = Dense(len(label_encoder_experience.classes_), activation='softmax', name='experience_output')(lstm_layer)

# Create and compile the model
model = Model(inputs=input_layer, outputs=[output_skills, output_experience])
model.compile(
    loss={
        'skills_output': 'sparse_categorical_crossentropy',
        'experience_output': 'sparse_categorical_crossentropy'
    },
    optimizer='adam',
    metrics={
        'skills_output': 'accuracy',
        'experience_output': 'accuracy'  # Add metrics for both outputs
    }
)

# Train the model
model.fit(X_train_padded, [y_train_skills, y_train_experience], 
          validation_data=(X_val_padded, [y_val_skills, y_val_experience]),
          epochs=50, 
          batch_size=32)

# Save the model
model.save('resume_model.keras')



# Step 1: Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# If NLTK data is not downloaded, run this once
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Step 2: Load the Dataset


## Dataset Overview

This dataset contains resume information extracted from various PDFs, and it will be used for the task of cleaning and training a machine learning model. The dataset includes the following columns:

- **pdf_name**: The name of the resume PDF file.
- **skills**: The skills section from the resume, containing details like technical expertise, programming languages, tools, and relevant skill sets.
- **experience**: The professional experience section, which lists past job titles, companies, and roles held, along with a brief description of responsibilities and accomplishments.
                                                     |

### Goals

1. **Data Cleaning**: The dataset requires preprocessing to clean up any inconsistencies, special characters, and formatting issues from the extracted text.
2. **Feature Extraction**: Skills and experience data will be processed to extract key features relevant for model training, such as skill tags and role hierarchies.
3. **Model Training**: The cleaned and structured dataset will be used to train a machine learning model to classify resumes, predict skill sets, or match job profiles based on the resume data.

This dataset provides a rich source of information that combines natural language text from resumes and can be leveraged for various machine learning tasks like skill extraction, experience classification, and job matching.

In [6]:
# Load the dataset
df = pd.read_csv("extracted_data_folder_1.csv")

# Display first few rows to understand structure
df.head()


Unnamed: 0,pdf_name,skills,experience
0,Alfred_Huynh_Resume.pdf,**\n\n1. **Database Development**\n\t* Oracle ...,"**\n\n**1. Database Developer, ZP Group (Clien..."
1,Obafemi_Oshin_Resume.pdf,**\n\n1. Proficient languages:\n\t* Java\n\t* ...,"**\n\n1. **Software Engineer (Freelance)**, Ke..."
2,Lawrence_Chopp_Resume.pdf,**\n\n* Project Management\n* Microsoft Office...,**\n\n**Project Manager - Population Health (J...
3,Syed_Akhtar_Resume.pdf,**\n\n1. Programming languages:\n\t* JavaScrip...,"**\n\n1. **Senior Full-Stack Developer, Accent..."
4,Prashamsha_Pathak_Resume.pdf,**\n\n1. Power BI Dashboards development\n2. D...,"**\n\n1. **SQL Server Developer**, Chitwan Hos..."


# Step 3: Handle Missing Data

In [368]:
# Check for missing values
print(df.isnull().sum())


pdf_name        0
skills         10
experience    154
dtype: int64


In [7]:
# Option 1: Drop rows with missing skills or experience
df_cleaned = df.dropna(subset=['skills', 'experience'])

# # Option 2: Fill missing values with empty strings (if you don't want to drop rows)
# df['skills'].fillna("", inplace=True)
# df['experience'].fillna("", inplace=True)

# Step 4: Remove Duplicates

In [8]:
# Remove duplicate rows (if any)
df_cleaned = df_cleaned.drop_duplicates()
df_cleaned.head()

Unnamed: 0,pdf_name,skills,experience
0,Alfred_Huynh_Resume.pdf,**\n\n1. **Database Development**\n\t* Oracle ...,"**\n\n**1. Database Developer, ZP Group (Clien..."
1,Obafemi_Oshin_Resume.pdf,**\n\n1. Proficient languages:\n\t* Java\n\t* ...,"**\n\n1. **Software Engineer (Freelance)**, Ke..."
2,Lawrence_Chopp_Resume.pdf,**\n\n* Project Management\n* Microsoft Office...,**\n\n**Project Manager - Population Health (J...
3,Syed_Akhtar_Resume.pdf,**\n\n1. Programming languages:\n\t* JavaScrip...,"**\n\n1. **Senior Full-Stack Developer, Accent..."
4,Prashamsha_Pathak_Resume.pdf,**\n\n1. Power BI Dashboards development\n2. D...,"**\n\n1. **SQL Server Developer**, Chitwan Hos..."


# Step 5: Text Cleaning - Remove Special Characters, URLs, etc.

In [9]:
# Function to clean text
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove special characters, numbers, and punctuations
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove newline characters and backslashes
    text = text.replace('\n', '').replace('\\', '')
    
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply cleaning function to skills and experience
df_cleaned['skills'] = df_cleaned['skills'].apply(clean_text)
df_cleaned['experience'] = df_cleaned['experience'].apply(clean_text)

# Display a few cleaned rows to verify
df_cleaned[['pdf_name', 'skills', 'experience']].head()


Unnamed: 0,pdf_name,skills,experience
0,Alfred_Huynh_Resume.pdf,database development oracle sql oracle plsql m...,database developer zp group client adtalem glo...
1,Obafemi_Oshin_Resume.pdf,proficient languages java javascript python c ...,software engineer freelance keller williams re...
2,Lawrence_Chopp_Resume.pdf,project management microsoft office budget man...,project manager population health january pres...
3,Syed_Akhtar_Resume.pdf,programming languages javascript python fronte...,senior fullstack developer accenture remote se...
4,Prashamsha_Pathak_Resume.pdf,power bi dashboards development dax language i...,sql server developer chitwan hospital bharatpu...


# Step 6: Remove Stopwords

In [10]:
# Define stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

# Apply stopword removal to skills and experience
df_cleaned['skills'] = df_cleaned['skills'].apply(remove_stopwords)
df_cleaned['experience'] = df_cleaned['experience'].apply(remove_stopwords)

# Display cleaned text
df_cleaned[['pdf_name', 'skills', 'experience']].head()


Unnamed: 0,pdf_name,skills,experience
0,Alfred_Huynh_Resume.pdf,database development oracle sql oracle plsql m...,database developer zp group client adtalem glo...
1,Obafemi_Oshin_Resume.pdf,proficient languages java javascript python c ...,software engineer freelance keller williams re...
2,Lawrence_Chopp_Resume.pdf,project management microsoft office budget man...,project manager population health january pres...
3,Syed_Akhtar_Resume.pdf,programming languages javascript python fronte...,senior fullstack developer accenture remote se...
4,Prashamsha_Pathak_Resume.pdf,power bi dashboards development dax language i...,sql server developer chitwan hospital bharatpu...


# Step 7: Remove Short or Irrelevant Text

In [11]:
# Filter out rows with less than 3 words in skills or experience
df_cleaned = df_cleaned[df_cleaned['skills'].apply(lambda x: len(x.split()) > 2)]
df_cleaned = df_cleaned[df_cleaned['experience'].apply(lambda x: len(x.split()) > 5)]

# Display the final cleaned dataframe
df_cleaned[['pdf_name', 'skills', 'experience']].head()


Unnamed: 0,pdf_name,skills,experience
0,Alfred_Huynh_Resume.pdf,database development oracle sql oracle plsql m...,database developer zp group client adtalem glo...
1,Obafemi_Oshin_Resume.pdf,proficient languages java javascript python c ...,software engineer freelance keller williams re...
2,Lawrence_Chopp_Resume.pdf,project management microsoft office budget man...,project manager population health january pres...
3,Syed_Akhtar_Resume.pdf,programming languages javascript python fronte...,senior fullstack developer accenture remote se...
4,Prashamsha_Pathak_Resume.pdf,power bi dashboards development dax language i...,sql server developer chitwan hospital bharatpu...


# Step 8: Save the Cleaned Dataset

In [12]:
# Save the cleaned dataframe to a new CSV
df_cleaned.to_csv("cleaned_resume_dataset.csv", index=False)

# Verify the file is saved
print("Cleaned dataset saved successfully.")


Cleaned dataset saved successfully.


# Model Training

## 1. Import Laibraries

In [55]:
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


## 2. Load the data

In [56]:
data = pd.read_csv('/home/rizwan/Desktop/Django/model train/cleaned_resume_dataset.csv')

## 3. Prepare data

In [57]:
X = data['pdf_name'] 
y_skills = data['skills']  
y_experience = data['experience'] 

## 4. Encode the labels

In [58]:
label_encoder_skills = LabelEncoder()
label_encoder_experience = LabelEncoder()
y_skills_encoded = label_encoder_skills.fit_transform(y_skills)
y_experience_encoded = label_encoder_experience.fit_transform(y_experience)


## 5. Split the data into training and validation sets

In [59]:

X_train, X_val, y_train_skills, y_val_skills, y_train_experience, y_val_experience = train_test_split(
    X, y_skills_encoded, y_experience_encoded, test_size=0.2, random_state=42)


## 6. Tokenize the PDF names

In [60]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)


## 7. Pad the sequences



## 8. Define the model for multi-output

In [63]:
print(embedding_layer.shape)


(None, 100, 128)


In [61]:
maxlen = 100  
X_train_padded = pad_sequences(X_train_seq, maxlen=maxlen)
X_val_padded = pad_sequences(X_val_seq, maxlen=maxlen)
input_layer = Input(shape=(maxlen,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=maxlen)(input_layer)
lstm_layer = Bidirectional(LSTM(64, return_sequences=False))(embedding_layer)




ValueError: object __array__ method not producing an array

In [375]:








# Output layers
output_skills = Dense(len(label_encoder_skills.classes_), activation='softmax', name='skills_output')(lstm_layer)
output_experience = Dense(len(label_encoder_experience.classes_), activation='softmax', name='experience_output')(lstm_layer)

# Create the model
model = Model(inputs=input_layer, outputs=[output_skills, output_experience])

# Compile the model with metrics for both outputs
model.compile(
    loss={'skills_output': 'sparse_categorical_crossentropy', 'experience_output': 'sparse_categorical_crossentropy'},
    optimizer='adam',
    metrics=['accuracy', 'accuracy']  # Provide accuracy for both outputs
)

# Train the model
model.fit(
    X_train_padded,
    [y_train_skills, y_train_experience],
    validation_data=(X_val_padded, [y_val_skills, y_val_experience]),
    epochs=50,
    batch_size=32
)

# Save the model after training
model.save('resume_model.keras')


Epoch 1/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 87ms/step - experience_output_accuracy: 0.0000e+00 - experience_output_loss: 6.8165 - loss: 13.6324 - skills_output_accuracy: 0.0000e+00 - skills_output_loss: 6.8159 - val_experience_output_accuracy: 0.0000e+00 - val_experience_output_loss: 6.8121 - val_loss: 13.6202 - val_skills_output_accuracy: 0.0000e+00 - val_skills_output_loss: 6.8082
Epoch 2/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step - experience_output_accuracy: 0.0000e+00 - experience_output_loss: 6.8134 - loss: 13.6275 - skills_output_accuracy: 0.0049 - skills_output_loss: 6.8140 - val_experience_output_accuracy: 0.0000e+00 - val_experience_output_loss: 6.8600 - val_loss: 13.7179 - val_skills_output_accuracy: 0.0000e+00 - val_skills_output_loss: 6.8582
Epoch 3/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 64ms/step - experience_output_accuracy: 0.0047 - experience_output_loss: 6.7931 - loss: 13.5865

#### Testing

#### Testing

In [3]:
import pandas as pd
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from fuzzywuzzy import fuzz

# Load the trained model
model = load_model('resume_model.keras')

# Initialize the Tokenizer and fit it on your training data
tokenizer = Tokenizer()

# Load the original training dataset to fit the tokenizer
# Replace 'path/to/your/training/data.csv' with the actual path
original_training_data = pd.read_csv('cleaned_resume_dataset.csv')
tokenizer.fit_on_texts(original_training_data['pdf_name'])  # Assuming 'pdf_name' contains the text data

# Initialize Label Encoders
label_encoder_skills = LabelEncoder()
label_encoder_experience = LabelEncoder()

# Fit the Label Encoders
# Replace with your actual skill and experience columns
label_encoder_skills.fit(original_training_data['skills'])
label_encoder_experience.fit(original_training_data['experience'])

# Function to preprocess the resume text for the model
def preprocess_for_model(resume_text):
    resume_seq = tokenizer.texts_to_sequences([resume_text])
    padded_seq = pad_sequences(resume_seq, maxlen=100)  # Adjust maxlen based on your model
    # Removed print statement
    return padded_seq

# Function to predict skills and experience using the trained model
def predict_skills_and_experience(resume_text):
    resume_padded = preprocess_for_model(resume_text)
    
    try:
        # Use the model to predict
        skills_pred, experience_pred = model.predict(resume_padded)
    except Exception as e:
        print(f"Error during prediction: {e}")  # Debugging line
        return None, None

    # Get the predicted classes
    predicted_skills = np.argmax(skills_pred, axis=1)[0]
    predicted_experience = np.argmax(experience_pred, axis=1)[0]

    # Decode the predictions to get actual skill and experience values
    decoded_skill = label_encoder_skills.inverse_transform([predicted_skills])[0]
    decoded_experience = label_encoder_experience.inverse_transform([predicted_experience])[0]

    return decoded_skill, decoded_experience

# Function to preprocess job descriptions
def preprocess_text(text):
    return text.lower()  # Basic preprocessing

# Function to match resumes with the provided job description
def match_resumes_with_job_description(job_description, resumes_df):
    job_description_cleaned = preprocess_text(job_description)
    job_description_words = job_description_cleaned.split()
    
    match_results = []
    
    for index, row in resumes_df.iterrows():
        resume_name = row['pdf_name']
        resume_text = f"{row['skills']} {row['experience']}"  # Combine skills and experience
        
        # Predict skills and experience using the model
        predicted_skill, predicted_experience = predict_skills_and_experience(resume_text)
        
        if predicted_skill is None or predicted_experience is None:
            print(f"Skipping {resume_name} due to prediction error.")  # Debugging line
            continue
        
        # Check for exact match for single words
        if len(job_description_words) == 1:
            single_word = job_description_words[0]
            if single_word in predicted_skill.lower() or single_word in predicted_experience.lower():
                match_results.append({
                    "resume": resume_name,
                    "match_percentage": 100,
                })
        
        # Fuzzy matching for multiple words
        else:
            match_percentage = fuzz.token_set_ratio(job_description_cleaned, resume_text)
            match_results.append({
                "resume": resume_name,
                "match_percentage": match_percentage,
            })
    
    # Sort results by match percentage and return top 5
    sorted_results = sorted(match_results, key=lambda x: x["match_percentage"], reverse=True)[:5]
    return sorted_results

# Example usage for testing
job_description = "cyber security"
top_matching_resumes = match_resumes_with_job_description(job_description, data)

# Print results
if top_matching_resumes:
    print("Top matching resumes:")
    for result in top_matching_resumes:
        print(f"{result['resume']}: {result['match_percentage']}% matching skills/experience")
else:
    print("No matching resumes found.")


NameError: name 'data' is not defined