## Installing all libraries

In [1]:
!pip install langchain transformers torch torchvision torchaudio
!pip install transformers
!pip install torch
!pip install datasets
!pip install tiktoken
!pip install transformers_stream_generator
!pip install ollama
!pip install huggingface_hub
!pip install bitsandbytes
!pip install -U accelerate

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --up

## Import Libraries

In [1]:
# Import Libraries
import tqdm
import pandas as pd
import transformers
from langchain.llms.base import LLM
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
from transformers import pipeline
import torch
from langchain.schema import HumanMessage, AIMessage
import json
from typing import List, Union
from pydantic import Field

  from .autonotebook import tqdm as notebook_tqdm


## Check Availability and Connection to GPU

In [2]:
# Is this notebook connected to GPU ?
print(torch.cuda.is_available())

# Is Gpu available in Backend ?
print(torch.backends.cudnn.enabled)

True
True


## Hugging Face API Authentication

In [3]:
# Hugging Face API authentication
!huggingface-cli login --token hf_1234

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `Llama3` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Llama3`


## Intialize your Hugging Face Llama Model

In [48]:
df = pd.read_csv('./data/resume_data.csv')
df.dropna(inplace=True)
df

Unnamed: 0,file_name,resume_content
0,499416_Nishant Dotnet_Resume.pdf,Nishant Shrivastava\nMobile: - +1.469.926.5483...
1,A Madhuri CV.pdf,"MADHURI ANANDASS\nPhone: +91 6300775748, Email..."
2,Aadhila Shabnam Portfolio.pdf,Aadhila Shabnam\n[Chennai] [Phone: +9197890188...
3,aarthy resume.pdf,AARTHY.M\n§ GitHub Profile\n(cid:239) LinkedIn...
4,AbhishekMunda_resume_blue_format__new_.pdf,ABHISHEK MUNDA\nSoftwareEngineer\nabimunda09@g...
...,...,...
144,v.vineela.pdf,VINEELA VELUGOTI\nOBJECTIVE CONTACT\nTo utiliz...
146,Vincent_CV_April_2024_PMO-PO-V1.pdf,"VINCENT SELVAKUMAR M\nPROJECT MANAGEMENT, PLM ..."
147,Vrishali-Patil-Resume.pdf,Vrishali Patil\nCloud Consultant (AWS)\nCloudm...
148,YajatGroverRESUME.pdf,"Yajat Grover\nGurgaon, INDIA · yajatgrover@gma..."


In [None]:
# Create an empty list
structured_data_list = []
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    # The quantization line
     model_kwargs={"torch_dtype": torch.bfloat16, "load_in_4bit": True,"device_map":"auto"},
    max_length=8000
)
for index,rows in df.iterrows():
    resume_text = str(rows['resume_content'])
    prompt = f"""
        You are an expert in HR and reading and parsing resumes
        Resume Content:  
        {resume_text}

        Task:  
        Please extract the following details and present them in JSON format:

        1. **Name**: Candidate's full name.
        2. **Email**: Candidate's email address.
        3. **Phone**: Candidate's phone number.
        4. **Location**: Candidate's location (city/region).
        5. **Skills**: List of skills from the resume (e.g., Python, Machine Learning).
        6. **Experience (Years)**: Calculate the total work experience in years based on the job history in the resume. If specific date ranges (e.g., "2015-2020") are provided, subtract the start year from the end year. If only job titles are given without dates, make an estimated guess based on common job durations.
        7. **Responsibilities**: List of responsibilities or job duties for each role.
        8. **Job Title**: Job titles mentioned in the resume.
        9. **Education**: List of educational qualifications. For each degree, include:
            - **Degree**: The name of the degree (e.g., "Masters of Business Administration (Finance)").
            - **University**: The name of the university or college (e.g., "Bharat PG College for Women").
            - **Year**: The completion year of the degree (e.g., 2018). If not available, leave it as an empty string or null.

        Output the response strictly in this JSON format:
        ```json
        {{
          "Name": "",
          "Email": "",
          "Phone": "",
          "Location": "",
          "Skills": [],
          "Experience (Years)": 0,
          "Responsibilities": [],
          "Job Title": [],
          "Education": [
            {{"Degree": "", "University": "", "Year": ""}}
          ]
        }}
        ```

        If a field is missing in the resume, leave the value empty ("") or an empty list ([]).,
        Dont miss any information present in resume,
        Make sure all Email id's are captured,
        Generate only JSON Object as output no extra instructions
        """
        
    output = pipeline(prompt,return_full_text=False)
    output = output[0]['generated_text']
    start = output.find("{")
    end = output.rfind("}") + 1
    json_string = output[start:end]
    structured_data_list.append(json_string)

# Print the structured data list
print(structured_data_list)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.11it/s]
Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128

In [None]:
# Clean the data by filtering out invalid or empty JSON strings
clean_data = [d for d in structured_data_list if d.strip()]

# Parse valid JSON data
json_data = []
for d in clean_data:
    try:
        json_data.append(json.loads(d))
    except json.JSONDecodeError:
        continue

# Normalize JSON data to DataFrame
df = pd.json_normalize(json_data)

# Show the DataFrame
df

In [None]:
df.to_csv('./metadata/METADATA-FINAL.csv',index=False)