# **Text Preprocessing**

## **Import Libraries**

In [161]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pickle
import pandas as pd

In [162]:
import os

# Keep NLTK resources in a project-local "nltk_data" folder (relative to the
# current working directory) and create it up front so the path is valid
# before NLTK is asked to use it.
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)

# Register the folder with NLTK only once: re-running this cell must not keep
# appending duplicate entries to the search path.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Resources used by the later steps: tokenizer models, stopword lists and
# the WordNet data required by the lemmatizer.
nltk_packages = ['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4']

# nltk.download reports failure through its boolean return value rather than
# by raising, so collect the packages that could not be fetched. This is only
# a warning: previously downloaded copies may still be usable (e.g. offline).
failed_packages = [
    package
    for package in nltk_packages
    if not nltk.download(package, download_dir=nltk_data_path)
]
if failed_packages:
    print(f"WARNING: could not download NLTK packages: {failed_packages}")

[nltk_data] Downloading package punkt to d:\MARIAM2\DATA SCIENCE
[nltk_data]     BOOTCAMP PROJECTS\Resume Screening App - ML-NLP-
[nltk_data]     Streamlit\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to d:\MARIAM2\DATA SCIENCE
[nltk_data]     BOOTCAMP PROJECTS\Resume Screening App - ML-NLP-
[nltk_data]     Streamlit\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to d:\MARIAM2\DATA SCIENCE
[nltk_data]     BOOTCAMP PROJECTS\Resume Screening App - ML-NLP-
[nltk_data]     Streamlit\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to d:\MARIAM2\DATA SCIENCE
[nltk_data]     BOOTCAMP PROJECTS\Resume Screening App - ML-NLP-
[nltk_data]     Streamlit\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to d:\MARIAM2\DATA SCIENCE
[nltk_data]     BOOTCAMP PROJECTS

True

In [163]:
# Restore the resumes DataFrame from its pickle file.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open("src/resumes_df.pkl", "rb") as pickle_file:
    resumes_df = pickle.load(pickle_file)

# Sanity check: confirmation message, then the frame's shape and its columns,
# each on its own line.
print("Data Loaded!", resumes_df.shape, resumes_df.columns, sep="\n")

Data Loaded!
(962, 3)
Index(['Category', 'Resume', 'text_length'], dtype='object')


## **Step 1: Lowercasing**

In [164]:
# Fold every resume to lower case so that differently-cased spellings of the
# same word are treated as one token by the later steps.
resume_column = resumes_df['Resume']
resumes_df['Resume'] = resume_column.str.lower()

# Preview the first rows of the lower-cased column
resumes_df['Resume'].head()

0    skills * programming languages: python (pandas...
1    education details \r\nmay 2013 to may 2017 b.e...
2    areas of interest deep learning, control syste...
3    skills â¢ r â¢ python â¢ sap hana â¢ table...
4    education details \r\n mca   ymcaust,  faridab...
Name: Resume, dtype: object

## **Step 2: Remove Emails, URLs, Special Characters & Extra Spaces**

In [165]:
def clean_text(text):
    """Strip e-mail addresses, URLs and unwanted punctuation from a resume.

    Args:
        text: Raw resume text. Non-string values (e.g. ``None`` or ``NaN``)
            are treated as empty text.

    Returns:
        The cleaned text with every run of whitespace collapsed to a single
        space and no leading/trailing whitespace. The characters
        ``+ # . - /`` are kept so tokens such as ``c++``, ``c#`` and ``b.e``
        survive.
    """
    if not isinstance(text, str):
        return ''
    # Remove e-mails first so an address is never half-eaten by the URL rule
    text = re.sub(r'\S+@\S+', ' ', text)
    # Remove URLs. Require "http(s)://" or "www." at a word start so ordinary
    # words that merely begin with "http"/"www" (e.g. the skill "http") stay.
    text = re.sub(r'\b(?:https?://|www\.)\S+', ' ', text, flags=re.IGNORECASE)
    # Apostrophes are dropped without a gap so "don't" stays one word
    text = re.sub(r"['\u2019]", '', text)
    # Replace every other unwanted character (keep +, #, ., -, /) with a
    # space; deleting it outright would fuse neighbours like "java,python".
    text = re.sub(r'[^a-zA-Z0-9\s\+\#\.\-/]', ' ', text)
    # Collapse the whitespace introduced above
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every resume element-wise, overwriting the column, then preview it
resumes_df['Resume'] = resumes_df['Resume'].map(clean_text)
resumes_df['Resume'].head()

0    skills programming languages python pandas num...
1    education details may 2013 to may 2017 b.e uit...
2    areas of interest deep learning control system...
3    skills r python sap hana tableau sap hana sql ...
4    education details mca ymcaust faridabad haryan...
Name: Resume, dtype: object

## **Step 3: Remove Short Words but Preserve Short Skills (c, r, c#)**

In [166]:
# Whitelist of short, lower-case skill tokens that must survive the
# short-word filter. Each entry is listed exactly once.
skills_to_keep = {
    # Programming languages (short forms & symbols)
    'c', 'c#', 'c++', 'r',
    'go', 'js', 'ts', 'vb',

    # Databases
    'db', 'db2', 'sql', 'pl', 'ql',

    # Data Science / Machine Learning / AI
    'ai', 'ml', 'nlp', 'cv', 'rl',

    # Web & DevOps
    'ui', 'ux', 'ci', 'cd',

    # Cloud & DevOps tools
    'aws', 'gcp', 'az', 'vm',

    # Testing & Automation
    'qa', 'qc',

    # Miscellaneous
    'bi', 'it', 'sa',
}

def filter_short_words(text, skills_to_keep):
    """Drop tokens of one or two characters unless they are whitelisted.

    Args:
        text: Text to filter; it is split on whitespace.
        skills_to_keep: Collection of lower-case tokens that are kept even
            when they are short. The lookup uses the lower-cased token.

    Returns:
        The surviving tokens, in their original form and order, joined by
        single spaces.
    """
    kept_words = []
    for word in text.split():
        # Short tokens survive only when they are on the whitelist
        if len(word) <= 2 and word.lower() not in skills_to_keep:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

# Remove 1-2 character tokens from every resume unless they are whitelisted
# in skills_to_keep, overwriting the column with the filtered text.
resumes_df['Resume'] = resumes_df['Resume'].map(
    lambda resume_text: filter_short_words(resume_text, skills_to_keep)
)

# Preview the filtered column
resumes_df['Resume'].head()

0    skills programming languages python pandas num...
1    education details may 2013 may 2017 b.e uit-rg...
2    areas interest deep learning control system de...
3    skills r python sap hana tableau sap hana sql ...
4    education details mca ymcaust faridabad haryan...
Name: Resume, dtype: object

## **Step 4: Tokenization, Stopword Removal & Lemmatization**

In [167]:
# English stopword set and a shared lemmatizer, built once for all rows
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_tokens(text, protected=None):
    """Tokenize ``text``, remove English stopwords and lemmatize the rest.

    Tokens found in ``protected`` are passed through verbatim: they are
    neither dropped as stopwords (e.g. "it") nor altered by the lemmatizer,
    so the short skills preserved in the previous step are not lost here.

    Args:
        text: Cleaned resume text.
        protected: Optional collection of lower-case tokens to keep as-is.
            Defaults to the notebook-level ``skills_to_keep`` set, which
            must already be defined when the function is called.

    Returns:
        The processed tokens joined by single spaces.
    """
    if protected is None:
        protected = skills_to_keep

    # Tokenization using nltk. word_tokenize emits symbols such as '#' as
    # separate tokens ("c#" -> "c", "#"), so glue a symbol back onto the
    # previous token when the combined form is a protected skill.
    tokens = []
    for token in word_tokenize(text):
        if (
            tokens
            and not token.isalnum()
            and (tokens[-1] + token).lower() in protected
        ):
            tokens[-1] += token
        else:
            tokens.append(token)

    # Remove stopwords & lemmatize, leaving protected tokens untouched
    processed = []
    for token in tokens:
        lowered = token.lower()
        if lowered in protected:
            processed.append(token)
        elif lowered not in stop_words:
            processed.append(lemmatizer.lemmatize(token))

    # Join back into sentence
    return ' '.join(processed)

# Store the tokenized / stopword-free / lemmatized text in a new column and
# show it side by side with the text it was derived from.
resumes_df['final_resume'] = resumes_df['Resume'].map(preprocess_tokens)
resumes_df.loc[:, ['Resume', 'final_resume']].head()

Unnamed: 0,Resume,final_resume
0,skills programming languages python pandas num...,skill programming language python panda numpy ...
1,education details may 2013 may 2017 b.e uit-rg...,education detail may 2013 may 2017 b.e uit-rgp...
2,areas interest deep learning control system de...,area interest deep learning control system des...
3,skills r python sap hana tableau sap hana sql ...,skill r python sap hana tableau sap hana sql s...
4,education details mca ymcaust faridabad haryan...,education detail mca ymcaust faridabad haryana...
