<a href="https://colab.research.google.com/github/loki20051267/NLP/blob/main/aug8th.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd


# Sample resumes containing raw newlines, bullet characters and punctuation.
resume_samples = [
    "John Doe\n• Skills: Python, Java, SQL • Experience: 2 years",
    "Jane Smith\n• Expert in Machine Learning, Data Analysis, C++",
    "Mark Lee\n• Worked on AI, Deep Learning & NLP projects\nExperience: 3 years",
]
data = {"resume_text": resume_samples}

# One row per resume, with the text held in a single "resume_text" column.
df = pd.DataFrame(data)

# Tabular preview; pandas truncates the long strings in this view.
print("First 3 Rows of Resumes:")
print(df.head(3))

# Print every resume in full so the newlines and bullet symbols are visible.
print("\nChecking for noisy characters like \\n, •, symbols:")
for position, resume in enumerate(df["resume_text"], start=1):
    print(f"Resume {position}: {resume}")


First 3 Rows of Resumes:
                                         resume_text
0  John Doe\n• Skills: Python, Java, SQL • Experi...
1  Jane Smith\n• Expert in Machine Learning, Data...
2  Mark Lee\n• Worked on AI, Deep Learning & NLP ...

Checking for noisy characters like \n, •, symbols:
Resume 1: John Doe
• Skills: Python, Java, SQL • Experience: 2 years
Resume 2: Jane Smith
• Expert in Machine Learning, Data Analysis, C++
Resume 3: Mark Lee
• Worked on AI, Deep Learning & NLP projects
Experience: 3 years


# **NLTK Preprocessing**

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

# Fetch the tokenizer models and the stopword list used below.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("punkt_tab")  # Download the punkt_tab resource


stop_words = set(stopwords.words("english"))
ps = PorterStemmer()


def clean_and_stem(text):
    """Return the stemmed, stopword-free, lowercased tokens of one resume.

    Args:
        text: Raw resume text; may contain newlines, bullets, digits and
            punctuation.

    Returns:
        A list of Porter-stemmed tokens in their original order.
    """
    # 1. Replace special chars & digits with a space rather than deleting
    #    them. Deleting fuses words that are separated only by punctuation
    #    (e.g. "Python,Java" -> "PythonJava", "AI/ML" -> "AIML"); a space
    #    keeps them as separate tokens.
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", text)

    # 2. Tokenize
    tokens = nltk.word_tokenize(letters_only.lower())

    # 3. Remove stopwords, then 4. stem whatever is left
    return [ps.stem(w) for w in tokens if w not in stop_words]


# Pool the stemmed tokens of every resume into one flat list.
cleaned_texts = []
for text in df["resume_text"]:
    cleaned_texts.extend(clean_and_stem(text))

# Top 10 frequent words
freq_words = Counter(cleaned_texts).most_common(10)
print("\nTop 10 frequent stemmed words:")
print(freq_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Top 10 frequent stemmed words:
[('experi', 2), ('year', 2), ('learn', 2), ('john', 1), ('doe', 1), ('skill', 1), ('python', 1), ('java', 1), ('sql', 1), ('jane', 1)]


# **spaCy**

In [4]:
import spacy
from collections import Counter

# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Gather the lowercased lemma of every purely alphabetic token that spaCy
# tags as a noun or a verb, across all resumes, in document order.
lemmas = []
for resume in df["resume_text"]:
    lemmas.extend(
        tok.lemma_.lower()
        for tok in nlp(resume)
        if tok.is_alpha and tok.pos_ in ("NOUN", "VERB")
    )

# Ten most common lemmas, most frequent first.
lemma_counts = Counter(lemmas)
lemma_freq = lemma_counts.most_common(10)
print("\nTop 10 frequent lemmas (spaCy):")
print(lemma_freq)



Top 10 frequent lemmas (spaCy):
[('experience', 2), ('year', 2), ('expert', 1), ('work', 1), ('project', 1)]
