In [None]:
!pip install python-docx

In [None]:
# Import Libraries
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter
import docx
import spacy
import os
from google.colab import drive

In [None]:
# Download necessary NLTK data.
# 'averaged_perceptron_tagger_eng' is listed here as well: the lemmatization
# step calls nltk.pos_tag and the notebook fetches that resource before using
# it, so keeping every resource in one place lets this setup cell prepare a
# fresh runtime on its own.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)

# nltk.download() returns False when a package could not be fetched; report
# those now instead of silently continuing to a LookupError in a later cell.
failed_resources = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_resources:
    print(f"WARNING: failed to download NLTK resources: {failed_resources}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Load spaCy English model for parsing.
# The resulting `nlp` pipeline object is a notebook-level global that
# parse_sentences() further down relies on.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Mount Google Drive at /content/drive so that the document path used below
# (which starts with /content/drive) can be opened.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load Word Document
def load_docx(file_path):
    """Read a .docx file and return its text as a single string.

    Args:
        file_path: Location of the document, passed straight to docx.Document.

    Returns:
        The text of every paragraph in the document, in document order,
        joined with newline characters.
    """
    document = docx.Document(file_path)
    return '\n'.join(para.text for para in document.paragraphs)

In [None]:
# Update the path with your document location in Google Drive.
# The path sits under the /content/drive mount point created by drive.mount().
file_path = '/content/drive/My Drive/Colab Notebooks/nlp/sample.docx'
# Full text of the document as one string (paragraphs separated by newlines).
document_text = load_docx(file_path)

In [None]:
# Sanity check: show the first 200 characters of the loaded text.
print(document_text[:200])

China's overqualified youth taking jobs as drivers, labourers and film extras.

China is now a country where a high-school handyman has a master's degree in physics; a cleaner is qualified in environm


In [None]:
# Segmentation: split the document text into a list of sentence strings
# using NLTK's sent_tokenize.
sentences = sent_tokenize(document_text)

In [None]:
# Peek at the first five sentences.
print(sentences[:5])

["China's overqualified youth taking jobs as drivers, labourers and film extras.", "China is now a country where a high-school handyman has a master's degree in physics; a cleaner is qualified in environmental planning; a delivery driver studied philosophy, and a PhD graduate from the prestigious Tsinghua University ends up applying to work as an auxiliary police officer.", 'These are real cases in a struggling economy - and it is not hard to find more like them.', '"My dream job was to work in investment banking," says Sun Zhan as he prepares to start his shift as a waiter in a hot pot restaurant in the southern city of Nanjing.', "The 25-year-old recently graduated with a master's degree in finance."]


In [None]:
# Tokenization
# Word-tokenize every sentence in turn and flatten the per-sentence token
# lists into a single list, preserving sentence order.
words = [
    token
    for sentence_text in sentences
    for token in word_tokenize(sentence_text)
]

In [None]:
# Peek at the first ten tokens (punctuation tokens are still present here).
print(words[:10])

['China', "'s", 'overqualified', 'youth', 'taking', 'jobs', 'as', 'drivers', ',', 'labourers']


In [None]:
# Remove Stopwords
stop_words = set(stopwords.words('english'))


def _is_content_word(token):
    """Return True for purely alphabetic tokens that are not English stop words.

    The stop-word comparison is done on the lower-cased token; the token
    itself is kept in its original casing by the caller.
    """
    return token.isalpha() and token.lower() not in stop_words


filtered_words = list(filter(_is_content_word, words))

In [None]:
# First ten tokens left after stop-word and non-alphabetic filtering.
print(filtered_words[:10])

['China', 'overqualified', 'youth', 'taking', 'jobs', 'drivers', 'labourers', 'film', 'extras', 'China']


In [None]:
#Stemming
#stemmer = PorterStemmer()
#stemmed_words = [stemmer.stem(word) for word in filtered_words]

In [None]:
#print(stemmed_words[:10])

In [None]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
# nltk.pos_tag needs this tagger resource; make sure it is present before the
# first tagging call below.
nltk.download('averaged_perceptron_tagger_eng')

# First letter of the tag returned by nltk.pos_tag -> WordNet POS constant.
# Built once here (from the `wordnet` corpus object imported at the top of the
# notebook) instead of being re-imported and rebuilt on every call.
_TAG_PREFIX_TO_WORDNET_POS = {
    'J': wordnet.ADJ,   # Adjective
    'V': wordnet.VERB,  # Verb
    'N': wordnet.NOUN,  # Noun
    'R': wordnet.ADV,   # Adverb
}


# Function to get POS tag for lemmatization
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged on its own (a one-element list, so without any
    sentence context) and the upper-cased first character of the resulting
    tag selects the WordNet category.

    Args:
        word: A single token string.

    Returns:
        The WordNet constant for adjective, verb, noun or adverb; any other
        tag falls back to noun.
    """
    tag_prefix = nltk.pos_tag([word])[0][1][0].upper()
    return _TAG_PREFIX_TO_WORDNET_POS.get(tag_prefix, wordnet.NOUN)


# Tag each distinct word only once: every word is tagged in isolation, so
# repeated occurrences would get the same answer, and the tagger call is the
# expensive part of this step.
pos_by_word = {word: get_wordnet_pos(word) for word in set(filtered_words)}
lemmatized_words = [lemmatizer.lemmatize(word, pos_by_word[word]) for word in filtered_words]


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [None]:
# First ten lemmatized tokens.
print(lemmatized_words[:10])

['China', 'overqualified', 'youth', 'take', 'job', 'driver', 'labourer', 'film', 'extra', 'China']


In [None]:
# Parsing for Syntactic Analysis
def parse_sentences(sentences):
    """Run sentences through the spaCy pipeline and collect token annotations.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one entry per input sentence, in input order. Each entry
        is a list of dicts, one per token, with the keys "text", "lemma",
        "pos", "tag" and "dependency" taken from the token's text, lemma_,
        pos_, tag_ and dep_ attributes.
    """
    # nlp.pipe streams the texts through the pipeline in batches and yields
    # the Docs in input order, avoiding one separate pipeline call per sentence.
    return [
        [
            {
                "text": token.text,
                "lemma": token.lemma_,
                "pos": token.pos_,
                "tag": token.tag_,
                "dependency": token.dep_,
            }
            for token in doc
        ]
        for doc in nlp.pipe(sentences)
    ]

# One list of token dicts per sentence, in the same order as `sentences`.
parsed_sentences = parse_sentences(sentences)


In [None]:
# Display Parsing Results for the First Sentence
# Each printed line is the annotation dict of one token of that sentence.
print("Parsed Data for the First Sentence:")
first_sentence_tokens = parsed_sentences[0]
for token_record in first_sentence_tokens:
    print(token_record)

Parsed Data for the First Sentence:
{'text': 'China', 'lemma': 'China', 'pos': 'PROPN', 'tag': 'NNP', 'dependency': 'poss'}
{'text': "'s", 'lemma': "'s", 'pos': 'PART', 'tag': 'POS', 'dependency': 'case'}
{'text': 'overqualified', 'lemma': 'overqualifie', 'pos': 'VERB', 'tag': 'VBN', 'dependency': 'amod'}
{'text': 'youth', 'lemma': 'youth', 'pos': 'NOUN', 'tag': 'NN', 'dependency': 'nsubj'}
{'text': 'taking', 'lemma': 'take', 'pos': 'VERB', 'tag': 'VBG', 'dependency': 'ROOT'}
{'text': 'jobs', 'lemma': 'job', 'pos': 'NOUN', 'tag': 'NNS', 'dependency': 'dobj'}
{'text': 'as', 'lemma': 'as', 'pos': 'ADP', 'tag': 'IN', 'dependency': 'prep'}
{'text': 'drivers', 'lemma': 'driver', 'pos': 'NOUN', 'tag': 'NNS', 'dependency': 'pobj'}
{'text': ',', 'lemma': ',', 'pos': 'PUNCT', 'tag': ',', 'dependency': 'punct'}
{'text': 'labourers', 'lemma': 'labourer', 'pos': 'NOUN', 'tag': 'NNS', 'dependency': 'conj'}
{'text': 'and', 'lemma': 'and', 'pos': 'CCONJ', 'tag': 'CC', 'dependency': 'cc'}
{'text': 'fi

In [None]:
# Count word frequencies (using lemmatized words).
# Counter keys are the lemmatized strings exactly as produced above, so the
# counts are case-sensitive.
word_freq = Counter(lemmatized_words)

In [None]:
# Display Top Keywords
# most_common(10) gives the ten highest-count (keyword, count) pairs,
# ordered from most to least frequent.
top_keywords = word_freq.most_common(10)
print("\nTop Keywords and Frequencies:")
for entry in top_keywords:
    keyword, freq = entry
    print(f"{keyword}: {freq}")


Top Keywords and Frequencies:
job: 5
graduate: 4
China: 3
work: 3
driver: 2
master: 2
degree: 2
qualify: 2
study: 2
real: 2
