In [1]:
import pandas as pd

# In the next line, we'll create a dictionary called data.
# A dictionary in Python stores data as key–value pairs.
# The first key, "doc_id", represents a unique identifier for each document or text entry.
# These IDs help us keep track of individual text records.
# The second key, "text", contains a list of sentences.
# These sentences are examples of text data, which is what we typically work with in Natural Language Processing (NLP).
# Notice that each sentence is written as a string and placed inside a list.
# Each sentence can be thought of as a separate document.
# We then convert this dictionary into a pandas DataFrame using pd.DataFrame(data).

# One identifier per document, in the same order as the sentences below.
doc_ids = [1, 2, 3, 4, 5]

# Each string is treated as a separate document.
texts = [
    "I love learning Python for data analytics.",
    "Natural language processing helps computers understand humans.",
    "Chatbots are used in customer support.",
    "Text data is messy but very powerful.",
    "NLP is an exciting field of artificial intelligence.",
]

# The dictionary keys become the DataFrame's column names.
data = dict(doc_id=doc_ids, text=texts)

df = pd.DataFrame(data)
df


Unnamed: 0,doc_id,text
0,1,I love learning Python for data analytics.
1,2,Natural language processing helps computers un...
2,3,Chatbots are used in customer support.
3,4,Text data is messy but very powerful.
4,5,NLP is an exciting field of artificial intelli...


In [2]:
# Next: Tokenization
# Tokenization is a foundational step in NLP.
# Almost all text analysis tasks—such as word frequency, sentiment analysis, or text classification—start with tokenizing text into sentences or words.
# At this point, we are still preparing the data, not analyzing meaning yet.
# This step helps transform unstructured text into a form that computers can work with.

# In the following step, we are introducing NLTK, which stands for Natural Language Toolkit.
# It is a popular Python library used for working with human language data.
# We import two specific functions from NLTK:
# sent_tokenize() for breaking text into sentences
# word_tokenize() for breaking text into individual words

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Next, we run nltk.download("punkt").
# This downloads a pretrained tokenizer model that NLTK uses to correctly identify sentence boundaries and words, including punctuation.

# We also download punkt_tab, which provides supporting lookup tables (metadata) used by the tokenizer.
# Newer versions of NLTK sometimes require punkt_tab explicitly.
# The value displayed under the cell (True) is the return value of the last nltk.download() call.

nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MuriloFarias\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\MuriloFarias\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [3]:

# Merge every row of the "text" column into one space-separated string,
# so the whole corpus can be tokenized in a single pass.

text = " ".join(df["text"].tolist())

In [4]:
# Show the combined string to confirm the documents were joined with single spaces.
text

'I love learning Python for data analytics. Natural language processing helps computers understand humans. Chatbots are used in customer support. Text data is messy but very powerful. NLP is an exciting field of artificial intelligence.'

In [5]:

# Recall that text was built by joining all rows of the text column together, separated by spaces, so we can process the entire corpus at once.
# The variable sentences stores the output of sent_tokenize(text): a list with one string per detected sentence.
# This function automatically detects where sentences begin and end, even when punctuation is involved.

sentences = sent_tokenize(text)

In [6]:
# Check the result: each original document is a single sentence, so we expect one list element per document.
print(sentences)

['I love learning Python for data analytics.', 'Natural language processing helps computers understand humans.', 'Chatbots are used in customer support.', 'Text data is messy but very powerful.', 'NLP is an exciting field of artificial intelligence.']


In [7]:


# The variable words stores the output of word_tokenize(text).
# This splits the text into individual tokens, which include words and punctuation marks.

words = word_tokenize(text)

# Earlier, printing sentences showed a list where each element is a complete sentence.
# Here, printing words[:30] displays only the first 30 tokens; note that "." appears as its own token.


print(words[:30])

['I', 'love', 'learning', 'Python', 'for', 'data', 'analytics', '.', 'Natural', 'language', 'processing', 'helps', 'computers', 'understand', 'humans', '.', 'Chatbots', 'are', 'used', 'in', 'customer', 'support', '.', 'Text', 'data', 'is', 'messy', 'but', 'very', 'powerful']


In [10]:
# Next: Removing Stop Words and Cleaning Text

# Removing stop words is an important preprocessing step in NLP because it:
# - Reduces noise in the data
# - Improves efficiency
# - Helps models focus on informative words
# After this step, the text is ready for further analysis.

# At this stage, we are cleaning our tokenized text so that it becomes more useful for analysis.
# We start by importing stopwords from the nltk.corpus module.
# Stop words are very common words in a language that usually do not add much meaning to text analysis.
# Examples of stop words include words such as the, is, and, to, in, of.
# These words appear very frequently but don’t help us understand what the text is about.

from nltk.corpus import stopwords

In [11]:
# This downloads a predefined list of English stop words provided by NLTK.
# The value displayed under the cell (True) is the return value of nltk.download().

nltk.download("stopwords")  # Downloads the stop-words dataset to disk (your computer / Colab environment)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MuriloFarias\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [14]:
# stopwords.words("english") returns the English stop words as a Python list.
# In the cells shown here, this list version is only displayed; the filtering step uses the set version instead.
stop_words2 = stopwords.words("english")  # this is a list

In [15]:
# Display the list version of the stop words (square brackets in the output indicate a list).
stop_words2

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [12]:

# For comparison, the commented-out line below would keep the stop words as a list:
#stop_words = stopwords.words("english") # this is a list

# We then create a variable called stop_words.
# Wrapping the list in set() makes membership checks faster when we later test whether a word is a stop word.

stop_words = set(stopwords.words("english"))  # this is a set

In [13]:
# Display the set version of the stop words (curly braces in the output indicate a set).
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [16]:


# We convert the list of stop words into a set because sets allow much faster lookups
# when checking whether a word should be removed.
# clean_words is built in two small steps:
# 1. Keep only alphabetic tokens (w.isalpha() drops numbers and punctuation) and lowercase
#    them with w.lower(), so that words like Python and python are treated the same.
# 2. Drop every lowercase token that appears in stop_words.
# As a result, clean_words contains only lowercase, alphabet-only words that carry
# more meaningful information.

alphabetic_lowercase = (w.lower() for w in words if w.isalpha())
clean_words = [token for token in alphabetic_lowercase if token not in stop_words]




In [17]:
# This shows the first 30 cleaned words, which helps us quickly inspect whether the cleaning process worked as expected without printing the entire list.
# With this small corpus the list has fewer than 30 words, so the slice returns all of them.

clean_words[:30]

['love',
 'learning',
 'python',
 'data',
 'analytics',
 'natural',
 'language',
 'processing',
 'helps',
 'computers',
 'understand',
 'humans',
 'chatbots',
 'used',
 'customer',
 'support',
 'text',
 'data',
 'messy',
 'powerful',
 'nlp',
 'exciting',
 'field',
 'artificial',
 'intelligence']

In [19]:
# Python recap from previous semesters: lists are ordered and allow duplicates; sets are unordered and discard duplicates.
my_list = ["python", "python", "data"]
display(my_list)

# The same three values written as a set literal: the repeated "python" is kept only once.
my_set = {"python", "python", "data"}
display(my_set)

['python', 'python', 'data']

{'data', 'python'}

In [20]:
# Next: Lemmatization using WordNet
# Lemmatization converts words into their base or dictionary form, called a lemma.
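# For example (using NLTK's WordNetLemmatizer):
# running → run (requires pos="v"; by default every word is treated as a noun)
# cars → car (works with the default noun setting)
# better → good (requires pos="a", i.e. the word must be tagged as an adjective)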

# Lemmatization is useful because it:
# Reduces word variation
# Improves consistency
# Helps models treat related words as the same feature

# We begin by importing WordNetLemmatizer from nltk.stem.
# WordNet is a large lexical database of English that helps NLTK understand word meanings and relationships.


from nltk.stem import WordNetLemmatizer

In [21]:

# Next, we download the resources the lemmatizer needs:
# - wordnet provides the dictionary used for lemmatization
# - omw-1.4 (Open Multilingual WordNet) supports word mappings and improves coverage

for resource in ("wordnet", "omw-1.4"):
    nltk.download(resource)

# We then create a lemmatizer object. This object will be used to convert words into their base forms.

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MuriloFarias\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MuriloFarias\AppData\Roaming\nltk_data...


In [22]:

# We apply the lemmatizer to every word in clean_words. At this point:
# - The words are already lowercase
# - Stop words and punctuation have been removed
# Lemmatization further standardizes the text.
# lemmatize() is called with its default part of speech (noun), so plural nouns are reduced
# (e.g. computers → computer) while verb forms such as learning and used stay unchanged.

lemmatized_words = list(map(lemmatizer.lemmatize, clean_words))

# Inspect the first 30 lemmatized words and confirm that the transformation worked as expected.

lemmatized_words[:30]


['love',
 'learning',
 'python',
 'data',
 'analytics',
 'natural',
 'language',
 'processing',
 'help',
 'computer',
 'understand',
 'human',
 'chatbots',
 'used',
 'customer',
 'support',
 'text',
 'data',
 'messy',
 'powerful',
 'nlp',
 'exciting',
 'field',
 'artificial',
 'intelligence']

In [23]:
# Next: Word Frequency Analysis using NLTK

# In this step, we analyze our cleaned and lemmatized text by looking at word frequencies.
# We start by importing FreqDist from nltk.probability.
# A frequency distribution simply counts how often each word appears in the text.


from nltk.probability import FreqDist

In [24]:


# Build the frequency distribution: FreqDist goes through the list of lemmatized words and counts each unique word.

freq_dist = FreqDist(lemmatized_words)

# Collect the top 10 most frequent words with their counts as (word, frequency) pairs, then display them.

top_words = freq_dist.most_common(10)
top_words


[('data', 2),
 ('love', 1),
 ('learning', 1),
 ('python', 1),
 ('analytics', 1),
 ('natural', 1),
 ('language', 1),
 ('processing', 1),
 ('help', 1),
 ('computer', 1)]

In [None]:
# Next: Converting Word Frequencies into a DataFrame (and optionally saving it to a file)

# freq_dist.most_common() returns a list of tuples in the form (word, frequency).
# We convert this list into a DataFrame with two columns:
# word → the text token
# count → how many times the word appears
# This transformation is important because most data analysis tools work with tables, not raw Python objects.

freq_df = pd.DataFrame(freq_dist.most_common(), columns=["word", "count"])

# Saving is opt-in: word_frequencies.csv is only written when SAVE_FREQUENCIES_CSV is True,
# so running the notebook does not create a file by default.
SAVE_FREQUENCIES_CSV = False
if SAVE_FREQUENCIES_CSV:
    freq_df.to_csv("word_frequencies.csv", index=False)

freq_df.head()  # shows the first five rows of the DataFrame


Unnamed: 0,word,count
0,data,2
1,love,1
2,learning,1
3,python,1
4,analytics,1
