<a href="https://colab.research.google.com/github/montifar/NLP-python-LIb/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:


import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Download the NLTK resources this script depends on.
# `word_tokenize` loads the "punkt_tab" tables on recent NLTK releases
# (3.8.2+/3.9), while older releases load the legacy "punkt" pickles, so both
# are fetched to keep the script runnable regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample data: three short English sentences used to demonstrate the pipeline.
data = {
    "text": [
        "Hello, this is an example!",
        "Text preprocessing is VERY important.",
        "Natural language processing with Python."
    ]
}
df = pd.DataFrame(data)

# Initialize tools once at module level so clean_text() can reuse them.
# A set gives O(1) stop-word membership tests.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    """Normalize raw text into a space-joined string of stemmed tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are dropped; the remaining tokens are stemmed with
    the module-level ``stemmer``.

    Args:
        text: The raw text to clean. Non-string values (e.g. ``None`` or the
            float ``NaN`` that pandas uses for missing cells) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when the input
        is missing, blank, or contains no surviving tokens.
    """
    # Missing cells are not strings and would crash on ``.lower()``.
    if not isinstance(text, str):
        return ""

    # Blank input yields no tokens, so skip the tokenizer entirely.
    if not text.strip():
        return ""

    tokens = nltk.word_tokenize(text.lower())

    # Filter (alphabetic, non-stopword) and stem in a single pass.
    stems = [
        stemmer.stem(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    ]

    return " ".join(stems)

# Run every raw sentence through clean_text() and store the result alongside it.
df["cleaned_text"] = df["text"].map(clean_text)

# Build the bag-of-words document-term matrix from the cleaned sentences.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["cleaned_text"])

# Report: each section is a heading line followed by the object it describes.
for heading, payload in (
    ("Cleaned Text:", df),
    ("\nBag of Words (BOW) Feature Names:", vectorizer.get_feature_names_out()),
    ("\nBOW Matrix:", X.toarray()),
):
    print(heading)
    print(payload)


Cleaned Text:
                                       text                  cleaned_text
0                Hello, this is an example!                  hello exampl
1     Text preprocessing is VERY important.        text preprocess import
2  Natural language processing with Python.  natur languag process python

Bag of Words (BOW) Feature Names:
['exampl' 'hello' 'import' 'languag' 'natur' 'preprocess' 'process'
 'python' 'text']

BOW Matrix:
[[1 1 0 0 0 0 0 0 0]
 [0 0 1 0 0 1 0 0 1]
 [0 0 0 1 1 0 1 1 0]]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
