In [4]:
# NLP Preprocessing - Punkt-free version

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
import nltk

# Tokenizer shared by the preprocessing step (Toktok, i.e. Punkt-free)
tokenizer = ToktokTokenizer()

# Fetch the NLTK stopword corpus, then load the English list as a set
# so that membership checks during filtering are cheap
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Small in-memory example corpus, one document per row
data = pd.DataFrame(
    {
        'text': [
            "I love machine learning and AI.",
            "Natural Language Processing is fun!",
            "Deep learning models are powerful.",
            "Stopword removal and tokenization help NLP.",
        ],
    }
)

def preprocess(text):
    """Normalise one document into a space-separated string of content words.

    The text is lower-cased, tokenized with the module-level Toktok
    ``tokenizer``, and reduced to purely alphabetic tokens that are not in
    the module-level ``stop_words`` set.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        no token survives the filtering).
    """
    kept = []
    for token in tokenizer.tokenize(text.lower()):
        # Toktok only splits off the period that ends the whole string, so in
        # multi-sentence input a word such as "learning." keeps its dot and
        # would be silently discarded by the isalpha() check (and a stopword
        # such as "it." would slip past the stopword check). Strip trailing
        # periods first; pure-punctuation tokens become "" and are dropped.
        word = token.rstrip('.')
        if word.isalpha() and word not in stop_words:
            kept.append(word)
    return ' '.join(kept)

# Clean every document and keep the result next to the raw text
data['cleaned_text'] = data['text'].map(preprocess)
print(data)

# Fit a TF-IDF model on the cleaned corpus and encode it in one step;
# the resulting matrix has one row per document and one column per term
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(data['cleaned_text'])
print("TF-IDF shape:", X_tfidf.shape)



                                          text  \
0              I love machine learning and AI.   
1          Natural Language Processing is fun!   
2           Deep learning models are powerful.   
3  Stopword removal and tokenization help NLP.   

                             cleaned_text  
0                love machine learning ai  
1         natural language processing fun  
2           deep learning models powerful  
3  stopword removal tokenization help nlp  
TF-IDF shape: (4, 16)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
