In [1]:
# Import required libraries
import re
import pandas as pd
import nltk

# Fetch the NLTK data packages this notebook needs, in a single pass
nltk_packages = ['punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords']
download_status = [nltk.download(package) for package in nltk_packages]

# Leave the status of the last download as the cell's displayed result
download_status[-1]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [2]:
# Toy corpus: four short sentences, each paired with a topic label
sentences = [
    "Natural Language Processing is AMAZING!",
    "I love learning NLP techniques.",
    "Text cleaning is an important step in NLP.",
    "TF-IDF and Word Embeddings are useful.",
]
topics = ["tech", "tech", "education", "tech"]
data = {"text": sentences, "label": topics}

# Build the working DataFrame (one row per sentence) and display it
df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,Natural Language Processing is AMAZING!,tech
1,I love learning NLP techniques.,tech
2,Text cleaning is an important step in NLP.,education
3,TF-IDF and Word Embeddings are useful.,tech


In [3]:
# Function for text cleaning
def clean_text(text):
    """Normalize raw text into lowercase, letters-only, single-spaced form.

    The steps are applied in this order:
      1. lowercase the input;
      2. delete every character that is not ``a``-``z`` or whitespace.
         Punctuation, digits and non-ASCII letters are removed outright
         (not replaced by a space), so ``"TF-IDF"`` becomes ``"tfidf"``;
      3. collapse each run of whitespace into a single space;
      4. strip leading and trailing whitespace.

    Args:
        text: The raw text. Any value that is not a ``str`` (for example
            ``None`` or a float ``NaN``) is treated as empty text.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Without this guard ``.lower()`` raises AttributeError on None/NaN,
        # which would abort cleaning of the entire column.
        return ""

    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

# Run the cleaner over every raw sentence and store the result in a new column
cleaned_column = df["text"].apply(clean_text)
df["clean_text"] = cleaned_column
df


Unnamed: 0,text,label,clean_text
0,Natural Language Processing is AMAZING!,tech,natural language processing is amazing
1,I love learning NLP techniques.,tech,i love learning nlp techniques
2,Text cleaning is an important step in NLP.,education,text cleaning is an important step in nlp
3,TF-IDF and Word Embeddings are useful.,tech,tfidf and word embeddings are useful


In [4]:
from nltk.tokenize import word_tokenize

# Split each cleaned sentence into a list of word tokens
token_lists = df["clean_text"].apply(word_tokenize)
df["tokens"] = token_lists
df


Unnamed: 0,text,label,clean_text,tokens
0,Natural Language Processing is AMAZING!,tech,natural language processing is amazing,"[natural, language, processing, is, amazing]"
1,I love learning NLP techniques.,tech,i love learning nlp techniques,"[i, love, learning, nlp, techniques]"
2,Text cleaning is an important step in NLP.,education,text cleaning is an important step in nlp,"[text, cleaning, is, an, important, step, in, ..."
3,TF-IDF and Word Embeddings are useful.,tech,tfidf and word embeddings are useful,"[tfidf, and, word, embeddings, are, useful]"


In [5]:
from nltk.corpus import stopwords

# English stopword list, held as a set for fast membership tests
stop_words = set(stopwords.words("english"))


def _drop_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    return [token for token in tokens if token not in stop_words]


# Filter every row's token list and keep the result in its own column
df["tokens_no_stopwords"] = df["tokens"].apply(_drop_stopwords)

df


Unnamed: 0,text,label,clean_text,tokens,tokens_no_stopwords
0,Natural Language Processing is AMAZING!,tech,natural language processing is amazing,"[natural, language, processing, is, amazing]","[natural, language, processing, amazing]"
1,I love learning NLP techniques.,tech,i love learning nlp techniques,"[i, love, learning, nlp, techniques]","[love, learning, nlp, techniques]"
2,Text cleaning is an important step in NLP.,education,text cleaning is an important step in nlp,"[text, cleaning, is, an, important, step, in, ...","[text, cleaning, important, step, nlp]"
3,TF-IDF and Word Embeddings are useful.,tech,tfidf and word embeddings are useful,"[tfidf, and, word, embeddings, are, useful]","[tfidf, word, embeddings, useful]"


In [6]:
from nltk.stem import WordNetLemmatizer

# One lemmatizer instance shared by every row
lemmatizer = WordNetLemmatizer()

# Lemmatize token by token (default lemmatizer arguments); each output list
# has the same length and order as its input list
df["lemmatized_tokens"] = df["tokens_no_stopwords"].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

df


Unnamed: 0,text,label,clean_text,tokens,tokens_no_stopwords,lemmatized_tokens
0,Natural Language Processing is AMAZING!,tech,natural language processing is amazing,"[natural, language, processing, is, amazing]","[natural, language, processing, amazing]","[natural, language, processing, amazing]"
1,I love learning NLP techniques.,tech,i love learning nlp techniques,"[i, love, learning, nlp, techniques]","[love, learning, nlp, techniques]","[love, learning, nlp, technique]"
2,Text cleaning is an important step in NLP.,education,text cleaning is an important step in nlp,"[text, cleaning, is, an, important, step, in, ...","[text, cleaning, important, step, nlp]","[text, cleaning, important, step, nlp]"
3,TF-IDF and Word Embeddings are useful.,tech,tfidf and word embeddings are useful,"[tfidf, and, word, embeddings, are, useful]","[tfidf, word, embeddings, useful]","[tfidf, word, embeddings, useful]"


In [7]:
# Rebuild one space-separated string per row from its lemmatized tokens
final_sentences = df["lemmatized_tokens"].apply(" ".join)
df["final_text"] = final_sentences
df


Unnamed: 0,text,label,clean_text,tokens,tokens_no_stopwords,lemmatized_tokens,final_text
0,Natural Language Processing is AMAZING!,tech,natural language processing is amazing,"[natural, language, processing, is, amazing]","[natural, language, processing, amazing]","[natural, language, processing, amazing]",natural language processing amazing
1,I love learning NLP techniques.,tech,i love learning nlp techniques,"[i, love, learning, nlp, techniques]","[love, learning, nlp, techniques]","[love, learning, nlp, technique]",love learning nlp technique
2,Text cleaning is an important step in NLP.,education,text cleaning is an important step in nlp,"[text, cleaning, is, an, important, step, in, ...","[text, cleaning, important, step, nlp]","[text, cleaning, important, step, nlp]",text cleaning important step nlp
3,TF-IDF and Word Embeddings are useful.,tech,tfidf and word embeddings are useful,"[tfidf, and, word, embeddings, are, useful]","[tfidf, word, embeddings, useful]","[tfidf, word, embeddings, useful]",tfidf word embeddings useful


In [8]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps each distinct label string to an integer code
le = LabelEncoder()

# Learn the label set and produce the codes in one call, then attach them
# to the frame as a new column
encoded_labels = le.fit_transform(df["label"])
df["label_encoded"] = encoded_labels
df


Unnamed: 0,text,label,clean_text,tokens,tokens_no_stopwords,lemmatized_tokens,final_text,label_encoded
0,Natural Language Processing is AMAZING!,tech,natural language processing is amazing,"[natural, language, processing, is, amazing]","[natural, language, processing, amazing]","[natural, language, processing, amazing]",natural language processing amazing,1
1,I love learning NLP techniques.,tech,i love learning nlp techniques,"[i, love, learning, nlp, techniques]","[love, learning, nlp, techniques]","[love, learning, nlp, technique]",love learning nlp technique,1
2,Text cleaning is an important step in NLP.,education,text cleaning is an important step in nlp,"[text, cleaning, is, an, important, step, in, ...","[text, cleaning, important, step, nlp]","[text, cleaning, important, step, nlp]",text cleaning important step nlp,0
3,TF-IDF and Word Embeddings are useful.,tech,tfidf and word embeddings are useful,"[tfidf, and, word, embeddings, are, useful]","[tfidf, word, embeddings, useful]","[tfidf, word, embeddings, useful]",tfidf word embeddings useful,1


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with its default settings
tfidf = TfidfVectorizer()

# Learn the vocabulary from the final text and compute the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df["final_text"])

# Dense copy of the matrix, one column per vocabulary term, for easy viewing
dense_scores = tfidf_matrix.toarray()
vocabulary_terms = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(dense_scores, columns=vocabulary_terms)

tfidf_df


Unnamed: 0,amazing,cleaning,embeddings,important,language,learning,love,natural,nlp,processing,step,technique,text,tfidf,useful,word
0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.525473,0.525473,0.0,0.414289,0.0,0.0,0.525473,0.0,0.0,0.0,0.0
2,0.0,0.465162,0.0,0.465162,0.0,0.0,0.0,0.0,0.366739,0.0,0.465162,0.0,0.465162,0.0,0.0,0.0
3,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5


In [10]:
# Write both tables to CSV without the row index: the processed dataset
# first, then the TF-IDF feature table
for frame, csv_name in (
    (df, "cleaned_text_data.csv"),
    (tfidf_df, "tfidf_features.csv"),
):
    frame.to_csv(csv_name, index=False)

print("Files saved successfully!")


Files saved successfully!
