In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample Data: five example sentences, each paired with its own class label.
# Rows are given as (text, label) tuples; column names are supplied explicitly.
data = pd.DataFrame(
    [
        ("Text cleaning is an important step in natural language processing!", 'A'),
        ("Lemmatization reduces words to their base form, e.g., 'running' to 'run'.", 'B'),
        ("Stop words like 'and', 'the', 'is' are often removed before text analysis.", 'C'),
        ("Label encoding converts categorical labels into numerical values.", 'D'),
        ("TF-IDF is a widely used technique to represent text data numerically.", 'E'),
    ],
    columns=['text', 'label'],
)

# Text Cleaning
def clean_text(text):
    """Return *text* reduced to lowercase ASCII letters and whitespace.

    Every character that is not ``a-z``, ``A-Z`` or whitespace (digits,
    punctuation, symbols) is deleted outright -- nothing is inserted in its
    place, so ``"TF-IDF"`` becomes ``"tfidf"``. Whitespace is left exactly
    as it was; it is neither collapsed nor trimmed.
    """
    # Strip first, lowercase second: after stripping only ASCII letters and
    # whitespace remain, so lowercasing cannot introduce new characters.
    letters_only = re.compile(r'[^a-zA-Z\s]').sub('', text)
    return letters_only.lower()

# Normalize every raw sentence before any token-level processing.
data['cleaned_text'] = data['text'].apply(clean_text)

# Lemmatization
# word_tokenize needs the Punkt tokenizer models. Recent NLTK releases
# (3.9 and later) look them up under the 'punkt_tab' resource, while older
# releases use 'punkt'. Request both so tokenization works on either; a
# download that cannot be satisfied is reported by NLTK but does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNet is the lexical database backing WordNetLemmatizer.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Return *text* with every token replaced by its lemma.

    The text is split with ``word_tokenize``; each token is passed to the
    module-level ``lemmatizer`` without a part-of-speech argument, and the
    results are joined back together with single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))

# Lemmatize each cleaned sentence into its own column.
data['lemmatized_text'] = data['cleaned_text'].map(lemmatize_text)

# Removing Stop Words
# Fetch the stop-word corpus, then hold the English list as a set so the
# per-token membership checks in remove_stopwords are constant-time.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def remove_stopwords(text):
    """Return *text* with English stop words filtered out.

    The text is split with ``word_tokenize`` and every token found in the
    module-level ``stop_words`` set is dropped; the remaining tokens are
    joined with single spaces.

    Tokens are compared in lowercase so that capitalized stop words
    ("The", "And") are removed too. Tokens that are kept retain their
    original casing. For input that is already lowercase the result is the
    same as a plain case-sensitive comparison.
    """
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)

# Drop stop words from the lemmatized sentences.
data['stopwords_removed'] = data['lemmatized_text'].apply(remove_stopwords)

# Label Encoding
label_encoder = LabelEncoder()
data['encoded_label'] = label_encoder.fit_transform(data['label'])

# TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_representation = tfidf_vectorizer.fit_transform(data['stopwords_removed'])

# Combine dataframes
# Vocabulary terms can coincide with existing column names (the sample
# sentences contain the words "text" and "label"), which would leave the
# combined frame with duplicate, ambiguous columns. Prefix every TF-IDF
# column, and reuse data's index so rows line up even if data is not
# indexed 0..n-1.
tfidf_df = pd.DataFrame(
    tfidf_representation.toarray(),
    columns=[f'tfidf_{term}' for term in tfidf_vectorizer.get_feature_names_out()],
    index=data.index,
)
# Class -> integer-code lookup table (one row per distinct label). It is a
# per-class table, not per-sample, so it is kept separate from the row-wise
# concatenation below; each row of `data` already carries both 'label' and
# 'encoded_label'.
label_mapping = pd.DataFrame({'label': label_encoder.classes_, 'encoded_label': label_encoder.transform(label_encoder.classes_)})

# Concatenate dataframes
final_data = pd.concat([data, tfidf_df], axis=1)

# Save the combined dataframe to a CSV file
final_data.to_csv('processed_data.csv', index=False)





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
