In [None]:
# Text preprocessing pipeline: clean the raw text, lemmatize it, remove stop words,
# and label-encode the targets; then build TF-IDF features and save all outputs to disk.


In [None]:
import re
import nltk
import pandas as pd
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Fetch the NLTK corpora used below: the English stop-word list, WordNet
# (used by the lemmatizer), and the Open Multilingual WordNet data.
for resource_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

In [None]:
# Sample dataset: three short sentences, all carrying the same label.
sample_texts = [
    "Artificial intelligence is transforming healthcare",
    "Machine learning enables intelligent systems",
    "Natural language processing is a core AI field",
]
sample_labels = ['technology'] * len(sample_texts)

data = {'text': sample_texts, 'label': sample_labels}

# One row per sentence, with columns 'text' and 'label'.
df = pd.DataFrame(data)

In [None]:
# Text Cleaning Function
def clean_text(text):
    """Normalize a piece of raw text for downstream tokenization.

    The text is lower-cased, every character that is not an ASCII letter
    (a-z) or whitespace is deleted, and runs of whitespace are collapsed to
    a single space with leading/trailing whitespace stripped.

    Parameters
    ----------
    text : str or any scalar
        The raw text. Missing values (``None``, ``NaN``, ``pd.NA``) are
        treated as empty text; any other non-string value is converted with
        ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text; ``''`` when the input is missing or contains no
        ASCII letters.
    """
    if not isinstance(text, str):
        # DataFrame text columns frequently contain None/NaN; calling
        # .lower() on those would raise AttributeError, so map them to ''.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)

    text = text.lower()
    # Characters are deleted (not replaced by a space), so "don't" -> "dont".
    text = re.sub(r'[^a-z\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Run the cleaner over every raw sentence and keep the result in its own column.
df['cleaned_text'] = df['text'].map(clean_text)

In [None]:
# Lemmatization
lemmatizer = WordNetLemmatizer()

# Split each cleaned sentence on whitespace, lemmatize every token with the
# lemmatizer's default settings, and join the lemmas back with single spaces.
df['lemmatized_text'] = df['cleaned_text'].apply(
    lambda sentence: ' '.join(map(lemmatizer.lemmatize, sentence.split()))
)

In [None]:
# Stop-word Removal
base_stop_words = stopwords.words('english')

# The column being filtered is already lemmatized, and WordNet's default
# (noun) lemmatization rewrites some stop words (e.g. 'was' -> 'wa',
# 'has' -> 'ha'). Those rewritten forms would not match the raw stop list
# and would leak into the output, so the lemmatized form of every stop word
# is added to the set as well.
stop_words = set(base_stop_words) | {
    lemmatizer.lemmatize(word) for word in base_stop_words
}

# Keep only the tokens that are not stop words; re-join with single spaces.
df['processed_text'] = df['lemmatized_text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

In [None]:
# Label Encoding
# Learn the label vocabulary first, then map each label to its integer code.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['encoded_label'] = label_encoder.transform(df['label'])

In [None]:
# TF-IDF Representation
# Fit the vectorizer on the processed sentences and obtain the sparse
# document-term matrix in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

# Dense, labelled view of the matrix: one row per document, one column per
# vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

print("TF-IDF Matrix:", tfidf_df, sep="\n")

In [None]:
# Save Outputs
# The processed DataFrame goes to CSV (without the index column).
df.to_csv('processed_text_data.csv', index=False)

# The TF-IDF matrix and the fitted transformers are pickled, one file each,
# so they can be reloaded later without refitting.
pickled_outputs = {
    'tfidf_matrix.pkl': tfidf_matrix,
    'tfidf_vectorizer.pkl': tfidf_vectorizer,
    'label_encoder.pkl': label_encoder,
}
for filename, artifact in pickled_outputs.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

print('All outputs saved successfully')