In [4]:
# Sample documents (sentences)
sentences = [
    "Hello, how are you?",
    "Hi there!",
    "Good morning",
    "Can you help me with my order?",
    "I need support for my account",
    "Please reset my password",
    "This app keeps crashing",
    "I am unhappy with the service",
    "The delivery was very late",
    "Hey!",
    "Could you tell me the price?",
    "Why is this so slow?"
]

# Corresponding labels (string format)
labels = [
    "greeting",   # Hello, how are you?
    "greeting",   # Hi there!
    "greeting",   # Good morning
    "request",    # Can you help me with my order?
    "request",    # I need support for my account
    "request",    # Please reset my password
    "complaint",  # This app keeps crashing
    "complaint",  # I am unhappy with the service
    "complaint",  # The delivery was very late
    "greeting",   # Hey!
    "request",    # Could you tell me the price?
    "complaint"   # Why is this so slow?
]


In [6]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

cleaned_docs = [clean_text(doc) for doc in sentences]
print("Cleaned Text:")
print(cleaned_docs)

Cleaned Text:
['hello how are you', 'hi there', 'good morning', 'can you help me with my order', 'i need support for my account', 'please reset my password', 'this app keeps crashing', 'i am unhappy with the service', 'the delivery was very late', 'hey', 'could you tell me the price', 'why is this so slow']


In [8]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

lemmatized_docs = []
for doc in cleaned_docs:
    tokens = word_tokenize(doc)
    lemmas = [lemmatizer.lemmatize(word) for word in tokens]
    lemmatized_docs.append(" ".join(lemmas))

print("\nLemmatized Text:")
print(lemmatized_docs)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Lemmatized Text:
['hello how are you', 'hi there', 'good morning', 'can you help me with my order', 'i need support for my account', 'please reset my password', 'this app keep crashing', 'i am unhappy with the service', 'the delivery wa very late', 'hey', 'could you tell me the price', 'why is this so slow']


In [9]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

final_docs = []
for doc in lemmatized_docs:
    tokens = doc.split()
    filtered = [word for word in tokens if word not in stop_words]
    final_docs.append(" ".join(filtered))

print("\nAfter Stop-word Removal:")
print(final_docs)


After Stop-word Removal:
['hello', 'hi', 'good morning', 'help order', 'need support account', 'please reset password', 'app keep crashing', 'unhappy service', 'delivery wa late', 'hey', 'could tell price', 'slow']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

print("\nEncoded Labels:")
print(encoded_labels)


Encoded Labels:
[1 1 1 2 2 2 0 0 0 1 2 0]


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(final_docs)

print("\nTF-IDF Vocabulary:")
print(tfidf.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())


TF-IDF Vocabulary:
['account' 'app' 'could' 'crashing' 'delivery' 'good' 'hello' 'help' 'hey'
 'hi' 'keep' 'late' 'morning' 'need' 'order' 'password' 'please' 'price'
 'reset' 'service' 'slow' 'support' 'tell' 'unhappy' 'wa']

TF-IDF Matrix:
[[0.         0.         0.         0.         0.         0.
  1.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         1.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.70710678
  0.         0.         0.         0.         0.         0.
  0.70710678 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
 

In [12]:
import pandas as pd

# Save cleaned text
df_text = pd.DataFrame({
    "original_text": sentences,
    "processed_text": final_docs,
    "label": encoded_labels
})
df_text.to_csv("processed_text.csv", index=False)

# Save TF-IDF matrix
df_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out()
)
df_tfidf.to_csv("tfidf_vectors.csv", index=False)

print("\nFiles saved successfully!")


Files saved successfully!
