**Assignment No. 3: Perform text cleaning, lemmatization (any method), stop-word removal (any method), and label encoding. Create representations using TF-IDF. Save the outputs.**

In [None]:
!pip install nltk scikit-learn pandas



In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Fetch every NLTK resource the notebook relies on, in one pass:
#   punkt / punkt_tab -> tokenizer models used by nltk.word_tokenize
#                        (punkt_tab was added to resolve a LookupError)
#   stopwords         -> English stop-word list
#   wordnet           -> dictionary backing WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

print('Libraries imported and NLTK resources downloaded.')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Libraries imported and NLTK resources downloaded.


In [None]:
# Build a small in-memory corpus: four short documents, each with a class label
data = dict(
    text=[
        "This is the first document. It's amazing!",
        "This document is the second document.",
        "And this is the third one? Yes, indeed!",
        "Is this the first document? Absolutely.",
    ],
    label=['A', 'B', 'A', 'B'],
)

# Wrap the corpus in a DataFrame and show it
df = pd.DataFrame(data)
print('Original Data:', df, sep='\n')

# Function for text cleaning: lowercasing, removing digits & punctuation,
# and normalizing whitespace
def clean_text(text):
    """Normalize a raw document string before tokenization.

    Steps, in order:
      1. lowercase the text,
      2. delete every run of digits,
      3. delete punctuation (anything that is not a word character or
         whitespace, plus the underscore, which ``\\w`` would otherwise keep),
      4. collapse each run of whitespace to a single space and trim the ends.

    Args:
        text: The raw text. ``None`` and float NaN (a missing cell in a
            DataFrame column) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        str: The cleaned text; ``''`` when the input is missing or contains
        nothing but digits, punctuation and whitespace.
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    # `\w` matches '_' as a word character, so remove it explicitly.
    text = re.sub(r'[^\w\s]|_', '', text)
    # Deleting digits/punctuation can leave double spaces ("room 101 is" ->
    # "room  is"); collapse them so the stored text is clean.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every document and show the raw text next to its cleaned form
df['clean_text'] = df['text'].map(clean_text)
print('\nCleaned Text:', df[['text', 'clean_text']], sep='\n')

Original Data:
                                        text label
0  This is the first document. It's amazing!     A
1      This document is the second document.     B
2    And this is the third one? Yes, indeed!     A
3    Is this the first document? Absolutely.     B

Cleaned Text:
                                        text  \
0  This is the first document. It's amazing!   
1      This document is the second document.   
2    And this is the third one? Yes, indeed!   
3    Is this the first document? Absolutely.   

                               clean_text  
0  this is the first document its amazing  
1    this document is the second document  
2    and this is the third one yes indeed  
3   is this the first document absolutely  


In [None]:
# Set up the English stop-word lookup set and the WordNet lemmatizer
stop_words = {word for word in stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

# Function for lemmatization with stop word removal
def lemmatize_text(text):
    """Tokenize ``text``, drop stop words, and lemmatize what remains.

    Tokens come from ``nltk.word_tokenize``; any token found in the
    module-level ``stop_words`` set is skipped, and each surviving token is
    passed through the module-level ``lemmatizer``. The results are joined
    back into one space-separated string.
    """
    kept = []
    for word in nltk.word_tokenize(text):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

# Lemmatize every cleaned document and show before/after side by side
df['lemmatized'] = df['clean_text'].map(lemmatize_text)
print('\nLemmatized Text (stop words removed):', df[['clean_text', 'lemmatized']], sep='\n')


Lemmatized Text (stop words removed):
                               clean_text                 lemmatized
0  this is the first document its amazing     first document amazing
1    this document is the second document   document second document
2    and this is the third one yes indeed       third one yes indeed
3   is this the first document absolutely  first document absolutely


In [None]:
# Encode the string class labels as integer ids
le = LabelEncoder().fit(df['label'])
df['label_encoded'] = le.transform(df['label'])

print('\nLabel Encoded Data:', df[['label', 'label_encoded']], sep='\n')

# Fit a TF-IDF vectorizer on the lemmatized documents and build the
# document-term matrix (one row per document)
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['lemmatized'])

print('\nTF-IDF Representation Shape:', tfidf_matrix.shape, sep='\n')

# Save outputs to disk
# processed_text.csv holds every DataFrame column built above (raw text,
# cleaned text, lemmatized text, label, encoded label).
df.to_csv('processed_text.csv', index=False)
with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

# Persist the fitted transformers as well. Without the vectorizer's learned
# vocabulary/IDF weights the matrix columns cannot be mapped back to terms or
# reproduced for new text, and without the encoder the integer labels cannot
# be mapped back to the original classes.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

print('\nOutputs saved: processed_text.csv and tfidf_matrix.pkl')
print('Fitted transformers saved: tfidf_vectorizer.pkl and label_encoder.pkl')


Label Encoded Data:
  label  label_encoded
0     A              0
1     B              1
2     A              0
3     B              1

TF-IDF Representation Shape:
(4, 9)

Outputs saved: processed_text.csv and tfidf_matrix.pkl
