In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("/content/spam_ham_dataset.csv", encoding='latin-1')
print(df.columns)

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')


In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
df.shape

(5171, 4)

In [None]:

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt_tab')  # Make sure punkt is downloaded

# Assuming you have a DataFrame `df` with a column `text`
df['string_tokens'] = df['text'].apply(lambda x: x.split())  # Simple space-based tokenization
df['nltk_word_tokens'] = df['text'].apply(lambda x: word_tokenize(str(x)))  # Word tokenization using NLTK
df['nltk_sentence_tokens'] = df['text'].apply(lambda x: sent_tokenize(str(x)))  # Sentence tokenization using NLTK

# Display the results
print(df[['text', 'string_tokens', 'nltk_word_tokens', 'nltk_sentence_tokens']].head())


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                                text  \
0  Subject: enron methanol ; meter # : 988291\r\n...   
1  Subject: hpl nom for january 9 , 2001\r\n( see...   
2  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3  Subject: photoshop , windows , office . cheap ...   
4  Subject: re : indian springs\r\nthis deal is t...   

                                       string_tokens  \
0  [Subject:, enron, methanol, ;, meter, #, :, 98...   
1  [Subject:, hpl, nom, for, january, 9, ,, 2001,...   
2  [Subject:, neon, retreat, ho, ho, ho, ,, we, '...   
3  [Subject:, photoshop, ,, windows, ,, office, ....   
4  [Subject:, re, :, indian, springs, this, deal,...   

                                    nltk_word_tokens  \
0  [Subject, :, enron, methanol, ;, meter, #, :, ...   
1  [Subject, :, hpl, nom, for, january, 9, ,, 200...   
2  [Subject, :, neon, retreat, ho, ho, ho, ,, we,...   
3  [Subject, :, photoshop, ,, windows, ,, office,...   
4  [Subject, :, re, :, indian, springs, this, 

In [None]:
from collections import Counter

# Flatten the list of word tokens
all_word_tokens = [token for sublist in df['nltk_word_tokens'] for token in sublist]

# Get the most frequent word tokens
word_token_counts = Counter(all_word_tokens)
most_common_words = word_token_counts.most_common(10) # Get top 10 most common words

print("Most frequent word tokens:")
for word, count in most_common_words:
    print(f"{word}: {count}")

print("\n---\n")

# Flatten the list of sentence tokens
all_sentence_tokens = [token for sublist in df['nltk_sentence_tokens'] for token in sublist]

# Get the most frequent sentence tokens
sentence_token_counts = Counter(all_sentence_tokens)
most_common_sentences = sentence_token_counts.most_common(10) # Get top 10 most common sentences

print("Most frequent sentence tokens:")
for sentence, count in most_common_sentences:
    print(f"{sentence}: {count}")

Most frequent word tokens:
-: 85723
.: 54681
/: 42848
,: 40640
:: 30275
the: 25613
to: 20332
ect: 13900
and: 12815
@: 12735

---

Most frequent sentence tokens:
.: 4439
?: 2111
!: 817
xls: 482
thanks .: 316
s .: 213
computron - me .: 155
63 .: 142
161 .: 139
doc: 127


In [None]:
from nltk.corpus import stopwords
import string # Import the string module
import nltk
from nltk.tokenize import word_tokenize

nltk.download('stopwords')

# Get the list of English stopwords from NLTK
stop_words = set(stopwords.words('english'))

# Define a function to preprocess tokens with recursive flattening and cleaning
def preprocess(tokens):
    cleaned_tokens = []

    def process_item(item):
        if isinstance(item, list):
            for sub_item in item:
                process_item(sub_item)
        elif isinstance(item, str):
            t_lower = item.lower()
            if t_lower not in string.punctuation and t_lower not in stop_words:
                cleaned_tokens.append(t_lower)
        # Optionally handle other types if necessary
        # else:
        #     print(f"Warning: Skipping non-list, non-string element: {item}")

    # Start processing from the top-level input
    if isinstance(tokens, list):
        process_item(tokens)
    # Optionally handle non-list top-level input if necessary
    # elif isinstance(tokens, str):
    #     process_item(tokens)


    return cleaned_tokens


# Assuming 'nltk_word_tokens' is a column in your DataFrame containing lists of word tokens
# If not, you'll need to create it first by tokenizing the 'text' column
# For example: df['nltk_word_tokens'] = df['text'].apply(lambda x: word_tokenize(str(x)))


# Apply preprocessing to the 'nltk_word_tokens' column
df['cleaned_nltk_word_tokens'] = df['nltk_word_tokens'].apply(preprocess)


# Display the updated DataFrame with cleaned tokens
print(df[['text', 'nltk_word_tokens', 'cleaned_nltk_word_tokens']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                                text  \
0  Subject: enron methanol ; meter # : 988291\r\n...   
1  Subject: hpl nom for january 9 , 2001\r\n( see...   
2  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3  Subject: photoshop , windows , office . cheap ...   
4  Subject: re : indian springs\r\nthis deal is t...   

                                    nltk_word_tokens  \
0  [Subject, :, enron, methanol, ;, meter, #, :, ...   
1  [Subject, :, hpl, nom, for, january, 9, ,, 200...   
2  [Subject, :, neon, retreat, ho, ho, ho, ,, we,...   
3  [Subject, :, photoshop, ,, windows, ,, office,...   
4  [Subject, :, re, :, indian, springs, this, dea...   

                            cleaned_nltk_word_tokens  
0  [subject, enron, methanol, meter, 988291, foll...  
1  [subject, hpl, nom, january, 9, 2001, see, att...  
2  [subject, neon, retreat, ho, ho, ho, around, w...  
3  [subject, photoshop, windows, office, cheap, m...  
4  [subject, indian, springs, deal, book, teco, p..

# Expt 3 - Stemmer

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize stemmer & lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Apply stemming to cleaned word tokens
df['stemmed_words'] = df['cleaned_nltk_word_tokens'].apply(lambda tokens: [stemmer.stem(t) for t in tokens])

print("🔹 Sample before & after stemming:\n")
for i in range(3):
    print(f"Original : {df['cleaned_nltk_word_tokens'][i]}")
    print(f"Stemmed  : {df['stemmed_words'][i]}")
    print("-"*50)


# Apply lemmatization to cleaned word tokens
df['lemmatized_words'] = df['cleaned_nltk_word_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(t) for t in tokens])

print("🔹 Sample before & after lemmatization:\n")
for i in range(3):
    print(f"Original    : {df['cleaned_nltk_word_tokens'][i]}")
    print(f"Lemmatized  : {df['lemmatized_words'][i]}")
    print("-"*50)

🔹 Sample before & after stemming:

Original : ['subject', 'enron', 'methanol', 'meter', '988291', 'follow', 'note', 'gave', 'monday', '4', '3', '00', 'preliminary', 'flow', 'data', 'provided', 'daren', 'please', 'override', 'pop', 'daily', 'volume', 'presently', 'zero', 'reflect', 'daily', 'activity', 'obtain', 'gas', 'control', 'change', 'needed', 'asap', 'economics', 'purposes']
Stemmed  : ['subject', 'enron', 'methanol', 'meter', '988291', 'follow', 'note', 'gave', 'monday', '4', '3', '00', 'preliminari', 'flow', 'data', 'provid', 'daren', 'pleas', 'overrid', 'pop', 'daili', 'volum', 'present', 'zero', 'reflect', 'daili', 'activ', 'obtain', 'ga', 'control', 'chang', 'need', 'asap', 'econom', 'purpos']
--------------------------------------------------
Original : ['subject', 'hpl', 'nom', 'january', '9', '2001', 'see', 'attached', 'file', 'hplnol', '09', 'xls', 'hplnol', '09', 'xls']
Stemmed  : ['subject', 'hpl', 'nom', 'januari', '9', '2001', 'see', 'attach', 'file', 'hplnol', '09',

In [None]:
from collections import Counter

# Flatten lists
all_stemmed = [t for sublist in df['stemmed_words'] for t in sublist]
all_lemmatized = [t for sublist in df['lemmatized_words'] for t in sublist]

print("🔹 Top 10 Stemmed Tokens:")
print(Counter(all_stemmed).most_common(10))

print("\n🔹 Top 10 Lemmatized Tokens:")
print(Counter(all_lemmatized).most_common(10))


🔹 Top 10 Stemmed Tokens:
[('ect', 13908), ('subject', 8064), ('hou', 7289), ('enron', 6555), ('2000', 4386), ('com', 3709), ('deal', 3655), ('pleas', 3243), ('ga', 3072), ('``', 3020)]

🔹 Top 10 Lemmatized Tokens:
[('ect', 13908), ('subject', 8062), ('hou', 7289), ('enron', 6555), ('2000', 4386), ('com', 3709), ('deal', 3635), ('please', 3198), ('gas', 3036), ('``', 3020)]


Expt-3

Exercise 1: Implementation of TF IDF Model for an input text.


In [None]:
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize

# Create Lancaster Stemmer object
lancaster = LancasterStemmer()

# Example text
text = "The runner was running and easily outran the other runners."

# Tokenize text
tokens = word_tokenize(text)

# Apply Lancaster Stemmer
stemmed_words = [lancaster.stem(word) for word in tokens]

print("Original Words:", tokens)
print("Stemmed Words:", stemmed_words)



Original Words: ['The', 'runner', 'was', 'running', 'and', 'easily', 'outran', 'the', 'other', 'runners', '.']
Stemmed Words: ['the', 'run', 'was', 'run', 'and', 'easy', 'out', 'the', 'oth', 'run', '.']


Exercise 2: Implement N-Gram model for an input text.

In [None]:
# Simple Indic Stemmer for Hindi (Devanagari)

def hindi_stemmer(word):
    suffixes = ["ों", "ें", "ीं", "िए", "ियो", "ियाँ", "िया", "िये", "ियो", "ाओं", "ाएँ", "ाएं", "ाते", "ाता", "ाती", "ाना", "ाने", "ाकर", "ाओ", "ी", "ा"]
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[: -len(suffix)]
    return word

# Example Hindi text
hindi_words = ["लड़कियों", "लड़का", "खिलाड़ियों", "किताबें", "पढ़ाई"]

# Apply stemming
stemmed_hindi = [hindi_stemmer(word) for word in hindi_words]

print("Original:", hindi_words)
print("Stemmed:", stemmed_hindi)


Original: ['लड़कियों', 'लड़का', 'खिलाड़ियों', 'किताबें', 'पढ़ाई']
Stemmed: ['लड़किय', 'लड़क', 'खिलाड़िय', 'किताब', 'पढ़ाई']


# Expt 4-  Bag of words

In [None]:
from nltk.corpus import wordnet

def morphological_analysis(word):
    # Get lemma/root form
    lemma = lemmatizer.lemmatize(word)

    # Get morphological root using morphy
    morphy_root = wordnet.morphy(word)

    # Get all synsets for the word
    synsets = wordnet.synsets(word)

    return {
        "original": word,
        "lemma": lemma,
        "morphy_root": morphy_root,
        "pos_tags": list(set([s.pos() for s in synsets])) if synsets else [],
        "definitions": [s.definition() for s in synsets]
    }
    # Example
sample_words = ["running", "better", "studies", "flying"]
for w in sample_words:
    print(morphological_analysis(w))
    print("-"*50)



{'original': 'running', 'lemma': 'running', 'morphy_root': 'running', 'pos_tags': ['s', 'v', 'n', 'a'], 'definitions': ['(American football) a play in which a player attempts to carry the ball through or past the opposing team', 'the act of running; traveling on foot at a fast pace', 'the state of being in operation', 'the act of administering or being in charge of something', 'the act of participating in an athletic competition involving running on a track', "move fast by using one's feet, with one foot off the ground at any given time", "flee; take to one's heels; cut and run", 'stretch out over a distance, space, time, or scope; run or extend between two points or beyond a certain point', 'direct or control; projects, businesses, etc.', 'have a particular form', 'move along, of liquids', 'perform as expected when applied', 'change or be different within limits', 'run, stand, or compete for an office or a position', 'cause to emit recorded audio or video', 'move about freely and with

In [None]:
def generate_word_forms(word):
    forms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            forms.add(lemma.name())  # Add the lemma form
            if lemma.antonyms():     # Add antonyms if present
                forms.add(lemma.antonyms()[0].name())
    return forms
# Example
for w in sample_words:
    print(f"Word: {w}")
    print("Generated Forms:", generate_word_forms(w))
    print("-"*50)



Word: running
Generated Forms: {'take_to_the_woods', 'move', 'melt', 'play', 'black_market', 'hunt_down', 'melt_down', 'track', 'running_game', 'tend', 'work', 'break_away', 'pass', 'race', 'bunk', 'escape', 'lead', 'campaign', 'execute', 'carry', 'linear', 'flow', 'feed', 'ply', 'operate', 'unravel', 'head_for_the_hills', 'die_hard', 'hunt', 'course', 'idle', 'range', 'ladder', 'passing', 'operative', 'run', 'prevail', 'scarper', 'run_for', 'extend', 'endure', 'standing', 'track_down', 'function', 'go', 'consort', 'run_away', 'be_given', 'incline', 'bleed', 'turn_tail', 'running_play', 'persist', 'running', 'lam', 'functional', 'draw', 'scat', 'lean', 'working', 'hightail_it', 'guide', 'fly_the_coop', 'malfunction'}
--------------------------------------------------
Word: better
Generated Forms: {'break', 'amend', 'secure', 'effective', 'salutary', 'undecomposed', 'unspoiled', 'advantageously', 'badly', 'respectable', 'best', 'evil', 'worsen', 'serious', 'bad', 'beneficial', 'dependab

In [None]:
# Take first 5 unique cleaned words from the dataset
unique_words = list(set([w for sublist in df['cleaned_nltk_word_tokens'] for w in sublist]))[:5]

print("Morphological Analysis for sample dataset words:\n")
for w in unique_words:
    print(morphological_analysis(w))
    print("Generated Forms:", generate_word_forms(w))
    print("="*60)

Morphological Analysis for sample dataset words:

{'original': 'incubi', 'lemma': 'incubus', 'morphy_root': 'incubus', 'pos_tags': ['n'], 'definitions': ['a male demon believed to lie on sleeping persons and to have sexual intercourse with sleeping women', 'a situation resembling a terrifying dream', 'someone who depresses or worries others']}
Generated Forms: {'nightmare', 'incubus'}
{'original': '1871', 'lemma': '1871', 'morphy_root': None, 'pos_tags': [], 'definitions': []}
Generated Forms: set()
{'original': 'tianhe', 'lemma': 'tianhe', 'morphy_root': None, 'pos_tags': [], 'definitions': []}
Generated Forms: set()
{'original': 'ahe', 'lemma': 'ahe', 'morphy_root': None, 'pos_tags': [], 'definitions': []}
Generated Forms: set()
{'original': 'gratuity', 'lemma': 'gratuity', 'morphy_root': 'gratuity', 'pos_tags': ['n'], 'definitions': ['a relatively small amount of money given for services rendered (as by a waiter)', 'an award (as for meritorious service) given without claim or obliga

# Expt 4 - TFIDF


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example corpus
corpus = [
    "The cat sat on the mat",
    "The dog sat on the log",
    "Cats and dogs are pets"
]

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the corpus
tfidf_matrix = vectorizer.fit_transform(corpus)

# Convert to DataFrame for better readability
import pandas as pd
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

print(df)


        and       are       cat      cats       dog      dogs       log  \
0  0.000000  0.000000  0.427554  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.427554  0.000000  0.427554   
2  0.447214  0.447214  0.000000  0.447214  0.000000  0.447214  0.000000   

        mat        on      pets       sat       the  
0  0.427554  0.325166  0.000000  0.325166  0.650331  
1  0.000000  0.325166  0.000000  0.325166  0.650331  
2  0.000000  0.000000  0.447214  0.000000  0.000000  


Expt 4 - N-Gram


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter

# Ensure 'punkt' is downloaded for tokenization
nltk.download('punkt')

# Reload the original dataframe as it was overwritten in a previous cell
df = pd.read_csv("/content/spam_ham_dataset.csv", encoding='latin-1')

# Step 1: Tokenize the text column into words (if not done already)
df['cleaned_nltk_word_tokens'] = df['text'].apply(lambda x: word_tokenize(str(x).lower()))

# Step 2: Function to get N-Grams from tokenized text
def get_ngrams(text_tokens, n):
    return list(ngrams(text_tokens, n))

# Step 3: Create bigrams and trigrams
all_bigrams = []
all_trigrams = []

# Step 4: Loop through each tokenized text to generate n-grams
for tokens in df['cleaned_nltk_word_tokens']:
    all_bigrams.extend(get_ngrams(tokens, 2))  # Generate bigrams (n=2)
    all_trigrams.extend(get_ngrams(tokens, 3))  # Generate trigrams (n=3)

# Step 5: Frequency distribution
bigram_freq = Counter(all_bigrams)
trigram_freq = Counter(all_trigrams)

# Step 6: Show top 10 bigrams and trigrams
print("🔹 Top 10 Most Common Bigrams:")
for bigram, count in bigram_freq.most_common(10):
    print(f"{bigram}: {count}")

print("\n" + "-"*50 + "\n")

print("🔹 Top 10 Most Common Trigrams:")
for trigram, count in trigram_freq.most_common(10):
    print(f"{trigram}: {count}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


🔹 Top 10 Most Common Bigrams:
('-', '-'): 65612
('subject', ':'): 7854
('/', 'ect'): 7313
('/', 'hou'): 7278
('hou', '/'): 7278
('@', 'ect'): 6547
('ect', '@'): 6420
('.', '.'): 4350
('ect', ','): 4278
('>', '>'): 3810

--------------------------------------------------

🔹 Top 10 Most Common Trigrams:
('-', '-', '-'): 61156
('/', 'hou', '/'): 7278
('hou', '/', 'ect'): 7226
('/', 'ect', '@'): 6420
('ect', '@', 'ect'): 6338
('@', 'ect', ','): 4241
('.', '.', '.'): 3180
('>', '>', '>'): 2817
('?', '?', '?'): 2810
('=', '=', '='): 1726


EXP 5 Pos Tagging

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab') # Added to download the missing resource
nltk.download('averaged_perceptron_tagger_eng') # Added to download the missing resource


# Reload the original dataframe as it was overwritten or not loaded in the current runtime
import pandas as pd
df = pd.read_csv("/content/spam_ham_dataset.csv", encoding='latin-1')
from nltk.tokenize import word_tokenize
df['cleaned_nltk_word_tokens'] = df['text'].apply(lambda x: word_tokenize(str(x).lower()))

# Assuming 'cleaned_nltk_word_tokens' is a column in your DataFrame containing lists of word tokens
df['pos_tags'] = df['cleaned_nltk_word_tokens'].apply(lambda tokens: nltk.tag.pos_tag(tokens))

# Display the updated DataFrame with POS tags
print(df[['cleaned_nltk_word_tokens', 'pos_tags']].head())

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


                            cleaned_nltk_word_tokens  \
0  [subject, :, enron, methanol, ;, meter, #, :, ...   
1  [subject, :, hpl, nom, for, january, 9, ,, 200...   
2  [subject, :, neon, retreat, ho, ho, ho, ,, we,...   
3  [subject, :, photoshop, ,, windows, ,, office,...   
4  [subject, :, re, :, indian, springs, this, dea...   

                                            pos_tags  
0  [(subject, NN), (:, :), (enron, NN), (methanol...  
1  [(subject, NN), (:, :), (hpl, NN), (nom, NN), ...  
2  [(subject, NN), (:, :), (neon, NN), (retreat, ...  
3  [(subject, NN), (:, :), (photoshop, NN), (,, ,...  
4  [(subject, NN), (:, :), (re, NN), (:, :), (ind...  


Exp 6 Chunks


In [None]:
import nltk
from nltk.chunk import RegexpParser
from nltk.tree import Tree
import os # Import the os module

# Assuming 'pos_tags' is a column in your DataFrame containing lists of (word, pos_tag) tuples
# If not, you'll need to perform POS tagging first (as done in the previous cell)

# Define a simple chunk grammar
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  VP: {<VB.*><NP|PP|CLAUSE>*} # Chunk verbs and their complements
  PP: {<IN><NP>}              # Chunk prepositions followed by noun phrases
  ENTITY: {<NNP|NNPS>+}      # Chunk sequences of one or more proper nouns (singular or plural)
"""

# Create a RegexpParser with the defined grammar
chunk_parser = RegexpParser(grammar)

# Process and display chunk trees for the first few entries in the DataFrame
num_samples = 3 # Display chunk trees for the first few entries to avoid excessive output

if not df.empty and 'pos_tags' in df.columns:
    print(f"🔹 Chunk Trees for the first {num_samples} entries:")
    for i in range(min(num_samples, len(df))):
        pos_tags = df['pos_tags'].iloc[i]
        if pos_tags: # Check if the list of pos_tags is not empty
            chunk_tree = chunk_parser.parse(pos_tags)

            print(f"\n--- Entry {i+1} ---")
            print("Text-based Tree:")
            print(chunk_tree)
            print("-" * 20)

            # Attempt to save the tree to a file for graphical visualization
            try:
                output_filename = f"chunk_tree_entry_{i+1}.ps"
                # Check if the parsed result is a Tree object before attempting to draw
                if isinstance(chunk_tree, Tree):
                    # Create a TreeView object and save to PostScript
                    # Note: This might require ghostscript to be installed in the environment
                    # and still might not display directly as an image inline.
                    # It saves a .ps file that you would need to download and view.
                    from nltk.draw.tree import TreeView
                    TreeView(chunk_tree).save(output_filename)
                    print(f"Graphical tree saved to {output_filename}")
                else:
                    print(f"Skipping graphical save for Entry {i+1} as the parsed result is not a Tree object.")

            except Exception as e:
                print(f"Could not generate graphical tree for Entry {i+1}: {e}")
                print("Graphical visualization in Colab can be challenging.")

        else:
            print(f"\n--- Entry {i+1} ---")
            print("No POS tags found for this entry.")
            print("-" * 20)

else:
    print("DataFrame is empty or 'pos_tags' column not found.")

🔹 Chunk Trees for the first 3 entries:

--- Entry 1 ---
Text-based Tree:
(S
  (NP subject/NN)
  :/:
  (NP enron/NN methanol/NN)
  ;/:
  meter/CC
  #/#
  :/:
  988291/CD
  (NP this/DT)
  (VP is/VBZ (NP a/DT follow/JJ))
  up/RB
  to/TO
  (NP the/DT note/NN i/NN)
  (VP gave/VBD)
  you/PRP
  (PP on/IN (NP monday/NN))
  ,/,
  4/CD
  (NP //NN)
  3/CD
  (NP //NN)
  00/CD
  {/(
  (NP preliminary/JJ flow/NN data/NNS)
  (VP provided/VBN)
  (PP by/IN (NP daren/NN))
  }/)
  ./.
  (VP please/VB (NP override/JJ pop/NN))
  '/POS
  (NP s/JJ daily/JJ volume/NN)
  {/(
  presently/RB
  zero/CD
  }/)
  to/TO
  (VP reflect/VB (NP daily/JJ activity/NN))
  you/PRP
  can/MD
  (VP obtain/VB)
  (PP from/IN (NP gas/NN control/NN))
  ./.
  (NP this/DT change/NN)
  (VP is/VBZ)
  (VP needed/VBN (NP asap/NN))
  (PP for/IN (NP economics/NN purposes/NNS))
  ./.)
--------------------
Could not generate graphical tree for Entry 1: no display name and no $DISPLAY environment variable
Graphical visualization in Colab can 

EXP 7 Named Entity Recognition

In [None]:
import nltk
from nltk.chunk import ne_chunk # Import ne_chunk
from nltk.tree import Tree
import os # Import os for file operations

# Ensure the required NLTK data for NER is downloaded
try:
    nltk.data.find('chunkers/maxent_ne_chunker_tab/english_ace_multiclass/')
except LookupError:
    nltk.download('maxent_ne_chunker_tab')

# Download the 'words' corpus if not already present
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words')


# Apply NER only on first 10 rows for speed
sample_df = df.head(10).copy()

def extract_named_entities(pos_tags):
    tree = ne_chunk(pos_tags)
    entities = []
    for subtree in tree:
        if isinstance(subtree, Tree):
            entity_name = " ".join([token for token, pos in subtree.leaves()])
            entity_type = subtree.label()
            entities.append((entity_name, entity_type))
    return entities

sample_df['named_entities'] = sample_df['pos_tags'].apply(extract_named_entities)

print(sample_df[['text', 'named_entities']])

                                                text named_entities
0  Subject: enron methanol ; meter # : 988291\r\n...             []
1  Subject: hpl nom for january 9 , 2001\r\n( see...             []
2  Subject: neon retreat\r\nho ho ho , we ' re ar...             []
3  Subject: photoshop , windows , office . cheap ...             []
4  Subject: re : indian springs\r\nthis deal is t...             []
5  Subject: ehronline web address change\r\nthis ...             []
6  Subject: spring savings certificate - take 30 ...             []
7  Subject: looking for medication ? we ` re the ...             []
8  Subject: noms / actual flow for 2 / 26\r\nwe a...             []
9  Subject: nominations for oct . 21 - 23 , 2000\...             []


EXP 8: WordNet & Brown Corpus

In [None]:
from nltk.corpus import wordnet as wn, brown
nltk.download('wordnet')
nltk.download('brown')

def get_wordnet_info(word):
    synsets = wn.synsets(word)
    if not synsets:
        return None

    synonyms = set()
    antonyms = set()
    for syn in synsets:
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return {
        "word": word,
        "synonyms": list(synonyms)[:5],
        "antonyms": list(antonyms)[:5],
        "definition": synsets[0].definition()
    }

sample_words = ['win', 'claim', 'money', 'free', 'offer']

for w in sample_words:
    print(get_wordnet_info(w))
    print("-"*40)

# Display Brown Corpus sample
print("\nSample from Brown Corpus (news category):")
print(brown.words(categories='news')[:20])


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


{'word': 'win', 'synonyms': ['acquire', 'advance', 'winnings', 'profits', 'win'], 'antonyms': ['losings', 'fall_back', 'lose', 'fail'], 'definition': 'a victory (as in a race or other competition)'}
----------------------------------------
{'word': 'claim', 'synonyms': ['lay_claim', 'arrogate', 'exact', 'title', 'claim'], 'antonyms': ['forfeit', 'disclaim'], 'definition': 'an assertion of a right (as to money or property)'}
----------------------------------------
{'word': 'money', 'synonyms': ['money'], 'antonyms': [], 'definition': 'the most common medium of exchange; functions as legal tender'}
----------------------------------------
{'word': 'free', 'synonyms': ['unblock', 'liberate', 'gratis', 'give_up', 'exempt'], 'antonyms': ['lodge', 'enforce', 'obstruct', 'freeze', 'bound'], 'definition': 'people who are free'}
----------------------------------------
{'word': 'offer', 'synonyms': ['put_up', 'declare_oneself', 'bid', 'propose', 'volunteer'], 'antonyms': [], 'definition': 'the

EXP 9: Word2Vec + Word Sense Disambiguation (WSD)

In [None]:
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer # Import the lemmatizer
from nltk.tokenize import word_tokenize # Import word_tokenize

# Reload the original dataframe as it was overwritten in a previous cell
import pandas as pd
df = pd.read_csv("/content/spam_ham_dataset.csv", encoding='latin-1')

# Re-apply tokenization and lemmatization to the reloaded DataFrame
df['cleaned_nltk_word_tokens'] = df['text'].apply(lambda x: word_tokenize(str(x).lower()))
lemmatizer = WordNetLemmatizer() # Initialize lemmatizer
df['lemmatized_words'] = df['cleaned_nltk_word_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(t) for t in tokens])


# Example function to get the correct WordNet sense
def get_wsd_sense(sentence_tokens):
    senses = []
    for word in sentence_tokens:
        synset = lesk(sentence_tokens, word)
        if synset:
            senses.append(synset.name())  # word sense identifier
        else:
            senses.append(word)  # fallback to original word
    return senses

# Apply WSD on your dataset (example for first 5 rows)
df['wsd_senses'] = df['lemmatized_words'].apply(get_wsd_sense)

print(df[['text', 'lemmatized_words', 'wsd_senses']].head())

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                                                text  \
0  Subject: enron methanol ; meter # : 988291\r\n...   
1  Subject: hpl nom for january 9 , 2001\r\n( see...   
2  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3  Subject: photoshop , windows , office . cheap ...   
4  Subject: re : indian springs\r\nthis deal is t...   

                                    lemmatized_words  \
0  [subject, :, enron, methanol, ;, meter, #, :, ...   
1  [subject, :, hpl, nom, for, january, 9, ,, 200...   
2  [subject, :, neon, retreat, ho, ho, ho, ,, we,...   
3  [subject, :, photoshop, ,, window, ,, office, ...   
4  [subject, :, re, :, indian, spring, this, deal...   

                                          wsd_senses  
0  [subject.n.06, :, enron, methanol.n.01, ;, met...  
1  [submit.v.01, :, hpl, nom, for, january.n.01, ...  
2  [subject.n.05, :, neon.n.01, retreat.v.04, hol...  
3  [subject.n.05, :, photoshop, ,, windowpane.n.0...  
4  [subject.n.06, :, ra.n.02, :, indian.a.01, spr..

In [None]:
# Combine lemmatized words back into a single string for each email
df['processed_text'] = df['lemmatized_words'].apply(lambda x: " ".join(x))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # You can tune max_features
X = vectorizer.fit_transform(df['processed_text'])


In [None]:
df['label'] = df['label_num'] if 'label_num' in df.columns else (df['label'].map({'spam':1, 'ham':0}))
y = df['label']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Vectorization
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = vectorizer.fit_transform(df['processed_text'])
y = df['label']  # 0 = ham, 1 = spam

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train classifier
clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)

print("✅ Model trained successfully!")


✅ Model trained successfully!


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9835748792270531

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       735
           1       0.96      0.98      0.97       300

    accuracy                           0.98      1035
   macro avg       0.98      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035


Confusion Matrix:
 [[724  11]
 [  6 294]]


In [None]:
import pickle

# Save classifier
with open("/content/spam_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

# Save vectorizer
with open("/content/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("✅ Model & vectorizer saved locally!")


✅ Model & vectorizer saved locally!


In [None]:
from google.colab import files
files.download("/content/spam_classifier.pkl")
files.download("/content/tfidf_vectorizer.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>