In [3]:
# 1. Consider any text paragraph. Preprocess the text to remove any special characters and digits. Generate the summary using extractive summarization process.

import nltk
import re
import heapq
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Ensure necessary NLTK packages are downloaded
# nltk.download('punkt')
# nltk.download('stopwords')

# Our text paragraph
paragraph = """
The field of artificial intelligence (AI) is rapidly transforming the world around us. From self-driving cars to personalized medicine, AI's potential seems limitless. However, this rapid advancement also brings significant challenges. One of the primary concerns is the impact on the workforce, as automation could displace millions of jobs. Furthermore, ethical considerations surrounding bias in algorithms and data privacy are becoming increasingly important. It is crucial for researchers, policymakers, and the public to engage in a continuous dialogue to ensure that AI develops in a way that benefits all of humanity. The decisions we make today will shape the future of this powerful technology.
"""

# 1. Preprocess the text: remove special characters and digits
# Remove square brackets and digits (like [0-9]*)
formatted_text = re.sub(r'\[[0-9]*\]', ' ', paragraph)
# Remove special characters and digits, keep only letters and spaces
formatted_text = re.sub(r'[^a-zA-Z\s]', ' ', formatted_text)
# Remove extra whitespace
formatted_text = re.sub(r'\s+', ' ', formatted_text)

print("--- Preprocessed Text ---")
print(formatted_text)
print("-" * 30)

# 2. Tokenize the preprocessed text into sentences and words
sentences = sent_tokenize(paragraph) # Use original for sentence structure, but cleaned for words
words = word_tokenize(formatted_text.lower()) # Use lowercased, cleaned text for frequency

# 3. Create a word frequency table, removing stopwords
stop_words = set(stopwords.words("english"))
word_frequencies = {}

for word in words:
    if word not in stop_words:
        if word not in word_frequencies:
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

# 4. Compute weighted frequencies (normalize by the most frequent word)
if word_frequencies:
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

# 5. Score sentences based on the weighted frequencies of words they contain
sentence_scores = {}
for sent in sentences:
    for word, freq in word_frequencies.items():
        if word in sent.lower():
            if sent not in sentence_scores:
                sentence_scores[sent] = freq
            else:
                sentence_scores[sent] += freq

# 6. Generate the summary by selecting the top N sentences (e.g., top 2)
if sentence_scores:
    summary_sentences = heapq.nlargest(2, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    print("\n--- Generated Summary ---")
    print(summary)
else:
    print("\nCould not generate summary.")
    

--- Preprocessed Text ---
 The field of artificial intelligence AI is rapidly transforming the world around us From self driving cars to personalized medicine AI s potential seems limitless However this rapid advancement also brings significant challenges One of the primary concerns is the impact on the workforce as automation could displace millions of jobs Furthermore ethical considerations surrounding bias in algorithms and data privacy are becoming increasingly important It is crucial for researchers policymakers and the public to engage in a continuous dialogue to ensure that AI develops in a way that benefits all of humanity The decisions we make today will shape the future of this powerful technology 
------------------------------


<class 'LookupError'>: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/pyodide/nltk_data'
    - '/nltk_data'
    - '/share/nltk_data'
    - '/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# 2. Consider any text paragraph. Remove the stopwords. Tokenize the paragraph to extract words and sentences. Calculate the word frequency distribution and plot the frequencies. Plot the wordcloud of the text.

import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from wordcloud import WordCloud

# Ensure necessary NLTK packages are downloaded
# nltk.download('punkt')
# nltk.download('stopwords')

# Our text paragraph
paragraph = """
The rainforest is a lush, green world teeming with life. Tall trees stretch towards the sky, their branches forming a dense canopy. Beneath this, a world of shadows and moisture thrives, filled with exotic plants, insects, and animals. The sounds of the forest are a constant symphony: the calls of monkeys, the buzz of insects, and the songs of colorful birds. This vibrant ecosystem is vital for the planet's health, producing oxygen and regulating the climate.
"""

print("--- Original Paragraph ---")
print(paragraph)
print("-" * 50)

# 1. Tokenize the paragraph into sentences and words
tokenized_sentences = sent_tokenize(paragraph)
tokenized_words = word_tokenize(paragraph)

print("--- Tokenized Sentences ---")
print(tokenized_sentences)
print("\n--- Tokenized Words (First 20) ---")
print(tokenized_words[:20])

# 2. Remove stopwords
stop_words = set(stopwords.words("english"))
filtered_words = [word for word in tokenized_words if word.lower() not in stop_words and word.isalpha()] # Keep only alphabetic words

print("\n--- Words after Stopword Removal (First 20) ---")
print(filtered_words[:20])

# 3. Calculate and plot the word frequency distribution
if filtered_words:
    frequency_distribution = FreqDist(filtered_words)
    print("\n--- Most Common Words ---")
    print(frequency_distribution.most_common(5))

    print("\n--- Plotting Frequency Distribution ---")
    frequency_distribution.plot(15, cumulative=False) # Plot top 15
    plt.show()

    # 4. Plot the WordCloud
    print("--- Generating WordCloud ---")
    # Join the filtered words back into a single string for the wordcloud
    text_for_cloud = ' '.join(filtered_words)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_for_cloud)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
else:
    print("No words left after filtering to generate frequency plot or wordcloud.")

In [None]:
# 3. Consider the following review messages. Perform sentiment analysis on the messages.
# i. I purchased headphones online. I am very happy with the product.
# ii. I saw the movie yesterday. The animation was really good but the script was ok.
# iii. I enjoy listening to music
# iv. I take a walk in the park everyday

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Ensure the VADER lexicon is downloaded
# nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# List of review messages
reviews = [
    "I purchased headphones online. I am very happy with the product.",
    "I saw the movie yesterday. The animation was really good but the script was ok.",
    "I enjoy listening to music",
    "I take a walk in the park everyday"
]

print("--- Sentiment Analysis Results ---")
for i, review in enumerate(reviews):
    print(f"\nReview {i+1}: \"{review}\"")
    
    # Get sentiment scores
    sentiment_scores = analyzer.polarity_scores(review)
    
    # Determine overall sentiment based on compound score
    if sentiment_scores['compound'] >= 0.05:
        overall = "Positive"
    elif sentiment_scores['compound'] <= -0.05:
        overall = "Negative"
    else:
        overall = "Neutral"
    
    # Print the scores and overall sentiment
    print(f"  Scores: {sentiment_scores}")
    print(f"  Overall Sentiment: {overall}")

In [None]:
# 4. Perform text analytics on WhatsApp data : Write a Python script for the following :
# i. First Export the WhatsApp chat of any group. Read the exported ".txt" file using open() and read() functions.
# ii. Tokenize the read data into sentences and print it.
# iii. Remove the stopwords from data and perform lemmatization.
# iv. Plot the wordcloud for the given data.

import nltk
import re
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

# Ensure necessary NLTK packages are downloaded
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# --- i. Read the WhatsApp chat file ---
# IMPORTANT: Replace 'whatsapp_chat.txt' with the actual path to your exported chat file.
# WhatsApp chats often have datetime stamps, so we'll do basic cleaning to focus on the messages.
file_path = 'whatsapp_chat.txt'  # <-- UPDATE THIS PATH

try:
    with open(file_path, 'r', encoding='utf-8') as file:
        chat_data = file.read()
    print("--- Chat File Read Successfully ---")
    # print(chat_data[:500]) # Print first 500 chars to see the format

except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please update the file path.")
    exit()

# Basic cleaning: Remove common WhatsApp timestamp patterns (e.g., [DD/MM/YY, HH:MM:SS] or DD/MM/YY, HH:MM - )
# This is a simplified example; actual timestamps may vary.
cleaned_text = re.sub(r'\[\d{1,2}\/\d{1,2}\/\d{2,4},?\s*\d{1,2}:\d{2}(?::\d{2})?[^\]]*\]\s*', '', chat_data)
cleaned_text = re.sub(r'\d{1,2}\/\d{1,2}\/\d{2,4},?\s*\d{1,2}:\d{2}\s*-\s*', '', cleaned_text)
# Remove "Sender: " patterns (optional)
cleaned_text = re.sub(r'^[^:]+:\s*', '', cleaned_text, flags=re.MULTILINE)


# --- ii. Tokenize the data into sentences ---
sentences = sent_tokenize(cleaned_text)
print(f"\n--- Tokenized into {len(sentences)} Sentences (First 5) ---")
for i, sent in enumerate(sentences[:5]):
    print(f"{i+1}. {sent}")

# --- iii. Remove stopwords and perform lemmatization ---
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Tokenize all words from the cleaned text for further processing
all_words = word_tokenize(cleaned_text.lower()) # Lowercase for consistency

processed_words = []
for word in all_words:
    # Keep only alphabetic words and remove stopwords
    if word.isalpha() and word not in stop_words:
        # Perform lemmatization (treating as verb 'v' as a common case, but can be refined)
        lemmatized_word = lemmatizer.lemmatize(word, pos='v')
        processed_words.append(lemmatized_word)

print(f"\n--- Processed {len(processed_words)} words after stopword removal and lemmatization (First 20) ---")
print(processed_words[:20])

# --- iv. Plot the wordcloud for the processed data ---
if processed_words:
    print("\n--- Generating WordCloud ---")
    text_for_cloud = ' '.join(processed_words)
    wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text_for_cloud)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("WordCloud of WhatsApp Chat")
    plt.show()
else:
    print("No words left after processing to generate wordcloud.")