In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Sample data: a small dataset of text messages
data = {
    'text': [
        'Congratulations! You have won a lottery of $1000.',
        'Click here to claim your prize.',
        'Hey, how are you doing?',
        'Don’t forget our meeting tomorrow at 10 AM.',
        'You have a new message from your bank.',
        'This is not spam, just a friendly reminder.',
        'Win a free iPhone by clicking this link!',
        'Let’s catch up sometime this week.'
    ],
    'label': [
        'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham'
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.25, random_state=42)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)


Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00         1
        spam       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



# Text Classification Example with Larger Sample Data and Normalization


In [4]:
# Import necessary libraries
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Sample data: a larger dataset of text messages
data = {
    'text': [
        'Congratulations! You have won a lottery of $1000.',
        'Click here to claim your prize.',
        'Hey, how are you doing?',
        'Don’t forget our meeting tomorrow at 10 AM.',
        'You have a new message from your bank.',
        'This is not spam, just a friendly reminder.',
        'Win a free iPhone by clicking this link!',
        'Let’s catch up sometime this week.',
        'Exclusive offer just for you! Buy now and save big.',
        'Your account has been compromised. Act now!',
        'This is a limited time offer. Don’t miss out!',
        'Hello! I hope you are having a great day.',
        'Your subscription will expire soon. Renew now.',
        'You have been selected for a special promotion.',
        'Meet me at the coffee shop at 3 PM.',
        'Important: Update your password immediately.',
        'Get paid to work from home! Sign up today.',
        'Reminder: Your appointment is tomorrow at 2 PM.',
        'Congratulations! You are our lucky winner!',
        'This is not spam, just checking in.',
        'You have a new message from your friend.'
    ],
    'label': [
        'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham',
        'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham',
        'spam', 'ham', 'spam', 'ham', 'ham'
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Normalization function
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# Apply normalization to the text data
df['text_norm'] = df['text'].apply(normalize_text)
df


Unnamed: 0,text,label,text_norm
0,Congratulations! You have won a lottery of $1000.,spam,congratulations you have won a lottery of 1000
1,Click here to claim your prize.,spam,click here to claim your prize
2,"Hey, how are you doing?",ham,hey how are you doing
3,Don’t forget our meeting tomorrow at 10 AM.,ham,don’t forget our meeting tomorrow at 10 am
4,You have a new message from your bank.,spam,you have a new message from your bank
5,"This is not spam, just a friendly reminder.",ham,this is not spam just a friendly reminder
6,Win a free iPhone by clicking this link!,spam,win a free iphone by clicking this link
7,Let’s catch up sometime this week.,ham,let’s catch up sometime this week
8,Exclusive offer just for you! Buy now and save...,spam,exclusive offer just for you buy now and save big
9,Your account has been compromised. Act now!,spam,your account has been compromised act now


In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text_norm'], df['label'], test_size=0.25, random_state=42)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Accuracy: 0.67
Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.33      0.50         3
        spam       0.60      1.00      0.75         3

    accuracy                           0.67         6
   macro avg       0.80      0.67      0.62         6
weighted avg       0.80      0.67      0.62         6



# Text Classification Using the SMS Spam Collection Dataset

In [6]:
# Import necessary libraries
import pandas as pd
import zipfile
import io
import requests

# Define the URL of the ZIP file containing the dataset
zip_file_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

# Function to download and extract the dataset
def load_sms_spam_collection(url):
    # Download the ZIP file
    response = requests.get(url)
    response.raise_for_status()  # Check for request errors

    # Use BytesIO to read the ZIP file from the response content
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        # List all files in the ZIP
        print("Files in the ZIP archive:", z.namelist())
        
        # Specify the file to read
        file_to_read = 'SMSSpamCollection'  # Change this to the desired file name

        # Read the specific file from the ZIP
        df = pd.read_csv(z.open(file_to_read), sep='\t', names=['label', 'text'], encoding='utf-8')
    
    return df

# Load the SMS Spam Collection Dataset
df = load_sms_spam_collection(zip_file_url)

# Display the first few rows of the DataFrame
df


Files in the ZIP archive: ['SMSSpamCollection', 'readme']


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
# Preprocess the text data
df['text_norm'] = df['text'].str.replace('[{}]'.format(string.punctuation), '', regex=True).str.lower()
df

Unnamed: 0,label,text,text_norm
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?,will ü b going to esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggest...
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...


In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text_norm'], df['label'], test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Display the results
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Accuracy: 0.96
Classification Report:
              precision    recall  f1-score   support

         ham       0.95      1.00      0.98       966
        spam       1.00      0.68      0.81       149

    accuracy                           0.96      1115
   macro avg       0.98      0.84      0.89      1115
weighted avg       0.96      0.96      0.95      1115



In [9]:
import random
import re
from collections import defaultdict
from math import log

# Generate labeled training data
training_data = [
    ("I love this product, it is amazing!", "positive"),
    ("This is the best thing I have ever bought.", "positive"),
    ("Absolutely fantastic, I am very happy with it.", "positive"),
    ("I am so pleased with this purchase.", "positive"),
    ("This is wonderful, I highly recommend it.", "positive"),
    ("I am extremely satisfied with this.", "positive"),
    ("This is a great product, I will buy it again.", "positive"),
    ("I am very impressed with the quality.", "positive"),
    ("This is exactly what I needed, perfect!", "positive"),
    ("I am delighted with this item.", "positive"),
    ("I hate this product, it is terrible.", "negative"),
    ("This is the worst thing I have ever bought.", "negative"),
    ("Absolutely awful, I am very unhappy with it.", "negative"),
    ("I am so disappointed with this purchase.", "negative"),
    ("This is horrible, I do not recommend it.", "negative"),
    ("I am extremely dissatisfied with this.", "negative"),
    ("This is a terrible product, I will never buy it again.", "negative"),
    ("I am very unimpressed with the quality.", "negative"),
    ("This is not what I needed, awful!", "negative"),
    ("I am disgusted with this item.", "negative")
]


# Preprocess text
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove short words
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

# Train Naive Bayes classifier
class NaiveBayesClassifier:
    def __init__(self):
        self.classes = defaultdict(lambda: 0)
        self.word_counts = defaultdict(lambda: defaultdict(lambda: 0))
        self.class_totals = defaultdict(lambda: 0)
        self.vocab = set()

    def train(self, data):
        for sentence, label in data:
            self.classes[label] += 1
            words = preprocess(sentence).split()
            for word in words:
                self.word_counts[label][word] += 1
                self.class_totals[label] += 1
                self.vocab.add(word)

    def predict(self, sentence):
        words = preprocess(sentence).split()
        class_scores = {}
        for label in self.classes:
            class_scores[label] = log(self.classes[label] / sum(self.classes.values()))
            for word in words:
                word_count = self.word_counts[label][word] + 1  # Laplace smoothing
                class_scores[label] += log(word_count / (self.class_totals[label] + len(self.vocab)))
        return max(class_scores, key=class_scores.get)

# Initialize and train the classifier
classifier = NaiveBayesClassifier()
classifier.train(training_data)

# Test the classifier on 5 sentences
test_sentences = [
    "I am very happy with this product.",
    "This is the worst purchase I have ever made.",
    "Absolutely fantastic, highly recommend it.",
    "I am so disappointed, this is terrible.",
    "This is a great product, very satisfied."
]

# Predict and print the results
for sentence in test_sentences:
    prediction = classifier.predict(sentence)
    print(f"Sentence: {sentence}\nPredicted Sentiment: {prediction}\n")

Sentence: I am very happy with this product.
Predicted Sentiment: positive

Sentence: This is the worst purchase I have ever made.
Predicted Sentiment: negative

Sentence: Absolutely fantastic, highly recommend it.
Predicted Sentiment: positive

Sentence: I am so disappointed, this is terrible.
Predicted Sentiment: negative

Sentence: This is a great product, very satisfied.
Predicted Sentiment: positive



In [11]:
from collections import defaultdict

# Initialize dictionaries to hold word counts for positive and negative sentences
positive_word_counts = defaultdict(int)
negative_word_counts = defaultdict(int)

# Count the words in positive and negative sentences
for sentence, sentiment in training_data:
    words = preprocess(sentence).split()
    if sentiment == 'positive':
        for word in words:
            positive_word_counts[word] += 1
    else:
        for word in words:
            negative_word_counts[word] += 1

# Combine the counts into a single dictionary for display
word_counts = {}
for word in set(positive_word_counts.keys()).union(set(negative_word_counts.keys())):
    word_counts[word] = {
        'positive': positive_word_counts[word],
        'negative': negative_word_counts[word]
    }

# Display the word counts
for word, counts in word_counts.items():
    print(f"Word: {word}, Positive: {counts['positive']}, Negative: {counts['negative']}")

Word: quality., Positive: 1, Negative: 1
Word: wonderful,, Positive: 1, Negative: 0
Word: again., Positive: 1, Negative: 1
Word: terrible., Positive: 0, Negative: 1
Word: exactly, Positive: 1, Negative: 0
Word: will, Positive: 1, Negative: 1
Word: pleased, Positive: 1, Negative: 0
Word: product,, Positive: 2, Negative: 2
Word: best, Positive: 1, Negative: 0
Word: not, Positive: 0, Negative: 2
Word: satisfied, Positive: 1, Negative: 0
Word: terrible, Positive: 0, Negative: 1
Word: extremely, Positive: 1, Negative: 1
Word: have, Positive: 1, Negative: 1
Word: highly, Positive: 1, Negative: 0
Word: great, Positive: 1, Negative: 0
Word: awful!, Positive: 0, Negative: 1
Word: ., Positive: 2, Negative: 2
Word: needed,, Positive: 1, Negative: 1
Word: ever, Positive: 1, Negative: 1
Word: recommend, Positive: 1, Negative: 1
Word: hate, Positive: 0, Negative: 1
Word: purchase., Positive: 1, Negative: 1
Word: disgusted, Positive: 0, Negative: 1
Word: with, Positive: 5, Negative: 5
Word: horrible,