<a href="https://colab.research.google.com/github/pavithra071105/AI_day3/blob/main/day4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



In [8]:
# Single seed reused by every randomized step in this notebook
# (row sampling, shuffling, the train/test split, and the classifier).
RANDOM_STATE = 42

In [9]:
# Remote tab-separated file with two fields per row. header=None means the
# first line is read as data, and the column names are supplied here instead.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
data = pd.read_csv(
    url,
    sep='\t',
    names=['label', 'message'],
    header=None,
)


In [10]:
# Encode the text labels as integers for scikit-learn: ham -> 0, spam -> 1.
label_to_num = {'ham': 0, 'spam': 1}
data['label_num'] = data['label'].map(label_to_num)

# Series.map leaves NaN for any value that is missing from the dict. Fail
# loudly here rather than letting NaN targets reach the stratified split or
# the classifier, where the resulting error would be much harder to trace.
unmapped_mask = data['label_num'].isna()
if unmapped_mask.any():
    unexpected = sorted(data.loc[unmapped_mask, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values (cannot encode): {unexpected}")



In [11]:
# Total number of messages to keep in the working subsample.
TARGET_N = 2000

# Stop early when the loaded corpus is smaller than the requested subsample.
if TARGET_N > len(data):
    raise ValueError(
        f"Dataset has only {len(data)} messages — need at least {TARGET_N}."
    )


In [12]:
# Relative frequency of each label in the full corpus.
prop = data['label'].value_counts(normalize=True)

# Spam gets its proportional share of TARGET_N (rounded to the nearest
# integer); ham takes whatever remains so the two always sum to TARGET_N.
n_spam = int(round(TARGET_N * prop['spam']))
n_ham = TARGET_N - n_spam

# Draw each class separately with the shared seed.
sampled_spam = data.loc[data['label'].eq('spam')].sample(
    n=n_spam, random_state=RANDOM_STATE
)
sampled_ham = data.loc[data['label'].eq('ham')].sample(
    n=n_ham, random_state=RANDOM_STATE
)

# Stack ham then spam, shuffle the rows, and renumber the index from 0.
sampled = (
    pd.concat([sampled_ham, sampled_spam])
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)


In [13]:
# Features are the raw message text; the target is the numeric label column.
X, y = sampled['message'], sampled['label_num']

# Hold out exactly 500 messages for testing, stratified on y so the class mix
# is preserved in both partitions.
X_rest, X_test, y_rest, y_test = train_test_split(
    X,
    y,
    test_size=500,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Everything that was not held out (the remaining 1500 rows) is the training set.
X_train = X_rest
y_train = y_rest


In [14]:
# Sanity check on the split: partition sizes first, then the mean of the
# numeric label in each partition (reported below as the spam ratio).
split_sizes = (len(X_train), len(X_test))
print("Train size:", split_sizes[0], "Test size:", split_sizes[1])

spam_share_train, spam_share_test = y_train.mean(), y_test.mean()
print("Spam ratio in train:", spam_share_train, " Spam ratio in test:", spam_share_test)


Train size: 1500 Test size: 500
Spam ratio in train: 0.134  Spam ratio in test: 0.134


In [15]:
# TF-IDF features: English stop words removed, vocabulary capped at 3000 terms.
tfidf = TfidfVectorizer(
    max_features=3000,
    stop_words='english',
)

# The vectorizer is fitted on the training messages only; the test messages
# are projected with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [16]:

# Logistic-regression classifier on the TF-IDF features, with a raised
# iteration cap and the shared seed.
model = LogisticRegression(
    random_state=RANDOM_STATE,
    max_iter=2000,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(X_train_tfidf, y_train)


In [17]:
# Predict labels for the held-out messages and report the fraction that
# match the true labels.
y_pred = model.predict(X_test_tfidf)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("\nModel Accuracy:", acc)



Model Accuracy: 0.916


In [18]:

print("\nCategories: ['Not Spam', 'Spam']\n")

# How many test messages to score for the probability listing. The original
# inline comment said "30" while the code sliced 300; the executed value (300)
# is kept here and given a name so it can be tuned in one place.
N_SAMPLE_MESSAGES = 300

# Reset to a 0..n-1 index so position i here matches row i of `probabilities`,
# then take the first N_SAMPLE_MESSAGES test messages by position.
sample_messages = X_test.reset_index(drop=True).iloc[:N_SAMPLE_MESSAGES]
sample_vectors = tfidf.transform(sample_messages)

# One row per message; column 1 is the probability of class 1 (spam).
probabilities = model.predict_proba(sample_vectors)


Categories: ['Not Spam', 'Spam']



In [19]:
# One line per scored message, numbered from 1, with the spam probability
# shown both as a fraction and as a percentage.
for i, prob in enumerate(probabilities, start=1):
    spam_prob = prob[1]  # column 1 = probability of class 1 (spam)
    # Anything strictly above 0.5 is called spam; 0.5 itself is not.
    if spam_prob > 0.5:
        prediction = "SPAM"
    else:
        prediction = "NOT SPAM"
    print(f"Email {i}: Sigmoid Output = {spam_prob:.4f} ({spam_prob*100:.1f}%) -> Prediction: {prediction}")


Email 1: Sigmoid Output = 0.0597 (6.0%) -> Prediction: NOT SPAM
Email 2: Sigmoid Output = 0.0588 (5.9%) -> Prediction: NOT SPAM
Email 3: Sigmoid Output = 0.0426 (4.3%) -> Prediction: NOT SPAM
Email 4: Sigmoid Output = 0.0213 (2.1%) -> Prediction: NOT SPAM
Email 5: Sigmoid Output = 0.0832 (8.3%) -> Prediction: NOT SPAM
Email 6: Sigmoid Output = 0.0704 (7.0%) -> Prediction: NOT SPAM
Email 7: Sigmoid Output = 0.0382 (3.8%) -> Prediction: NOT SPAM
Email 8: Sigmoid Output = 0.6879 (68.8%) -> Prediction: SPAM
Email 9: Sigmoid Output = 0.1359 (13.6%) -> Prediction: NOT SPAM
Email 10: Sigmoid Output = 0.0496 (5.0%) -> Prediction: NOT SPAM
Email 11: Sigmoid Output = 0.0652 (6.5%) -> Prediction: NOT SPAM
Email 12: Sigmoid Output = 0.0913 (9.1%) -> Prediction: NOT SPAM
Email 13: Sigmoid Output = 0.5395 (54.0%) -> Prediction: SPAM
Email 14: Sigmoid Output = 0.0880 (8.8%) -> Prediction: NOT SPAM
Email 15: Sigmoid Output = 0.0513 (5.1%) -> Prediction: NOT SPAM
Email 16: Sigmoid Output = 0.0845 (8.5%

In [20]:
print("\n" + "="*60 + "\nSample test message, true label, spam-probability:\n")

# Reset the label index once, outside the loop, so that position i lines up
# with sample_messages / probabilities (both are positional views of X_test).
y_test_positional = y_test.reset_index(drop=True)

# Show up to 5 messages; the bound avoids an IndexError if fewer were sampled.
n_to_show = min(5, len(sample_messages))
for i in range(n_to_show):
    msg = sample_messages.iloc[i]
    true_label = "SPAM" if y_test_positional.iloc[i]==1 else "NOT SPAM"
    spam_prob = probabilities[i][1]  # probability of class 1 (spam)
    print(f"Message {i+1} (True: {true_label}, Prob(spam)={spam_prob:.3f}):\n  {msg}\n")



Sample test message, true label, spam-probability:

Message 1 (True: NOT SPAM, Prob(spam)=0.060):
  Will be out of class in a few hours. Sorry

Message 2 (True: NOT SPAM, Prob(spam)=0.059):
  Yup i thk they r e teacher said that will make my face look longer. Darren ask me not 2 cut too short.

Message 3 (True: NOT SPAM, Prob(spam)=0.043):
  Yar i wanted 2 scold u yest but late already... I where got zhong se qing you? If u ask me b4 he ask me then i'll go out w u all lor. N u still can act so real.

Message 4 (True: NOT SPAM, Prob(spam)=0.021):
  I know where the  &lt;#&gt;  is, I'll be there around 5

Message 5 (True: NOT SPAM, Prob(spam)=0.083):
  Nothing just getting msgs by dis name wit different no's..

