<a href="https://colab.research.google.com/github/manikanta741/Data-Science/blob/main/Spamdetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [114]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


import pandas as pd

# Load the dataset with proper encoding
data = pd.read_csv("/content/spam.csv", encoding='latin-1')

# Check the first few rows
print(data.head())
print(data.isnull().sum())

print(data.columns)


# Rename columns to meaningful names
data = data[['v1', 'v2']]  # Keep only useful columns
data.columns = ['label', 'message']  # Rename columns

# Display first few rows after renaming
print(data.head())


data.rename(columns={'v1': 'label', 'v2': 'message'}, inplace=True)
# Convert labels to binary
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Check for missing values
print(data.isnull().sum())

# Display first few rows
print(data.head())


# Drop any NaN rows (if found)
data.dropna(inplace=True)

# Ensure label column is of integer type
data['label'] = data['label'].astype(int)

import nltk
nltk.download('stopwords')
nltk.download('punkt', force=True)


import nltk.data
tokenizer_path = nltk.data.find('tokenizers/punkt')
print("Punkt Tokenizer Found at:", tokenizer_path)
nltk.download('punkt')


import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary data
nltk.download('stopwords')
nltk.download('punkt',force=True)

# Define text preprocessing function
def clean_text(text):
    if isinstance(text, float):  # Check if text is NaN
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    words = text.split()

    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(words)

# Apply cleaning function to messages
data['cleaned_message'] = data['message'].apply(clean_text)

# Display sample cleaned messages
print(data[['message', 'cleaned_message']].head())

# 🚀 9️⃣ Define features (X) and target (y)
x= data['cleaned_message']  # Features (cleaned text)
y = data['label']  # Labels (0 for ham, 1 for spam)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
print(y_train.value_counts())  # Now should have both 0 and 1


vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)  # Fit & transform training data
x_test_tfidf = vectorizer.transform(x_test)  # Transform test data



model = LogisticRegression()
model.fit(x_train_tfidf, y_train)


y_pred = model.predict(x_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


# New message for prediction
new_message = ["You have won a lottery! Claim now."]

# Convert to TF-IDF format
new_message_tfidf = vectorizer.transform(new_message)

# Predict using the trained model
prediction = model.predict(new_message_tfidf)

# Display result
print("Spam" if prediction[0] == 1 else "Ham")



     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to wi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'v1': 'label', 'v2': 'message'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['label'].map({'ham': 0, 'spam': 1})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

Punkt Tokenizer Found at: /root/nltk_data/tokenizers/punkt


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                     cleaned_message  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry 2 wkly comp win fa cup final tkts 2...  
3                u dun say early hor u c already say  
4             nah think goes usf lives around though  
(4457,)
(1115,)
(4457,)
(1115,)
label
0    3859
1     598
Name: count, dtype: int64
Model Accuracy: 96.95%
Spam


In [115]:
# New message for prediction
new_message = ["Hey, are you coming to the party tonight?"]

# Convert to TF-IDF format
new_message_tfidf = vectorizer.transform(new_message)

# Predict using the trained model
prediction = model.predict(new_message_tfidf)

# Display result
print("Spam" if prediction[0] == 1 else "Ham")


Ham
