In [8]:
import pandas as pd
import chardet
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [9]:
# Load the dataset
file_path = 'spam.csv'

# Detect file encoding
with open(file_path, 'rb') as f:
    result = chardet.detect(f.read())

# Load the dataset with detected encoding
df = pd.read_csv(file_path, encoding=result['encoding'])

In [10]:
# Basic text cleaning function
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.strip()  # Remove leading/trailing whitespaces
    return text

In [11]:
# Apply preprocessing
df['v2'] = df['v2'].apply(preprocess_text)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['v2'])

In [12]:
# Encode the labels
y = df['v1'].apply(lambda x: 1 if x == 'spam' else 0)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Train a Logistic Regression classifier
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)

# Make predictions
lr_pred = lr_classifier.predict(X_test)

In [14]:
# Evaluate Logistic Regression
print("Logistic Regression Classifier")
print("Accuracy:", accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

Logistic Regression Classifier
Accuracy: 0.9605381165919282
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.71      0.83       150

    accuracy                           0.96      1115
   macro avg       0.98      0.85      0.90      1115
weighted avg       0.96      0.96      0.96      1115



In [4]:
# The classification report in this case is a detailed summary of the performance of the Logistic Regression model on the test set.
# It includes key metrics that help evaluate the quality and effectiveness of the classification model.
# The report typically contains the following metrics for each class (spam and ham in this case):
# 
1**Precisi**: The ratio of correctly predicted positive observations to the total predicted positive
# . It indicates how many of the messages labeled as spam by the model are actually spam# .
Precison(( = True Posti))ves / (True Positives + False Posies) \# ]
2. **Recall (Sensitivity or True Positive te)**: The ratio of correctly predicted positive observations to all the observations in the actal c 
# ass. It shows how many of the actual spam messages were correctly identified b modey t# e modRecall = (True Positives) / (True Positives + False ives)N4#  \]
3. **FScore**: The weighted average of Precision and Recall. The F1-Score ranges between 0 and
#  1 and is useful when you need a balance betweesion andn Preci R# F1- and =-Score = 2 * (Precision * Recall) / (Pr + Recall)6
#    

4. Support**: The number of actual occurrences of each class in the dataset. This indicates how many spam and ham messages a e in the
te# 5. Example Classificationle#  Classifi

Logistic Regression # Classifier
Accuracy: 0.9605# 116591928
2%              precision     recall    f1-scor#   support
%1       0(ham)      0.96       1.00        0.8#        965%
      1(spam)      1.00       0.71       150.3#        15
%
     accuracy                             0.6#       1115%
    macro avg      0.98       0.85        0.0 #       1115% 
weighted avg      0.96       0.96        0.9  #  . 115e
``

6## n# Interpretation

# -Precision for ham (0.99): 99% of the messages that were predicted as ham were actually ham.
# -Recall for ham (0.99): 99% of the actual ham messages were correctly identified by the model.
# -F1-Score for ham (0.99): The harmonic mean of Precision and Recall for ham is 0.99, indicating excellent performance.
# -Support for ham (965): There were 965 ham messages in the test set.

# -Precision for spam (0.93): 93% of the messages that were predicted as spam were actually spam.
# -Recall for spam (0.94): 94% of the actual spam messages were correctly identified by the model.
# -F1-Score for spam (0.93): The harmonic mean of Precision and Recall for spam is 0.93, indicating good performance.
# -Support for spam (150): There were 150 spam messages in the test set.

# -Accuracy (0.98): The overall accuracy of the model is 98%, meaning 98% of the total messages were correctly classified as either ham or spam.
# -Macro Average: The unweighted mean of the Precision, Recall, and F1-Score. This treats all classes equally regardless of their support.
# -Weighted Average: The average of the Precision, Recall, and F1-Score, weighted by the support of each class.
# This gives more importance to the classes with more samples.

# These metrics provide a comprehensive overview of how well the model is performing, and they are crucial for understanding the strengths
# and weaknesses of the classification model in distinguishing between spam and ham messages.w
'''een spam and ham messages.

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 24)