<a href="https://colab.research.google.com/github/nijatmaharramov/My-Projects/blob/main/Spam_email_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing and understanding the data

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features.

In [None]:
# Location of the raw CSV. Held in a named constant so the path can be changed in
# one place when the notebook is run somewhere other than Colab.
# NOTE(review): the path points inside Google Drive, but no Drive-mount step is
# visible in this notebook — confirm Drive is mounted before this cell runs.
DATA_PATH = '/content/drive/MyDrive/mail_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Show the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0,0
Category,0
Message,0


In [None]:
# Keep only the rows whose label is 'spam'.
df.loc[df['Category'] == 'spam']

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [None]:
# Rows per label. The recorded output shows far more 'ham' than 'spam' rows,
# i.e. the classes are imbalanced.
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


# Label encoding

- spam email = 1
- ham email = 0

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the text labels, then store the integer codes in a new column.
# In the preview below, 'ham' rows carry 0 and the 'spam' row carries 1.
le = LabelEncoder().fit(df['Category'])
df['Category_encoded'] = le.transform(df['Category'])
df.head()

Unnamed: 0,Category,Message,Category_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Drop the text label now that 'Category_encoded' carries the same information.
# `columns=` already names the axis, so the redundant `axis=1` is removed.
# Rebinding `df` (instead of inplace=True) and ignoring an already-missing column
# keeps this cell safe to run more than once.
df = df.drop(columns=['Category'], errors='ignore')
df.head()

Unnamed: 0,Message,Category_encoded
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Splitting data to labels and content
# Message text is the model input; the encoded label is the target.
X, y = df['Message'], df['Category_encoded']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Show the four resulting shapes as one tuple.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4457,), (1115,), (4457,), (1115,))

# Feature extraction (TF-IDF vectorizer)

In [None]:
# TF-IDF features over lower-cased text with English stop words removed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=1,
)

In [None]:
# Fit the vectorizer on the training messages only and transform them in one step...
X_train_vectorized = vectorizer.fit_transform(X_train)
# ...then apply the already-fitted vectorizer to the test messages, so the test
# set plays no part in fitting.
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Raw training messages (text, before vectorization).
X_train

Unnamed: 0,Message
1978,Reply to win £100 weekly! Where will the 2006 ...
3989,Hello. Sort of out in town already. That . So ...
3935,How come guoyang go n tell her? Then u told her?
4078,Hey sathya till now we dint meet not even a si...
4086,Orange brings you ringtones from all time Char...
...,...
3772,"Hi, wlcome back, did wonder if you got eaten b..."
5191,"Sorry, I'll call later"
5226,Prabha..i'm soryda..realy..frm heart i'm sory
5390,Nt joking seriously i told


In [None]:
# Vectorized training set; the recorded output shows a sparse CSR matrix.
X_train_vectorized

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34840 stored elements and shape (4457, 7440)>

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train with the estimator's default settings, then score on the held-out set.
lr = LogisticRegression().fit(X_train_vectorized, y_train)
lr.score(X_test_vectorized, y_test)

0.967713004484305

In [None]:
# Same score on the training data, for comparison with the test score above.
lr.score(X_train_vectorized, y_train)

0.9670181736594121

# Evaluating the model

In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out messages; these predictions are reused by the
# metric cells further down.
prediction_on_test_set = lr.predict(X_test_vectorized)
accuracy_on_test_set = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_set,
)
print(f'Accuracy on test set is: {accuracy_on_test_set}')

Accuracy on test set is: 0.967713004484305


In [None]:
# Repeat the accuracy check on the data the model was trained on.
prediction_on_train_set = lr.predict(X_train_vectorized)
accuracy_on_train_set = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_train_set,
)
print(f'Accuracy on train set is: {accuracy_on_train_set}')

Accuracy on train set is: 0.9670181736594121


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the test-set predictions with each metric in turn; the unpacking order
# matches the order of the metric functions.
accuracy, precision, recall, f1 = (
    metric(y_test, prediction_on_test_set)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric to four decimal places.
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')

Accuracy: 0.9677
Precision: 1.0000
Recall: 0.7584
F1-score: 0.8626


In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test-set predictions. In the recorded output the row
# sums equal the per-class support in the classification report, so rows are the
# true labels and columns the predicted ones.
cm = confusion_matrix(y_test, prediction_on_test_set)
display(cm)

array([[966,   0],
       [ 36, 113]])

In [None]:
from sklearn.metrics import classification_report

# Build the per-class report as a dict, then turn it into a frame with one row
# per class / summary entry.
report = classification_report(
    y_test,
    prediction_on_test_set,
    output_dict=True,
)
report_df = pd.DataFrame(report).T
report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.964072,1.0,0.981707,966.0
1,1.0,0.758389,0.862595,149.0
accuracy,0.967713,0.967713,0.967713,0.967713
macro avg,0.982036,0.879195,0.922151,1115.0
weighted avg,0.968873,0.967713,0.96579,1115.0


# Predictive system

In [None]:
def predict_message_category(message):
    """Classify a single message as 'ham' or 'spam'.

    Uses the notebook-level ``vectorizer`` and ``lr`` objects: the message is
    wrapped in a one-element list, vectorized, and passed to the classifier.

    Parameters
    ----------
    message : the text to classify.

    Returns
    -------
    str
        'ham' when the predicted label equals 0, otherwise 'spam'.
    """
    features = vectorizer.transform([message])
    label = lr.predict(features)[0]
    return 'ham' if label == 0 else 'spam'

In [None]:
# Run the classifier on one hand-picked message and print the predicted label.
# NOTE(review): this text starts the same way as dataset row 2 shown earlier, so
# it may be a message the model was trained on — confirm with an unseen message.
sample_message = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
pred = predict_message_category(sample_message)
print(f"The given email is predicted as: {pred}")

The given email is predicted as: spam
