# AT&T Spam Detector - BLOC 5
## PART 1 : Preprocessing
Developed by Myriam Goyet     
Contact : https://www.linkedin.com/in/myriamgoyet/

In [11]:
import pandas as pd
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.lang.en.stop_words import STOP_WORDS

# 1. Loading data

In [3]:
# Import the dataset from S3 bucket
data = pd.read_csv("https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Deep+Learning/project/spam.csv", encoding='ISO-8859-1' )
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Quick EDA

In [None]:
data.describe(include= "all")

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
data["v1"].value_counts(normalize=True) * 100
# 747 spams for 4825 hams. The spam messages are in the minority (13.4%)

v1
ham     86.593683
spam    13.406317
Name: proportion, dtype: float64

# 2. Global preprocessing

In [7]:
df1 = data.copy()

In [8]:
# Let's remove unneeded columns.
df1 = df1.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1)
# Convert target's values to numerical
df1["v1"] = df1["v1"].replace({"spam": 1, "ham": 0}).astype(int)
# rename columns explicitly
df1.rename (columns={"v1":"target"}, inplace=True)
df1.rename (columns={"v2":"messages"}, inplace=True)
df1

  df1["v1"] = df1["v1"].replace({"spam": 1, "ham": 0}).astype(int)


Unnamed: 0,target,messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [9]:
print(f"There are {df1.duplicated().sum()} duplicated values in the dataset")
# Drop duplicated values
df1.drop_duplicates(inplace=True)
print("data shape after cleaning : ",df1.shape)

There are 403 duplicated values in the dataset
data shape after cleaning :  (5169, 2)


In [10]:
word_counts = [len(seq.split()) for seq in df1["messages"]]
max(word_counts)

171

In [12]:
# cleaning the messages
df1.loc[:, "clean_data"] = df1["messages"].apply(lambda x: ''.join(ch for ch in x if ch.isalnum() or ch == " " or ch == "'")) # keep only alphanumeric characters
df1.loc[:, "clean_data"] = df1["clean_data"].apply(lambda x: ' '.join(x.split()))  # This replaces multiple spaces with a single space
df1.loc[:, "clean_data"] = df1["clean_data"].str.lower()  # Lowercase the text
df1.loc[:, "clean_data"] = df1["clean_data"].str.strip()  # Strip leading and trailing spaces

# Lemmatization and removal of stopwords
df1.loc[:, "clean_lemma"] = df1["clean_data"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if token.lemma_ not in STOP_WORDS and token.text not in STOP_WORDS]))

In [13]:
# ensure all clean_lemma and all clean_data are strings
df1['clean_lemma'] = df1['clean_lemma'].fillna('').astype(str)
df1['clean_data'] = df1['clean_data'].fillna('').astype(str)

In [14]:
word_counts = [len(seq.split()) for seq in df1["clean_lemma"]]
max(word_counts)

74

In [15]:
df1.head()

Unnamed: 0,target,messages,clean_data,clean_lemma
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,jurong point crazy available bugis n great wor...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,u dun early hor u c
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i don't think he goes to usf he lives arou...,nah think usf live


In [16]:
# Saving data preprocessed
df1.to_csv('AT&T_data_preprocessed.csv', index=False)