In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


# Data Cleaning

In [None]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df.rename(columns={"v1": "target", "v2": "text"}, inplace= True)

# Data Pre-Processing

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

df['target'] = encoder.fit_transform(df['target'])
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  5572 non-null   int64 
 1   text    5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [None]:
df.duplicated().sum()

np.int64(403)

In [None]:
df.drop_duplicates(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  5169 non-null   int64 
 1   text    5169 non-null   object
dtypes: int64(1), object(1)
memory usage: 121.1+ KB


# Featuer Engineering

In [None]:
from nltk.stem.porter import PorterStemmer
import string
ps = PorterStemmer()


In [None]:
def transform_text(text):
    # 1. Convert to lowercase
    text = text.lower()
    
    # 2. Tokenize using NLTK
    tokens = nltk.word_tokenize(text)
    
    # 3. Remove non-alphanumeric tokens
    alphanumeric_tokens = [token for token in tokens if token.isalnum()]
    
    # 4. Remove stopwords and punctuation
    filtered_tokens = [
        token for token in alphanumeric_tokens
        if token not in stopwords.words('english') and token not in string.punctuation
    ]
    
    # 5. Stemming
    stemmed_tokens = [ps.stem(token) for token in filtered_tokens]
    
    # 6. Join back into a single string
    return " ".join(stemmed_tokens)

In [None]:
df['text'][5]

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"

In [None]:
print(transform_text("FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\DELL/nltk_data'
    - 'f:\\Data Science\\MLOPS\\mlops_complete_pipeline_with_dvc_and_sagemaker\\base\\nltk_data'
    - 'f:\\Data Science\\MLOPS\\mlops_complete_pipeline_with_dvc_and_sagemaker\\base\\share\\nltk_data'
    - 'f:\\Data Science\\MLOPS\\mlops_complete_pipeline_with_dvc_and_sagemaker\\base\\lib\\nltk_data'
    - 'C:\\Users\\DELL\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:/Users/DELL/AppData/Roaming/nltk_data'
    - 'C:/Users/DELL/AppData/Roaming/nltk_data'
    - 'C:\\Users\\DELL\\AppData\\Roaming\\nltk_data'
    - 'C:\\Users\\DELL\\AppData\\Roaming\\nltk_data'
    - 'C:\\Users\\DELL\\AppData\\Roaming\\nltk_data\\tokenizers\\punkt'
    - 'C:/Users/DELL/AppData/Roaming/nltk_data/tokenizers/punkt/'
    - 'C:/Users/DELL/AppData/Roaming/nltk_data'
    - 'C:/Users/DELL/AppData/Roaming/nltk_data/tokenizers/punkt/'
    - 'C:/Users/DELL/AppData/Roaming/nltk_data/tokenizers/punkt/'
    - 'C:/Users/DELL/AppData/Roaming/nltk_data'
    - 'C:/Users/DELL/AppData/Roaming/nltk_data/tokenizers/punkt/'
**********************************************************************
