<a href="https://colab.research.google.com/github/pavankumarallu/K_ML/blob/main/spam_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
# The unused sub-directory list is bound to `_subdirs` rather than `_`: at
# notebook top level `_` is IPython's "last output" variable, and assigning to
# it stops IPython from updating it for the rest of the session.
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sms-spam-collection-dataset/spam.csv


In [None]:
#NLP
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

#Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts

#Model
from sklearn.svm import SVC
from sklearn.metrics import f1_score

In [None]:
# Load spam.csv into `df`, decoding the file as Latin-1.
df = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')

In [None]:
# Preview the first 10 rows of the raw frame.
df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


# Data Cleaning

In [None]:
# Drop the columns at positions 2-4 (the 'Unnamed: 2'..'Unnamed: 4' columns in
# the preview), leaving the first two columns in place.
# A positional slice is used instead of indexing each position so the cell is
# safe to re-run: once the columns are gone the slice is empty and nothing is
# dropped, whereas df.columns[2] would raise IndexError.
df = df.drop(columns=df.columns[2:5])

# Preprocessing

In [None]:
# Replace the string labels in 'v1' with integer codes and keep a lookup from
# each integer code back to its original label.
encoder = LabelEncoder()
df['v1'] = encoder.fit_transform(df['v1'])

class_mappings = dict(enumerate(encoder.classes_))

In [None]:
# Display the integer code -> label lookup built from encoder.classes_.
class_mappings

{0: 'ham', 1: 'spam'}

### Stemming words

In [None]:
def processEmail(contents):
    """Normalise a raw message and return its list of stemmed word tokens.

    The text is lower-cased, then HTML-like tags, digit runs, URLs, e-mail
    addresses and dollar signs are replaced by placeholders, in that order.
    The result is tokenised with word_tokenize; each token is stripped of
    non-alphanumeric characters and Porter-stemmed, and tokens that end up
    empty are discarded.
    """
    # (pattern, replacement) pairs, applied one after another; order matters
    # because later patterns see the output of earlier substitutions.
    substitutions = (
        (r'<[^<>]+>', ' '),
        (r'[0-9]+', 'number'),
        (r'(http|https)://[^\s]*', 'httpaddr'),
        (r'[^\s]+@[^\s]+', 'emailaddr'),
        (r'[$]+', 'dollar'),
    )

    text = contents.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    stemmer = PorterStemmer()
    # Strip punctuation from each token first, then stem what is left.
    stemmed = (
        stemmer.stem(re.sub(r'[^a-zA-Z0-9]', '', token))
        for token in word_tokenize(text)
    )

    return [word for word in stemmed if len(word) >= 1]

In [None]:
def getVocabulary(emails, vocab_length):
    """Build an index -> word vocabulary of the most frequent tokens.

    Every message is tokenised with processEmail and token occurrences are
    counted over the whole collection. The `vocab_length` most frequent
    tokens are kept and numbered from 0 in descending order of frequency;
    tokens with equal counts keep the order in which they were first seen.

    Args:
        emails: iterable of raw message strings. It is only read; the
            caller's collection is never modified.
        vocab_length: maximum number of vocabulary entries to keep.

    Returns:
        dict mapping rank (0 = most frequent) to token. Empty when `emails`
        yields no tokens.
    """
    counts = {}
    for email in emails:
        # Tokenise into a local value instead of writing the token list back
        # into `emails`, so the argument is left exactly as it was passed in.
        for word in processEmail(email):
            counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so ties stay in first-seen (insertion) order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    return {index: word for index, (word, _count) in enumerate(ranked[:vocab_length])}


In [None]:
# Build and display the vocabulary (at most 2500 entries) for the 'v2' column.
getVocabulary(df['v2'].to_list(),2500)

{0: 'i',
 1: 'number',
 2: 'to',
 3: 'you',
 4: 'a',
 5: 'the',
 6: 'u',
 7: 'and',
 8: 'it',
 9: 'is',
 10: 'in',
 11: 'me',
 12: 'my',
 13: 'for',
 14: 'your',
 15: 'call',
 16: 'have',
 17: 'do',
 18: 'that',
 19: 'of',
 20: 'on',
 21: 's',
 22: 'are',
 23: 'now',
 24: 'go',
 25: 'so',
 26: 'get',
 27: 'not',
 28: 'but',
 29: 'be',
 30: 'or',
 31: 'm',
 32: 'can',
 33: 'at',
 34: 'we',
 35: 'ur',
 36: 'will',
 37: 'if',
 38: 'with',
 39: 'nt',
 40: 'just',
 41: 'no',
 42: 'thi',
 43: 'how',
 44: 'gt',
 45: 'lt',
 46: 'up',
 47: 'what',
 48: 'come',
 49: 'when',
 50: 'from',
 51: 'ok',
 52: 'free',
 53: 'all',
 54: 'know',
 55: 'out',
 56: 'like',
 57: 'got',
 58: 'love',
 59: 'time',
 60: 'wa',
 61: 'day',
 62: 'want',
 63: 'good',
 64: 'll',
 65: 'then',
 66: 'there',
 67: 'text',
 68: 'am',
 69: 'he',
 70: 'onli',
 71: 'send',
 72: 'hi',
 73: 'need',
 74: 'one',
 75: 'txt',
 76: 'as',
 77: 'today',
 78: 'by',
 79: 'see',
 80: 'think',
 81: 'about',
 82: 'take',
 83: 'she',
 84: 'h

In [None]:
def getKey(dictionary, val):
    """Return the first key of `dictionary` whose value equals `val`.

    Entries are scanned in the dictionary's iteration order. None is returned
    when no value matches.
    """
    matching_keys = (key for key, value in dictionary.items() if value == val)
    return next(matching_keys, None)

In [None]:
def getIndices(email, vocabulary):
    """Return the set of vocabulary indices of the words found in an email.

    Args:
        email: iterable of word tokens (the notebook passes processEmail
            output).
        vocabulary: dict mapping index -> word.

    Returns:
        set of the keys of `vocabulary` whose word occurs in `email`. Words
        that are not in the vocabulary are ignored, and repeated words
        contribute their index once.
    """
    # Invert the vocabulary once so each token costs a single hash lookup
    # instead of a scan of all values followed by a second scan for the key.
    # setdefault keeps the first key for a repeated word, which is the key a
    # front-to-back search of the dictionary would return.
    index_of = {}
    for index, word in vocabulary.items():
        index_of.setdefault(word, index)

    return {index_of[word] for word in email if word in index_of}

In [None]:
def getFeatureVector(word_indices, vocab_length):
    """Return a float vector of length `vocab_length` with 1 at each given index.

    All positions start at 0; every position listed in `word_indices` is set
    to 1. Repeated indices have no extra effect.
    """
    feature_vec = np.zeros(vocab_length)
    # Set all requested positions in one indexed assignment.
    feature_vec[list(word_indices)] = 1
    return feature_vec

In [None]:
# Vocabulary (index -> word, at most 2500 entries) for the 'v2' message column.
vocabulary = getVocabulary(df['v2'].to_list(), 2500)

# Each message turned into its list of processed word tokens.
emails = [processEmail(message) for message in df['v2'].to_list()]


In [None]:
# One length-2500 indicator vector per processed message.
X = [getFeatureVector(getIndices(email, vocabulary), 2500) for email in emails]

In [None]:
# Stack the feature vectors into a 2-D int16 array and wrap it in a DataFrame.
X = pd.DataFrame(np.array(X, dtype=np.int16))

In [None]:
# Display the feature DataFrame (one row per message, one column per vocabulary index).
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,1,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,1,0,1,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Target: the 'v1' label column (integer-encoded by the LabelEncoder cell).
Y = df['v1']

# Training

In [None]:
# Split into 80% train and the remainder test; random_state=0 fixes the shuffle.
x_train,x_test,y_train,y_test = tts(X,Y,train_size=0.8,random_state=0)

In [None]:
# Support-vector classifier with default settings, fitted on the training split.
model = SVC()
model.fit(x_train,y_train)

SVC()

# Performance

In [None]:
# Score the fitted model on the held-out test split
# (for scikit-learn classifiers, `score` reports mean accuracy).
model.score(x_test,y_test)

0.9811659192825112

In [None]:
# Predicted labels for the test split.
y_pred = model.predict(x_test)

In [None]:
# F1 score of the test-split predictions against the true labels (default arguments).
f1_score(y_test,y_pred)

0.9329073482428114