In [1]:
import nltk
import numpy as np
import pandas as pd

In [2]:
import chardet
with open('Downloads\spam.csv', 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'Windows-1252', 'confidence': 0.7270322499829184, 'language': ''}

In [3]:
dataset = pd.read_csv('Downloads\spam.csv',encoding='Windows-1252')

In [4]:
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
dataset = dataset[['v1','v2']]

In [6]:
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
#Exploring our dataset 

In [8]:
print(dataset.shape)

(5572, 2)


In [9]:
dataset.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [10]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [11]:
print('Out of {} rows, {} are spam, {} are ham'.format(len(dataset),len(dataset[dataset['v1']=='spam']),
                                                       len(dataset[dataset['v1']=='ham'])))

Out of 5572 rows, 747 are spam, 4825 are ham


In [12]:
# checing for number of null values
print('Number of null values in labels {}'.format(dataset['v1'].isnull().sum()))
print('Number of null values in labels {}'.format(dataset['v2'].isnull().sum()))

Number of null values in labels 0
Number of null values in labels 0


In [13]:
# How to use regular expression
# using regular expression in python
import re

In [14]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# preprocessing
# cleaning our text 
# step1 removing puncation 
# step2 tokanization
# step3 removing stopwords
# step4 stemming 

In [16]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
X = list()
for i in range(0,5572):
    temp = list()
    dataset['v2'].loc[i] = re.sub(r'[^\w]', ' ', dataset['v2'].loc[i])
    temp = re.split("\s",dataset['v2'].loc[i])
    temp = ' '.join(temp).split()
    temp = [x.lower() for x in temp]
    temp = [word for word in temp if not word in stop_words]
#     temp = [ps.stem(x) for x in temp]
    temp = [wn.lemmatize(x) for x in temp]
    sent = ' '.join(temp)
    X.append(sent)
    dataset['v2'].loc[i] = sent

In [17]:
print(dataset.head())
print(X[0])

     v1                                                 v2
0   ham  go jurong point crazy available bugis n great ...
1   ham                            ok lar joking wif u oni
2  spam  free entry 2 wkly comp win fa cup final tkts 2...
3   ham                u dun say early hor u c already say
4   ham                nah think go usf life around though
go jurong point crazy available bugis n great world la e buffet cine got amore wat


In [18]:
# vectorizing raw data: count vectorization
# count vectorizaion 
# create a document matrix where the entry of each cell will
# be a count of the number of times that word occured in that
# document 

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
count_vect.fit(dataset['v2'])
vector = count_vect.transform(dataset['v2'])
vector.toarray()
vector[0].toarray()
dataset.head

<bound method NDFrame.head of         v1                                                 v2
0      ham  go jurong point crazy available bugis n great ...
1      ham                            ok lar joking wif u oni
2     spam  free entry 2 wkly comp win fa cup final tkts 2...
3      ham                u dun say early hor u c already say
4      ham                nah think go usf life around though
...    ...                                                ...
5567  spam  2nd time tried 2 contact u u å 750 pound prize...
5568   ham                       ì_ b going esplanade fr home
5569   ham                               pity mood suggestion
5570   ham  guy bitching acted like interested buying some...
5571   ham                                     rofl true name

[5572 rows x 2 columns]>

In [20]:
df = pd.DataFrame(vector.toarray())

In [21]:
vector.shape

(5572, 8014)

In [22]:
count_vect.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

In [23]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8004,8005,8006,8007,8008,8009,8010,8011,8012,8013
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
X = df.iloc[:,:]

In [25]:
y = dataset.iloc[:,:1].values

In [26]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)
y

  y = column_or_1d(y, warn=True)


array([0, 0, 1, ..., 0, 0, 0])

In [27]:
print(X.shape)
print(y.shape)

(5572, 8014)
(5572,)


In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

In [29]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179, 8014)
(1393, 8014)
(4179,)
(1393,)


In [30]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)

In [31]:
clf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [32]:
y_pred = clf.predict(X_test)

In [33]:
print("Test set accuracy: {:.2f}".format(clf.score(X_test, y_test)))

Test set accuracy: 0.92


In [34]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [35]:
cm

array([[1196,    0],
       [ 109,   88]], dtype=int64)