In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import string
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
spam=pd.read_csv('/content/spam1.csv',encoding='latin8')

In [5]:
spam

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [6]:
spam=spam.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

In [7]:
spam

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
spam=spam.rename(columns={'v1':'Label','v2':'sms'})

In [9]:
spam

Unnamed: 0,Label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [10]:
spam['sms'].isnull().sum()

0

In [11]:
spam.isnull().sum()

Label    0
sms      0
dtype: int64

In [12]:
# performing nlp
punctuations=re.compile(r'[^/.\w\s]')
spam['sms']=spam['sms'].replace(punctuations,'')

In [13]:
spam

Unnamed: 0,Label,sms
0,ham,Go until jurong point crazy.. Available only i...
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,Nah I dont think he goes to usf he lives aroun...
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home
5569,ham,Pity was in mood for that. So...any other sug...
5570,ham,The guy did some bitching but I acted like id ...


In [14]:
type(spam['sms'])

pandas.core.series.Series

In [15]:
def expand(phrase):
  phrase = re.sub(r"won\'t", "will not",phrase)
  phrase = re.sub(r"can\'t", "can not",phrase)
  phrase = re.sub(r"n\'t", " not",phrase)
  phrase = re.sub(r"\'re", " are",phrase)
  phrase = re.sub(r"\'s", " is",phrase)
  phrase = re.sub(r"\'d", " would",phrase)
  phrase = re.sub(r"\'ll", " will",phrase)
  phrase = re.sub(r"\'t", " not",phrase)
  phrase = re.sub(r"\'ve", " have",phrase)
  phrase = re.sub(r"\'m", " am",phrase)
  return phrase
spam['sms']=spam['sms'].map(expand)

In [16]:
spam['sms']=spam['sms'].map(str.lower)

In [17]:
from nltk.corpus import stopwords

In [18]:
stop_words=set(stopwords.words('english'))
stop_words.add('subject')
stop_words.add('http')
def remove_stopwords(text):
  return " ".join(word for word in str(text).split() if word not in stop_words)

In [19]:
spam['sms']=spam['sms'].map(remove_stopwords)

In [20]:
spam['sms']

0       go jurong point crazy.. available bugis n grea...
1                           ok lar... joking wif u oni...
2       free entry 2 wkly comp win fa cup final tkts 2...
3               u dun say early hor... u c already say...
4             nah dont think goes usf lives around though
                              ...                        
5567    2nd time tried 2 contact u. u å750 pound prize...
5568                         ì_ b going esplanade fr home
5569                 pity mood that. so...any suggestions
5570    guy bitching acted like id interested buying s...
5571                                      rofl. true name
Name: sms, Length: 5572, dtype: object

In [21]:
from nltk.stem import PorterStemmer

In [22]:
stemmer=PorterStemmer()
def stemmer_words(text):
  return " ".join(stemmer.stem(word) for word in text.split() )

In [23]:
spam['sms']=spam['sms'].map(stemmer_words)

In [24]:
spam

Unnamed: 0,Label,sms
0,ham,go jurong point crazy.. avail bugi n great wor...
1,ham,ok lar... joke wif u oni...
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor... u c alreadi say...
4,ham,nah dont think goe usf live around though
...,...,...
5567,spam,2nd time tri 2 contact u. u å750 pound prize. ...
5568,ham,ì_ b go esplanad fr home
5569,ham,piti mood that. so...ani suggest
5570,ham,guy bitch act like id interest buy someth els ...


In [25]:
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [26]:
from nltk.stem import WordNetLemmatizer

In [27]:
lemmatizer=WordNetLemmatizer()
def word_lematizer(text):
  return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

In [28]:
spam['sms']=spam['sms'].map(word_lematizer)

In [29]:
spam['sms'].values

array(['go jurong point crazy.. avail bugi n great world la e buffet... cine got amor wat...',
       'ok lar... joke wif u oni...',
       'free entri 2 wkli comp win fa cup final tkt 21st may 2005. text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18',
       ..., 'piti mood that. so...ani suggest',
       'guy bitch act like id interest buy someth el next week gave u free',
       'rofl. true name'], dtype=object)

In [None]:
[x for x in spam['sms']]

In [31]:
x=spam['sms']
y=spam['Label']

In [33]:
vectorizer=CountVectorizer()

In [None]:
x_vec=vectorizer.fit_transform(x)
vectorizer.vocabulary_

In [35]:
(x_train,x_test,y_train,y_test)=train_test_split(x_vec,y)

In [36]:
x_train

<4179x8319 sparse matrix of type '<class 'numpy.int64'>'
	with 36791 stored elements in Compressed Sparse Row format>

In [51]:
# Used Voting Class Classier

clf_1=KNeighborsClassifier(n_neighbors=3,metric='euclidean')
clf_2=GaussianNB()
clf_3=SVC(C=100,kernel='rbf')
clf_4=RandomForestClassifier(n_estimators=50,random_state=1)

vc=VotingClassifier(estimators=[('knn',clf_1),('nb',clf_2),('svm',clf_3),('rfc',clf_4)],voting='hard') 
vc.fit(x_train.toarray(),y_train)

VotingClassifier(estimators=[('knn',
                              KNeighborsClassifier(metric='euclidean',
                                                   n_neighbors=3)),
                             ('nb', GaussianNB()), ('svm', SVC(C=100)),
                             ('rfc',
                              RandomForestClassifier(n_estimators=50,
                                                     random_state=1))])

In [52]:
vc.score(x_test.toarray(),y_test)


0.9633883704235463

In [53]:
vc.score(x_train.toarray(),y_train)

1.0

In [73]:
# used SVM model to predict and methods like RandomSearchCV and GridSearchCV to find best suited hyperparameters
clf=SVC()
para = {'C':[10,10,0.1,0.01,1000],'kernel':['rbf','linear','poly']}

In [74]:
random=RandomizedSearchCV(clf,param_distributions=para,cv=5)

In [75]:
random.fit(x_train,y_train)

RandomizedSearchCV(cv=5, estimator=SVC(),
                   param_distributions={'C': [10, 10, 0.1, 0.01, 1000],
                                        'kernel': ['rbf', 'linear', 'poly']})

In [76]:
random.best_params_

{'kernel': 'linear', 'C': 10}

In [77]:
random.best_score_

0.9806157063862706

In [78]:
grid_search=GridSearchCV(clf,param_grid=para,cv=5)

In [79]:
grid_search.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [10, 10, 0.1, 0.01, 1000],
                         'kernel': ['rbf', 'linear', 'poly']})

In [80]:
grid_search.best_params_

{'C': 10, 'kernel': 'linear'}

In [81]:
grid_search.best_score_

0.9806157063862706