# Comparisons between different classification algorithms

In [86]:
# Import libraries

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 

%matplotlib inline

In [87]:
# Import dataset
f_path = './dataset/SMSSpamCollection.csv'
dataset = pd.read_csv(f_path,sep='\t',names=["label","message"])

In [None]:
dataset.head(10)

In [None]:
# A graph illustrating the ratio between ham and spam
plt.rcParams["figure.figsize"] = [8,10] 
dataset.label.value_counts().plot(kind='pie', autopct='%1.0f%%')

In [108]:
# Creates a list that contains a number of words in ham messages and their count of occurrence in the dataset
dataset_ham = dataset[dataset['label'] == "ham"]
dataset_ham_count = dataset_ham['message'].str.split().str.len()
dataset_ham_count.index = dataset_ham_count.index.astype(str) + ' words:'
dataset_ham_count.sort_index(inplace=True)

In [107]:
# Creates a list that contains a number of words in spam messages and their count of occurrence in the dataset
dataset_spam = dataset[dataset['label'] == "spam"]
dataset_spam_count = dataset_spam['message'].str.split().str.len()
dataset_spam_count.index = dataset_spam_count.index.astype(str) + ' words:'
dataset_spam_count.sort_index(inplace=True)

In [None]:
bins = np.linspace(0, 50, 10)

plt.hist([dataset_ham_count, dataset_spam_count], bins, label=['ham', 'spam'])
plt.legend(loc='upper right')
plt.show()

# The output shows that most of the ham messages contain 0 to 10 words 
# while the majority of spam messages are longer and contain between 20 to 30 words

## Data Preprocessing

In [88]:
X = dataset["message"]  
 
y = dataset["label"]

In [89]:
# removing numbers and special characters

def text_preprocess(sen): 

   sen = re.sub('[^a-zA-Z]', ' ', sen)
   sen = re.sub(r"\s+[a-zA-Z]\s+", ' ', sen)
   sen = re.sub(r'\s+', ' ', sen)

   return sen

In [90]:
# Convert dataframe to list after preprocessing
X_messages = [] 
messages = list(X) 
for mes in messages: 
    X_messages.append(text_preprocess(mes))

In [91]:
# Converting Text to Numbers
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer 

tfidf_vec = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english')) 
X = tfidf_vec.fit_transform(X_messages)


## Dividing Data into Training and Test Sets

In [95]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Comparison of 7 supervised algorithms

### Naive Bayes

In [96]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB(alpha = 0.5)
mnb.fit(X_train,y_train)

y_mnb = mnb.predict(X_test)

In [97]:
print('Naive Bayes Accuracy: ', accuracy_score( y_mnb , y_test))
print('Naive Bayes confusion_matrix: ', confusion_matrix(y_mnb, y_test))

Naive Bayes Accuracy:  0.9834888729361091
Naive Bayes confusion_matrix:  [[1203   19]
 [   4  167]]


### Random Forest

In [93]:
from sklearn.ensemble import RandomForestClassifier 

rf_clf = RandomForestClassifier(n_estimators=250, random_state=0) 
rf_clf.fit(X_train, y_train) 

y_pred = rf_clf.predict(X_test)

In [None]:
print('RandomForest confusion_matrix: ',confusion_matrix(y_test,y_pred)) 
print('RandomForest Accuracy: ',accuracy_score(y_test,y_pred))

### SVM Classification

In [None]:
from sklearn.svm import SVC

svc = SVC(kernel='sigmoid', gamma=1.0)
svc.fit(X_train,y_train)
y_svc = svc.predict(X_test)

In [None]:
print('SVM Accuracy: ', accuracy_score(y_svc , y_test))
print('SVM confusion_matrix: ', confusion_matrix(y_svc, y_test))

### KNeighbors Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knc = KNeighborsClassifier(n_neighbors=100)
knc.fit(X_train,y_train)

y_knc = knc.predict(X_test)

In [None]:
print('KNeighbors Accuracy_score: ',accuracy_score(y_test,y_knc))
print('KNeighbors confusion_matrix: ', confusion_matrix(y_test, y_knc)) 

### Decision Tree Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(min_samples_split=7, random_state=252)
dtc.fit(X_train,y_train)
y_dtc = dtc.predict(X_test)  

In [None]:
print('Decision Tree Accuracy: ',accuracy_score(y_test,y_dtc))
print('Decision Tree confusion_matrix: ', confusion_matrix(y_dtc, y_test)) 

### Extra Tree Classification

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(n_estimators=37, random_state=252)
etc.fit(X_train,y_train)
y_etc = etc.predict(X_test)

In [None]:
print('Extra Tree Accuracy_score: ',accuracy_score(y_test,y_etc))
print('Extra Tree confusion_matrix: ', confusion_matrix(y_etc, y_test))

### Bagging Classification

In [None]:
from sklearn.ensemble import BaggingClassifier

bc = BaggingClassifier(n_estimators=9, random_state=252)
bc.fit(X_train,y_train)
y_bc = bc.predict(X_test)

In [None]:
print('Bagging Accuracy_score: ',accuracy_score(y_test,y_bc))
print('Bagging confusion_matrix: ', confusion_matrix(y_bc, y_test)) 

### Adaptive Boosting Classification

In [None]:
from sklearn.ensemble import AdaBoostClassifier

abc = AdaBoostClassifier(n_estimators=37, random_state=252)
abc.fit(X_train,y_train)
y_abc = abc.predict(X_test)

In [None]:
print('AdaBoost Accuracy_score: ',accuracy_score(y_test,y_abc))
print('AdaBoost confusion_matrix: ', confusion_matrix(y_abc, y_test)) 

### Example for new sentence prediction

In [None]:
sentence = ['love this place have been coming here since was kid The food is always good and the service is great It great place to go for quick bite to eat or late night snack You can go wrong with any of the items on the menu They have wide variety of appetizers entrees and desserts']
sen = tfidf_vec.transform(sentence).toarray() # Fit the Data

In [None]:
prediction_Naive_Bayes = mnb.predict(sen) # Naive Bayes
print("Naive_Bayes: ", prediction_Naive_Bayes)
prediction_RandomForest = rf_clf.predict(sen) # Random Forest
print("RandomForest: ", prediction_RandomForest)
prediction_SVM = svc.predict(sen) # SVM
print("SVM: ", prediction_SVM)
prediction_KNeighbors = knc.predict(sen) # KNeighbors
print("KNeighbors: ", prediction_KNeighbors)
prediction_Decision_Tree = dtc.predict(sen) # Decision_Tree
print("Decision Tree: ", prediction_Decision_Tree)
prediction_Bagging = bc.predict(sen) # Bagging
print("Bagging: ", prediction_Bagging)
prediction_AdaBoost = abc.predict(sen) # AdaBoost
print("AdaBoost: ", prediction_AdaBoost)


### Save the models

In [None]:
import pickle

with open('NaiveBayes_model','wb') as f:
    pickle.dump(mnb,f)

with open('RandomForest_model','wb') as f:
    pickle.dump(rf_clf,f)

with open('SVM_model','wb') as f:
    pickle.dump(svc,f)
    
with open('KNeighbors_model','wb') as f:
    pickle.dump(knc,f)

with open('DecisionTree_model','wb') as f:
    pickle.dump(dtc,f)

with open('Bagging_model','wb') as f:
    pickle.dump(bc,f)

with open('AdaBoost_model','wb') as f:
    pickle.dump(abc,f)

# Comparison with the larged dataset

In [98]:
import pandas as pd
f_path1 = './dataset/SMSSpamCollection.csv'
dataset = pd.read_csv(f_path1, sep='\t',names=["label","message"])

f_path2 = './dataset/SyntheticMessages_WithLabel.csv'
synthetic_dataset = pd.read_csv(f_path2)

In [99]:
frames = [dataset, synthetic_dataset]

large_dataset = pd.concat(frames)

print("dataset size: ",len(dataset))
print("synthetic dataset size: ",len(synthetic_dataset))
print("large dataset size: ",len(large_dataset))

dataset size:  5572
synthetic dataset size:  923
large dataset size:  6495


In [100]:
X = large_dataset["message"]  
y = large_dataset["label"]

In [101]:
X = large_dataset["message"].apply(str)

X_messages = [] 
messages = list(X) 
for mes in messages: 
    X_messages.append(text_preprocess(mes))

# Converting Text to Numbers
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer 

tfidf_vec = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english')) 
X = tfidf_vec.fit_transform(X_messages).toarray()

### Dividing Data into Training and Test Sets - Large dataset

In [102]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [103]:
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB(alpha = 0.5)
mnb.fit(X_train,y_train)

y_mnb = mnb.predict(X_test)

In [104]:
# Results

print('Naive Bayes Accuracy: ', accuracy_score( y_mnb , y_test))
print('Naive Bayes confusion_matrix: ', confusion_matrix(y_mnb, y_test))

Naive Bayes Accuracy:  0.9852216748768473
Naive Bayes confusion_matrix:  [[1434   22]
 [   2  166]]


In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier 

rf_clf = RandomForestClassifier(n_estimators=250, random_state=0) 
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

In [None]:
# Results

print('RandomForest confusion_matrix: ',confusion_matrix(y_test,y_pred)) 
print('RandomForest Accuracy: ',accuracy_score(y_test,y_pred))