# SPAM CLASSIFIER
In this lab, we build a model that classifies an SMS message as either spam or ham (non-spam).

## Download the data

In [None]:
# Fetch and unpack the UCI SMS Spam Collection with the standard library only,
# so the cell does not depend on wget / apt-get / unzip being installed
# (or on having root access to install them).
import urllib.request
import zipfile
from pathlib import Path

DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
archive_path = Path("smsspamcollection.zip")

# Skip the download on re-runs. Download to a temporary name first so an
# interrupted transfer never leaves a truncated archive behind.
if not archive_path.exists():
    partial_path = Path("smsspamcollection.zip.part")
    urllib.request.urlretrieve(DATA_URL, partial_path)
    partial_path.replace(archive_path)

# Extract into the working directory, overwriting existing files (like `unzip -o`).
with zipfile.ZipFile(archive_path) as archive:
    archive.extractall()

In [None]:
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd 

col_names = ["class_label", "message"]

# The raw file is tab-separated and has NO header row. Without header=None
# pandas would consume the first message as column names and drop it.
read_file = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=col_names)

# Persist a comma-separated copy (label first, then message). index=False keeps
# the pandas row index out of the file that is later uploaded to S3.
read_file.to_csv('SMSSpamCollection.csv', index=False, header=False)

# Reload from the CSV copy so the rest of the notebook works from the same
# file that gets uploaded.
df = pd.read_csv('SMSSpamCollection.csv', header=None, names=col_names)

df

In [None]:
# Explore the dataset: how many messages fall into each class?
class_counts = df['class_label'].value_counts()
class_counts

In [None]:
# Encode the class labels

# Convert the class labels from string to numeric form: spam -> 1, anything else -> 0.
# Accepting an already-encoded 1 as well makes the cell idempotent: re-running it
# leaves the labels unchanged instead of collapsing every row to 0.
df['class_label'] = df['class_label'].apply(
    lambda label: 1 if (label == 'spam' or label == 1) else 0
)



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for testing; the fixed seed keeps the split reproducible.
messages, targets = df['message'], df['class_label']
x_train, x_test, y_train, y_test = train_test_split(
    messages,
    targets,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Report the shape of each split (a one-element tuple, since these are Series).
print(f"rows in test set: {x_test.shape}")
print(f"rows in train set: {x_train.shape}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Not consumed by the vectorizer; retained only so the name `lst` stays defined.
lst = x_train.tolist()

vectorizer = TfidfVectorizer(
    # `input` describes HOW documents are supplied ('content', 'file' or
    # 'filename'); it is not the data itself. The raw strings are passed to
    # fit_transform/transform below, so 'content' is the correct value.
    input='content',
    lowercase=True,       # convert to lowercase before tokenizing
    stop_words='english'  # remove stop words
)

# Learn vocabulary and IDF weights from the training messages only, then reuse
# them for the test messages so no test information leaks into the features.
features_train_transformed = vectorizer.fit_transform(x_train)  # TF-IDF matrix for x_train
features_test_transformed  = vectorizer.transform(x_test)       # TF-IDF matrix for x_test

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes model on the TF-IDF training features.
classifier = MultinomialNB()
classifier.fit(X=features_train_transformed, y=y_train)

In [None]:
# Mean accuracy on the held-out test features, shown as a percentage.
test_accuracy = classifier.score(features_test_transformed, y_test)
print(f"classifier accuracy {test_accuracy * 100:.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)

# Predict on the held-out features and compare against the true labels.
labels = classifier.predict(features_test_transformed)
predicted = labels
actual = y_test.tolist()

results = confusion_matrix(actual, predicted)
print('Confusion Matrix :')
print(results)

print('Accuracy Score :', accuracy_score(actual, predicted))

print('Report : ')
print(classification_report(actual, predicted))

# F1 for the positive class (label 1 = spam).
score_2 = f1_score(actual, predicted, average='binary')
print('F-Measure: %.3f' % score_2)

## Using SageMaker XGBoost

In [None]:
import os
import boto3
import re
import json
import sagemaker
from sagemaker import get_execution_role

# Region configured for the default boto3 session.
boto_session = boto3.Session()
region = boto_session.region_name

# IAM role the notebook runs as; handed to the training job later on.
role = get_execution_role()

# Default S3 bucket of the SageMaker session.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

In [None]:
# customize to your bucket where you have stored the data
prefix = "sagemaker/spam-claasifier"
bucketuri = f"s3://{bucket}/{prefix}"
print(bucketuri)

In [None]:
import boto3

s3 = boto3.resource('s3')

# Upload the full CSV under the shared `prefix` rather than a hard-coded copy
# of it, so customizing `prefix` moves this object together with the rest of
# the notebook's S3 artifacts.
s3.Bucket(bucket).upload_file("SMSSpamCollection.csv", f"{prefix}/SMSSpamCollection.csv")

In [None]:
import numpy as np  

# Shuffle once with a fixed seed, then slice positionally into 70% train,
# 20% validation and 10% test. Plain iloc slicing yields the same partitions as
# np.split(frame, [a, b]) without routing through DataFrame.swapaxes, which
# pandas has deprecated.
shuffled = df.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(df))
validation_end = int(0.9 * len(df))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

In [None]:
# Write the train and validation splits as header-less, index-less CSV files.
# NOTE(review): these frames still carry the raw `message` text column; the
# SageMaker built-in XGBoost CSV input presumably expects numeric features only -
# confirm the text is vectorized before training.
for split_frame, csv_name in ((train_data, 'train.csv'), (validation_data, 'validation.csv')):
    split_frame.to_csv(csv_name, header=False, index=False)

In [None]:
# Upload each split to s3://<bucket>/<prefix>/<channel>/<file>.
# S3 keys always use '/' as the delimiter, so the key is joined explicitly
# instead of with os.path.join, which follows the local OS path separator.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for channel, local_file in (('train', 'train.csv'), ('validation', 'validation.csv')):
    s3_bucket.Object(f"{prefix}/{channel}/{local_file}").upload_file(local_file)

In [None]:
import sagemaker

# Region as seen by the SageMaker session.
region = sagemaker.Session().boto_region_name
print(f"AWS Region: {region}")

# Execution role ARN used for the training job.
role = sagemaker.get_execution_role()
print(f"RoleArn: {role}")

In [None]:
! pip install -qU sagemaker

In [None]:
pip install --upgrade pip

In [None]:
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

# Where the training job writes its model artifacts.
s3_output_location='s3://{}/{}/{}'.format(bucket, prefix, 'xgboost_model')
print(s3_output_location)

# Image URI of the built-in XGBoost container (version 1.2-1) for this region.
container=sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

xgb_model=sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    # SDK v2 name for the training volume size in GB; `train_volume_size` is
    # the deprecated v1 spelling.
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
    # Debugger rule that generates the XGBoost training report.
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())]
)

In [None]:
# CSV training inputs addressed by S3 prefix ("train/train", "validation/validation").
train_prefix_uri = f"s3://{bucket}/{prefix}/train/train"
validation_prefix_uri = f"s3://{bucket}/{prefix}/validation/validation"

s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=train_prefix_uri,
    content_type='csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data=validation_prefix_uri,
    content_type='csv',
)

In [None]:
# Hyperparameters for a binary classifier evaluated by AUC.
# NOTE(review): rate_drop looks like a DART-booster option and
# tweedie_variance_power a reg:tweedie option; presumably neither has any
# effect with objective="binary:logistic" - confirm and drop if so.
xgb_hyperparameters = {
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "num_round": 100,
    "rate_drop": 0.3,
    "tweedie_variance_power": 1.4,
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)



In [None]:
from sagemaker.session import TrainingInput

# Point each channel at the exact CSV object uploaded for it.
train_csv_uri = f"s3://{bucket}/{prefix}/train/train.csv"
validation_csv_uri = f"s3://{bucket}/{prefix}/validation/validation.csv"

train_input = TrainingInput(train_csv_uri, content_type="csv")
validation_input = TrainingInput(validation_csv_uri, content_type="csv")

In [None]:
# Launch the training job with both channels and block until it finishes.
channels = {"train": train_input, "validation": validation_input}
xgb_model.fit(inputs=channels, wait=True)