In [None]:
# Importing the libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [None]:
# Load the capstone data set from its CSV file into a DataFrame
csv_path = 'Fraud Capstone.csv'
data = pd.read_csv(csv_path)

In [None]:
# Verifying the data set was read: display the DataFrame
# (column D2 holds the ham/spam label, column D1 holds the message text)
data

Unnamed: 0,D2,D1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Check for missing values so they cannot silently skew the results.
# Count the nulls in each column: the full boolean mask is truncated on display,
# so it cannot show whether any row is actually missing a value.
# This only reports the counts; it does not remove any rows.
data.isnull().sum()

Unnamed: 0,D2,D1
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
5567,False,False
5568,False,False
5569,False,False
5570,False,False


In [None]:
# Features (X) are the message texts in column D1;
# targets (y) are the ham/spam labels in column D2
X, y = data['D1'].values, data['D2'].values

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Convert the raw message text into numerical features: each message becomes a
# sparse row of token counts, which the classifier needs as input.
# The vocabulary is learned from the training messages only and then reused,
# unchanged, to encode the test messages.
# NOTE: X_train / X_test are rebound from raw text to count matrices here, so
# re-running this cell on its own would feed count matrices back into the
# vectorizer; re-run the train/test split cell first.
cv = CountVectorizer()
X_train, X_test = cv.fit_transform(X_train), cv.transform(X_test)

In [None]:
# Verifying X_train is correct: the sparse matrix prints one entry per line as
# "(row, column)  count", i.e. message index, vocabulary index, token count
print(X_train)

  (0, 4005)	1
  (0, 4540)	3
  (0, 6108)	1
  (0, 2337)	1
  (0, 861)	1
  (1, 4487)	1
  (1, 6111)	1
  (1, 1509)	1
  (1, 6526)	1
  (1, 6342)	1
  (1, 7066)	1
  (1, 6378)	1
  (2, 6342)	3
  (2, 6378)	1
  (2, 2221)	1
  (2, 3955)	1
  (2, 3748)	3
  (2, 5955)	1
  (2, 835)	1
  (2, 2616)	1
  (2, 6312)	1
  (2, 3686)	1
  (2, 3447)	1
  (2, 4018)	1
  (2, 4407)	1
  :	:
  (3897, 6387)	1
  (3897, 4562)	1
  (3897, 4549)	1
  (3897, 1407)	1
  (3897, 6735)	1
  (3898, 7066)	1
  (3898, 3760)	1
  (3898, 3080)	1
  (3898, 5302)	1
  (3898, 3826)	1
  (3898, 2122)	1
  (3898, 7039)	1
  (3898, 3887)	1
  (3898, 5886)	1
  (3899, 6342)	2
  (3899, 7066)	3
  (3899, 3236)	1
  (3899, 758)	1
  (3899, 895)	1
  (3899, 6176)	1
  (3899, 6119)	1
  (3899, 1993)	1
  (3899, 3101)	1
  (3899, 5979)	1
  (3899, 6948)	2


In [None]:
# Verifying X_test is correct: the sparse matrix prints one entry per line as
# "(row, column)  count", i.e. message index, vocabulary index, token count
print(X_test)

  (0, 784)	1
  (0, 1752)	1
  (0, 3552)	1
  (0, 3683)	1
  (0, 4780)	1
  (0, 5590)	1
  (0, 6342)	1
  (0, 6375)	1
  (0, 6573)	1
  (1, 2544)	1
  (1, 4520)	1
  (1, 6231)	1
  (1, 6761)	1
  (2, 829)	1
  (2, 860)	1
  (2, 915)	1
  (2, 939)	1
  (2, 1821)	1
  (2, 1858)	1
  (2, 2188)	2
  (2, 2488)	1
  (2, 2584)	1
  (2, 2692)	1
  (2, 3025)	1
  (2, 3133)	1
  :	:
  (1670, 3835)	1
  (1670, 3855)	1
  (1670, 3948)	1
  (1670, 4407)	1
  (1670, 4465)	1
  (1670, 4507)	1
  (1670, 5190)	1
  (1670, 5538)	1
  (1670, 5739)	1
  (1670, 6044)	1
  (1670, 6219)	1
  (1670, 6244)	1
  (1670, 6311)	1
  (1670, 6342)	2
  (1670, 6366)	1
  (1670, 6461)	1
  (1670, 6573)	1
  (1670, 6588)	1
  (1670, 6649)	1
  (1670, 7066)	2
  (1671, 1028)	1
  (1671, 2616)	1
  (1671, 2766)	1
  (1671, 4573)	1
  (1671, 4803)	2


In [None]:
# Baseline model: fit an RBF-kernel SVM with otherwise-default settings and
# report its accuracy on the held-out test set (goal: 95% or higher)
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=10).fit(X_train, y_train)
baseline_accuracy = classifier.score(X_test, y_test)
print(baseline_accuracy)

0.9754784688995215


In [None]:
 # Hyperparameter search for the SVM, run after the initial baseline test.
 # Candidates are scored by cross-validation on the training set only.
 # The grid is split per kernel because gamma only affects the RBF kernel:
 # crossing it with 'linear' in a single dict would cross-validate every
 # linear candidate twice with identical results.
 tuned_parameters = [
     {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
 ]

 model = GridSearchCV(svm.SVC(), tuned_parameters)

 # fit() runs the search and then refits the best candidate on all of X_train
 model.fit(X_train, y_train)

In [None]:
# Accuracy of the best model found by the hyperparameter search,
# measured on the held-out test set
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)

0.9814593301435407
