Header: importing libraries

In [1]:
import sklearn
from sklearn import svm
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Reading Data from Files

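Run only one of the following two cells, depending on which set of input files you want to use (the original files or the processed linear programming files).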

Reading from original files

In [10]:
# Load the original training set. header=None: the first line of the file is
# treated as data, so the columns are simply numbered 0..n-1.
data = pd.read_csv("TrainingData.txt", header=None)

# Everything except the last column goes into x; the last column goes into y.
# (x and y are what the later cells hand to the classifier's fit().)
x, y = data.iloc[:, :-1], data.iloc[:, -1]

# Load the original testing set unchanged (again without a header row).
datatesting = pd.read_csv("TestingData.txt", header=None)

Reading from processed linear programming files

In [5]:
# Load the processed training set. header=None: the first line of the file is
# treated as data, so the columns are simply numbered 0..n-1.
data = pd.read_csv("finaloutput.txt", header=None)

# Column 0 is skipped; the columns in between go into x and the last column
# goes into y.
# NOTE(review): column 0 is presumably an index/id written by the
# preprocessing step -- confirm against the file format.
x = data.iloc[:, 1:-1]
y = data.iloc[:, -1]

# Load the processed testing set and skip its column 0 as well, keeping every
# remaining column.
readvaluesout = pd.read_csv("finaltestingoutput.txt", header=None)
datatesting = readvaluesout.iloc[:, 1:]

# Split the data and format it so it is ready for the SVM to learn

In [11]:
# Fixed seed so the train/test partition -- and therefore every score computed
# from it in the cells below -- is the same on every Restart & Run All.
SPLIT_SEED = 42

# Cut x and y into a first and a second half by row position.
# Plain iloc slicing is used instead of np.vsplit / np.split: on pandas
# objects those go through DataFrame.swapaxes / Series.swapaxes (deprecated
# since pandas 2.1) and they raise ValueError when the row count is odd.
# With an odd row count the extra row ends up in the second half.
# NOTE(review): splitting by position presumably relies on the input listing
# one class in the first half and the other class in the second half -- confirm.
half = len(x) // 2
xf, xt = x.iloc[:half], x.iloc[half:]
yf, yt = y.iloc[:half], y.iloc[half:]

# Split each half 50/50 on its own, so each half contributes the same share
# of rows to the training set and to the test set.
Xf_train, Xf_test, Yf_train, Yf_test = train_test_split(
    xf, yf, test_size=0.5, random_state=SPLIT_SEED)
Xt_train, Xt_test, Yt_train, Yt_test = train_test_split(
    xt, yt, test_size=0.5, random_state=SPLIT_SEED)

# Stack the per-half pieces back together (first half first, as before).
X_train = pd.concat([Xf_train, Xt_train])
X_test = pd.concat([Xf_test, Xt_test])
Y_train = pd.concat([Yf_train, Yt_train])
Y_test = pd.concat([Yf_test, Yt_test])

# SVM Training 


This cell trains the SVM on the training split prepared above, running a grid search over `C` and `gamma`, and prints the best hyperparameters found for this data.

In [12]:
# Candidate hyperparameters: RBF kernel only, with five values each for C and
# gamma, one per power of ten (5 x 5 = 25 combinations).
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# Exhaustive cross-validated search over the grid, starting from a default SVC.
# refit=True retrains the best combination on all of X_train once the search
# is done; verbose=3 prints one line per fold and candidate.
clf = svm.SVC()
grid = GridSearchCV(estimator=clf, param_grid=param_grid, refit=True, verbose=3)
grid.fit(X_train, Y_train)

# Report the winning parameter set, then the estimator built from it.
print(grid.best_params_, grid.best_estimator_, sep="\n")

[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.946, total=   1.4s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.907, total=   1.4s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.864, total=   1.4s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.875, total=   1.4s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.921, total=   1.4s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] ........... C=10, gamma=1, kernel=rbf, score=0.942, total=   0.8s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] ........... C=10, gamma=1, kernel=rbf, score=0.954, total=   0.8s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] .

[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:  1.6min finished


{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


This cell reports the accuracy of the model on the held-out test split. Make sure to copy the best parameters printed above into the `svm.SVC(...)` call below before running it.

In [14]:
# Hyperparameters copied by hand from the grid-search output above.
# C, gamma and kernel are the searched values; the remaining arguments are
# spelled out exactly as they appear in the printed best-estimator repr.
clf = svm.SVC(
    kernel='rbf',
    C=1000,
    gamma=0.01,
    degree=3,
    coef0=0.0,
    tol=0.001,
    shrinking=True,
    probability=False,
    cache_size=200,
    class_weight=None,
    max_iter=-1,
    decision_function_shape='ovr',
    break_ties=False,
    random_state=None,
    verbose=False,
)

# Fit on the training split only, then predict the held-out split.
Y_predict = clf.fit(X_train, Y_train).predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(Y_test, Y_predict)
print(accuracy)

0.9598


# So far, the best results came from using

C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False --------------- BEST RESULT SO FAR

C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False --------------- BEST RESULT SO FAR

# Finally, to predict which of the testing samples are abnormal and which are not, run the cell below

Make sure to replace the hyperparameter values inside the brackets of `svm.SVC(...)` with the best ones found above to get the most accurate results.

In [15]:
# Final model: same hand-copied hyperparameters as the evaluation cell.
# C, gamma and kernel are the searched values; the remaining arguments are
# spelled out exactly as they appear in the printed best-estimator repr.
clf = svm.SVC(
    kernel='rbf',
    C=1000,
    gamma=0.01,
    degree=3,
    coef0=0.0,
    tol=0.001,
    shrinking=True,
    probability=False,
    cache_size=200,
    class_weight=None,
    max_iter=-1,
    decision_function_shape='ovr',
    break_ties=False,
    random_state=None,
    verbose=False,
)

# Train on all of the labelled data (x, y) rather than only the training
# split, then label every row of the testing file.
y_final = clf.fit(x, y).predict(datatesting)
print(y_final)

[1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 1 1 1 0 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1 0
 1 1 0 0 1 0 0 1 0 1 1 1 0 0 0 1 0 1 0 1 1 1 1 0 0 0 0 1 0 0 1 1 1 1 0 0 1
 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0]


The most consistent output values were
1 0 0 0 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 0 0 1 1 0 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 0 1 0 0