In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

LOADING DATASET - UCI - BREAST CANCER (WISCONSIN)

In [None]:
# Location of the UCI breast-cancer-wisconsin data file.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)

# Column labels are supplied here instead of being read from the file.
names = [
    'id',
    'clump_thickness',
    'uniform_cell_size',
    'uniform_cell_shape',
    'marginal_adhesion',
    'single_epi_size',
    'bare_nuclei',
    'bland_cromatin',
    'normal_nucleoli',
    'mitosis',
    'class',
]

# header=None makes explicit that every line of the file is treated as data.
df = pd.read_csv(url, header=None, names=names)

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

**Data Preprocessing**

In [None]:
# Exclude the 'id' column from the working frame.
# Re-binding with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Preview again to confirm the 'id' column is no longer present.
df.head()

**MISSING VALUES**

In [None]:
# Column labels remaining in the frame.
df.columns

In [None]:
# Per-column dtype and non-null count; a non-numeric dtype hints at placeholder values.
df.info()

In [None]:
# Number of samples per target label (class balance).
df['class'].value_counts()

In [None]:
# Frequency of each distinct value in 'bare_nuclei'; surfaces any non-numeric placeholder.
df['bare_nuclei'].value_counts()

In [None]:
# Show every row whose 'bare_nuclei' entry is the "?" placeholder.
df.loc[df['bare_nuclei'].eq("?")]

In [None]:
# Count the rows whose 'bare_nuclei' entry is the "?" placeholder.
# Summing the boolean mask gives that count directly; summing the filtered
# frame instead added up the unrelated feature columns and string-concatenated
# the "?" values, which is not a meaningful number.
(df['bare_nuclei'] == "?").sum()

In [None]:
# Turn every "?" placeholder into NaN so pandas treats it as missing.
df = df.replace(to_replace="?", value=np.nan)

In [None]:
# Spot-check a single cell (index label 23) -- presumably one of the rows that
# held "?"; verify against the listing of placeholder rows.
df.loc[23, 'bare_nuclei']

In [None]:
# Number of missing (NaN) values in each column.
df.isna().sum()

In [None]:
# Forward-fill: each missing value takes the value of the closest preceding row.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases. Re-binding keeps the cell re-runnable.
# NOTE: a missing value in the very first row would stay NaN after this step.
df = df.ffill()

In [None]:
# Re-count missing values per column after the fill; every count should now be
# zero unless the very first row was missing.
df.isna().sum()

In [None]:
# Cast 'bare_nuclei' to a 64-bit integer column; the other columns are untouched.
df = df.astype({'bare_nuclei': 'int64'})

In [None]:
# Confirm the dtypes after the cast ('bare_nuclei' should now report int64).
df.info()

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T

In [None]:
# Distribution of the target label.
# sns.distplot is deprecated (seaborn >= 0.11) and slated for removal;
# histplot(..., kde=True) is the documented replacement.
sns.histplot(df['class'], kde=True)
plt.show()

In [None]:
# One histogram per column, drawn on a large canvas with the darkgrid theme.
sns.set_style(style="darkgrid")
df.hist(figsize=(30, 30))
plt.show()

In [None]:
# Pairwise scatter plots of every column against every other column.
scatter_matrix(frame=df, figsize=(20, 20))
plt.show()

In [None]:
# Horizontal box plot of every column to compare spreads and spot outliers.
plt.figure(figsize=(10, 10))
# Pass the frame by keyword: depending on the seaborn version the first
# positional argument is bound to `x` rather than `data`.
sns.boxplot(data=df, orient='h')
plt.show()

In [None]:
# Pairwise relationships, with kernel-density estimates on the diagonal.
sns.pairplot(data=df, diag_kind="kde")

In [None]:
# Pairwise correlation between all columns, shown as a table.
df.corr()

In [None]:
# Correlation matrix; kept in `cor` because a later cell reads it.
cor = df.corr()

# Annotated heatmap of the matrix on a 15x15 inch canvas.
plt.figure(figsize=(15, 15))
sns.heatmap(data=cor, annot=True)
plt.title("Correlation b/w different values")
plt.show()

In [None]:
# Absolute correlation of every column with the target label.
cor_target = cor["class"].abs()

# Keep the columns whose absolute correlation is strictly positive.
# NOTE(review): with a threshold of 0 this retains practically every column,
# including 'class' itself -- confirm whether a higher cut-off was intended.
relevant_features = cor_target.loc[cor_target.gt(0)]
relevant_features

# Train/Test Split

In [None]:
# Feature matrix: every column except the target, as a NumPy array.
X = df.drop(columns=['class']).to_numpy()
# Target vector: the 'class' column.
Y = df['class'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=21,
)

Model Selection

In [None]:
# Metric reported by cross_val_score for every candidate.
scoring = 'accuracy'

# Candidate classifiers as (label, estimator) pairs.
# The decision tree is seeded so its cross-validation scores are reproducible
# between runs; without random_state, ties between equally good splits are
# broken randomly.
models = [
    ("CART", DecisionTreeClassifier(random_state=21)),
    ("SVM", SVC()),
    ("NB", GaussianNB()),
    ("KNN", KNeighborsClassifier()),
]

# Per-model fold scores and their labels, consumed by the comparison plot.
# NOTE: `names` re-binds the list of column names used when loading the data.
results = []
names = []

# The splitter does not depend on the model, so it is built once and every
# candidate is scored on the same 10 folds of the training data.
kfold = KFold(n_splits=10)

for name, model in models:
    cvresult = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cvresult)
    names.append(name)
    msg = "For %s Model:Mean accuracy is %f(Std accuracy is %f)" % (
        name,
        cvresult.mean(),
        cvresult.std(),
    )
    print(msg)

# Validation on the test set

In [None]:
# Fit every candidate on the training split and report its test-set metrics.
for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print("Model name:", name)
    print("Accuracy Score:", accuracy_score(y_test, predictions))
    # The report is a pre-aligned multi-line table; printing it on its own
    # lines keeps the header row lined up with the columns below it.
    print("Classification Report:")
    print(classification_report(y_test, predictions))

In [None]:
# Box plot of the cross-validation scores, one box per model.
fig, ax = plt.subplots(figsize=(10, 10))
fig.suptitle("Performance Comparison of the Models")
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# Support Vector Machine - Best performance


In [None]:
# Train a default-parameter SVC on the training split (fit returns the estimator).
clf = SVC().fit(X_train, y_train)

# Mean accuracy on the held-out test split.
accuracy = clf.score(X_test, y_test)
print("Accuracy of SVM is: ", accuracy)

In [None]:
# Predicted labels for every row of the test split; the bare name displays them.
predict = clf.predict(X_test)
predict

In [None]:
# Recap of column order and dtypes, useful when hand-writing example rows below.
df.info()

In [None]:
# Last rows of the frame, as a reference for realistic feature values.
df.tail()

In [None]:
# One hand-written sample; values follow the feature-column order of X.
example_measures = [[
    4,  # clump_thickness
    2,  # uniform_cell_size
    1,  # uniform_cell_shape
    1,  # marginal_adhesion
    1,  # single_epi_size
    2,  # bare_nuclei
    3,  # bland_cromatin
    2,  # normal_nucleoli
    1,  # mitosis
]]

prediction = clf.predict(example_measures)
print("The Class is (2 for Benign and 4 for Malign): ", prediction)

In [None]:
# A second hand-written sample; values follow the feature-column order of X.
example_m2 = [[
    5,   # clump_thickness
    10,  # uniform_cell_size
    10,  # uniform_cell_shape
    3,   # marginal_adhesion
    7,   # single_epi_size
    3,   # bare_nuclei
    8,   # bland_cromatin
    10,  # normal_nucleoli
    2,   # mitosis
]]

prediction = clf.predict(example_m2)
print("The Class is: ", prediction)

# Saving the trained model to a pickle file

In [None]:
import pickle

# Persist the trained SVM. The context manager guarantees the file is flushed
# and closed before it is read back (the bare open() calls leaked the handles).
with open('modelsvm.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

# Reload the model and confirm it still predicts.
# NOTE: pickle.load can execute arbitrary code -- only load files you created
# or otherwise trust.
with open('modelsvm.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
print(model.predict([[4,2,1,1,1,2,3,2,1]]))