# Breast Cancer Prediction

## Dataset Attributes

**Features**:
- Clump Thickness
- Uniformity of Cell Shape
- Marginal Adhesion
- Single Epithelial Cell Size
- Bare Nuclei
- Bland Chromatin
- Normal Nucleoli
- Mitoses

**Target**
- Class

-------------------------------------

## Importing required Libraries

In [3]:
import pandas as pd 
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, accuracy_score

## Data Processing

In [6]:
# importing dataset
dataset = pd.read_csv('Breast_Cancer_Classification_Dataset.csv')

# splitting dataset into features and target
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# splitting features and target into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)

# standardizing the dataset
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)

# Logistic Regression

In [7]:
# fitting model on data
logisticRegression = LogisticRegression()
logisticRegression.fit(X_train, y_train)

# predicting result of test set
y_pred = logisticRegression.predict(X_test)

# metrics
logisticRegression_score = accuracy_score(y_test, y_pred)
print(logisticRegression_score)

0.6666666666666666


# Decision Tree Classifier

In [8]:
# fitting model on data
decision_tree = DecisionTreeClassifier(criterion = 'entropy', splitter = 'random', max_depth = 50, random_state = 0)
decision_tree.fit(X_train, y_train)

# predicting result of test set
y_pred = decision_tree.predict(X_test)

# metrics
DecisionTree_score = accuracy_score(y_test, y_pred)
print(DecisionTree_score)

0.9855072463768116


# Random Forest Classifier

In [9]:
# fitting model on data
random_forest = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 0)
random_forest.fit(X_train, y_train)

# predicting result of test set
y_pred = random_forest.predict(X_test)

# metrics
RandomForest_score = accuracy_score(y_test, y_pred)
print(RandomForest_score)

0.9855072463768116


# Naive Bayes Classifier

In [10]:
# fitting model on data
naiveBayes = GaussianNB()
naiveBayes.fit(X_train, y_train)

# predicting result of test set
y_pred = naiveBayes.predict(X_test)

# metrics
NaiveBayes_score = accuracy_score(y_test, y_pred)
print(NaiveBayes_score)

0.9130434782608695


# Kernel SVM

In [11]:
# fitting model on data
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

# predicting result of test set
y_pred = classifier.predict(X_test)

# metrics
kernelSVM_score = accuracy_score(y_test, y_pred)
print(kernelSVM_score)

0.6666666666666666


# Summary

In [12]:
d = {'Algorithms': ['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'NaiveBayes', 'KernelSVM'], 
     'Accuracy': [logisticRegression_score, DecisionTree_score, RandomForest_score, NaiveBayes_score, kernelSVM_score]}

print(pd.DataFrame(data=d))

               Algorithms  Accuracy
0      LogisticRegression  0.666667
1  DecisionTreeClassifier  0.985507
2  RandomForestClassifier  0.985507
3              NaiveBayes  0.913043
4               KernelSVM  0.666667
