<a href="https://colab.research.google.com/github/mominaamer/Machine-Learning-Projects/blob/main/breast_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor, NeighborhoodComponentsAnalysis
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier

In [None]:
# Load the CSV from the Kaggle input directory (path is relative to the working directory).
cancer = pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv")
# Work on a copy so the frame held in `cancer` stays as loaded.
df = cancer.copy()
# Preview the first rows.
df.head()

In [None]:
# Remove the "id" and "Unnamed: 32" columns before analysis.
# `df` is rebuilt from the untouched `cancer` frame instead of being mutated in
# place, so re-running this cell gives the same result rather than a KeyError.
df = cancer.drop(columns=["id", "Unnamed: 32"])

# Check for Missing Values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

# EXAMINE THE TARGET VARIABLE AND APPLY LABEL ENCODING

In [None]:
# Class balance of the raw target labels.
# The column is named by keyword: in current seaborn the first positional
# parameter of countplot is `data`, so a positionally passed Series is not
# treated as the `x` variable.
sns.countplot(x="diagnosis", data=df)

In [None]:
# Encode the string labels as integers in a new "Diagnosis" column, then
# retire the original "diagnosis" column.
le = LabelEncoder()
encoded_target = le.fit_transform(df["diagnosis"])
df["Diagnosis"] = encoded_target
df.drop(columns="diagnosis", inplace=True)
df.head()

# EDA

In [None]:
# Pairwise correlation of every column (features plus the encoded target),
# drawn as an annotated heatmap on an explicitly created axes.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 15))
ax.set_title("Correlation Between Features")
sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=ax)

In [None]:
# Keep the columns whose absolute correlation with the target exceeds the threshold.
# "Diagnosis" correlates perfectly with itself, so it is always part of the selection.
threshold = 0.7
# Named mask instead of `filter`, which would shadow the builtin for the rest of the session.
strong_corr_mask = corr_matrix["Diagnosis"].abs() > threshold
corr_features = corr_matrix.columns[strong_corr_mask].tolist()
plt.title("CORRELATION BETWEEN FEATURES (CORR > 0.7)")
sns.heatmap(df[corr_features].corr(), annot=True, fmt=".2f")

In [None]:
# Pairwise scatter/density grid of the selected columns, coloured by the encoded target.
sns.pairplot(data=df[corr_features], hue="Diagnosis")

In [None]:
# Histogram of each selected column; the trailing semicolon hides the axes repr.
df.hist(column=corr_features, figsize=(10, 10));

In [None]:
def OutliersBox(df, nameOfFeature):
    """Show four Plotly box plots of one column, each with a different point style.

    The same column is drawn with all points, whiskers only, suspected
    outliers highlighted, and whiskers plus outliers.

    Parameters
    ----------
    df : mapping or DataFrame
        Object indexable by ``nameOfFeature``; the resulting values are plotted.
    nameOfFeature : str
        Column to plot; also used in the figure title.

    Returns
    -------
    None
        The figure is rendered through ``py.iplot``.
    """
    values = df[nameOfFeature]
    box_colour = 'rgb(8,81,156)'
    outlier_colour = 'rgba(219, 64, 82, 0.6)'

    traces = [
        go.Box(y=values, name="All Points", jitter=0.3, pointpos=-1.8,
               boxpoints="all"),
        go.Box(y=values, name="Only Whiskers", boxpoints=False),
        go.Box(y=values, name="Suspected Outliers",
               boxpoints="suspectedoutliers",
               marker=dict(color=box_colour,
                           outliercolor=outlier_colour,
                           line=dict(outliercolor=outlier_colour,
                                     outlierwidth=2)),
               line=dict(color=box_colour)),
        go.Box(y=values, name="Whiskers and Outliers", boxpoints="outliers"),
    ]

    layout = go.Layout(title="{} Outliers".format(nameOfFeature))
    figure = go.Figure(data=traces, layout=layout)
    py.iplot(figure, filename="Outliers")

In [None]:
# One box-plot figure per strongly correlated feature.
# Looping over the selection (instead of hard-coding indices 0..7) keeps this
# working when the correlation threshold yields a different number of columns.
# The encoded target itself is part of `corr_features`, so it is skipped here.
for feature_name in corr_features:
    if feature_name != "Diagnosis":
        OutliersBox(df, feature_name)

In [None]:
# Separate target and features, then score every row with Local Outlier Factor.
y = df["Diagnosis"]
X = df.drop(columns=["Diagnosis"])
columns = list(X.columns)

clf = LocalOutlierFactor()
y_pred_outlier = clf.fit_predict(X)
X_score = clf.negative_outlier_factor_

# Rows whose negative outlier factor falls below the threshold are flagged.
outlier_score = pd.DataFrame({"score": X_score})
threshold = -1.75
filter_outlier = outlier_score["score"] < threshold
outlier_index = outlier_score.index[filter_outlier].tolist()

In [None]:
# Rescale the LOF scores to [0, 1]; the most negative score gets the largest radius.
score_span = X_score.max() - X_score.min()
radius = (X_score.max() - X_score) / score_span
outlier_score["radius"] = radius

# Only the first two feature columns are used as plot coordinates.
first_feature = X.iloc[:, 0]
second_feature = X.iloc[:, 1]

plt.figure(figsize=(14, 8))
plt.scatter(first_feature.iloc[outlier_index], second_feature.iloc[outlier_index],
            color="blue", s=50, label="Outliers")
plt.scatter(first_feature, second_feature, color="k", s=3, label="Data Points")
plt.scatter(first_feature, second_feature, s=1000 * radius, edgecolors="r",
            facecolors="none", label="Outlier Scores")
plt.legend()
plt.show()

In [None]:
# Remove the flagged outlier rows from features and target.
# Both are rebuilt from `df` rather than from the current X / y, so the cell can
# be re-run safely (y becomes a NumPy array here and has no .drop on a second run).
X = df.drop(columns=["Diagnosis"]).drop(index=outlier_index)
y = df["Diagnosis"].drop(index=outlier_index).to_numpy()

In [None]:
# Stratified 80/20 split; the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=3, stratify=y
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Long-format copy of the scaled training data for a per-feature box plot.
X_train_df = pd.DataFrame(X_train_scaled, columns=columns)
X_train_df["target"] = y_train
data_melted_2 = X_train_df.melt(id_vars="target",
                                var_name="features",
                                value_name="value")

plt.figure(figsize=(18, 10))
plt.title("BOX PLOT AFTER SCALING")
sns.boxplot(data=data_melted_2, x="features", y="value", hue="target")
plt.xticks(rotation=90);

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve and the chance diagonal on the current matplotlib axes.

    Parameters
    ----------
    fpr, tpr : sequence of float
        False-positive and true-positive rates to plot against each other.
    label : optional
        Label attached to the curve; no legend is drawn here.
    """
    # Classifier curve first, then the dashed diagonal.
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot([0, 1], [0, 1], "k--")
    # Clamp both axes to the unit square.
    plt.axis([0, 1, 0, 1])
    for set_axis_label, text in ((plt.xlabel, "False Positive Rate"),
                                 (plt.ylabel, "True Positive Rate")):
        set_axis_label(text)

In [None]:
def KNN_best_params(X_train, X_test, y_train, y_test):
    """Grid-search a KNN classifier and report its train/test performance.

    Runs a 10-fold cross-validated grid search over ``n_neighbors`` (1-30) and
    ``weights`` ("uniform"/"distance"), fits a fresh classifier with the best
    parameters on the training data, prints accuracy, confusion matrices,
    precision, recall and ROC AUC, and draws the test-set ROC curve.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Matching labels. The positive-class probability is read from column 1
        of ``predict_proba``, so binary labels are expected.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    k_range = np.arange(1, 31)
    weight = ["uniform", "distance"]
    params = dict(n_neighbors=k_range, weights=weight)

    knn = KNeighborsClassifier()
    grid = GridSearchCV(knn, params, cv=10, scoring="accuracy", n_jobs=-1, verbose=2)
    grid.fit(X_train, y_train)

    print("Best training score: {} with params: {}".format(grid.best_score_, grid.best_params_))

    # Fit a classifier with the winning parameters on the full training set.
    knn = KNeighborsClassifier(**grid.best_params_)
    knn.fit(X_train, y_train)
    y_pred_test = knn.predict(X_test)
    y_pred_train = knn.predict(X_train)
    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)

    # Probability of the positive class drives the ROC curve and AUC.
    y_pred_proba = knn.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred_proba)

    print("Test Score: {}, Train Score: {}".format(acc_test, acc_train))
    print("CM TEST")
    print(cm_test)
    print("CM TRAIN")
    print(cm_train)
    print("Precision Score", precision_score(y_test, y_pred_test))
    print("recall Score", recall_score(y_test, y_pred_test))
    print("ROC Score", roc_auc_score(y_test, y_pred_proba))
    # The third argument of plot_roc_curve is the curve label, not the thresholds.
    plot_roc_curve(fpr, tpr, label="KNN")
    return grid


In [None]:
# Tune and evaluate KNN on the standardised train/test split; keep the returned search object.
grid = KNN_best_params(X_train_scaled, X_test_scaled, y_train, y_test)

In [None]:
def lgbm_best_params(X_train, X_test, y_train, y_test):
    """Randomized-search a LightGBM classifier and report its train/test performance.

    Runs a 10-fold cross-validated randomized search over the parameter grid
    below, fits a fresh classifier with the best parameters on the training
    data, prints accuracy, confusion matrices, precision, recall and ROC AUC,
    and draws the test-set ROC curve.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Matching labels. The positive-class probability is read from column 1
        of ``predict_proba``, so binary labels are expected.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    lgbm_params = {"n_estimators": [100, 200, 500, 1000, 2000],
                   "subsample": [0.6, 0.8, 1.0],
                   "max_depth": [5, 10, 15, 20, 25, 30, 35],
                   "learning_rate": [0.1, 0.01, 0.02, 0.5],
                   "min_child_samples": np.arange(2, 50)}
    lgbm = LGBMClassifier()
    search = RandomizedSearchCV(lgbm, lgbm_params, cv=10, random_state=1, n_jobs=-1, verbose=2)
    search.fit(X_train, y_train)

    print("Best training score: {} with params: {}".format(search.best_score_, search.best_params_))

    # Fit a classifier with the winning parameters on the full training set.
    lgbm = LGBMClassifier(**search.best_params_)
    lgbm.fit(X_train, y_train)
    y_pred_test = lgbm.predict(X_test)
    y_pred_train = lgbm.predict(X_train)

    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)

    # Probability of the positive class drives the ROC curve and AUC.
    y_pred_proba = lgbm.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred_proba)

    print("Test Score: {}, Train Score: {}".format(acc_test, acc_train))
    print("CM TEST")
    print(cm_test)
    print("CM TRAIN")
    print(cm_train)
    print("Precision Score", precision_score(y_test, y_pred_test))
    print("recall Score", recall_score(y_test, y_pred_test))
    print("ROC Score", roc_auc_score(y_test, y_pred_proba))
    # The third argument of plot_roc_curve is the curve label, not the thresholds.
    plot_roc_curve(fpr, tpr, label="LightGBM")

    # Return this function's own search object; the name `grid` is not defined
    # here and would resolve to an unrelated global (or raise NameError).
    return search

In [None]:
# Tune and evaluate LightGBM on the same standardised split used for KNN; keep the returned object.
random_lgbm = lgbm_best_params(X_train_scaled, X_test_scaled, y_train, y_test)

In [None]:
# Standardise the whole outlier-filtered feature matrix as input for PCA and NCA.
# NOTE(review): this rebinds `scaler` (previously fitted on X_train only) and fits
# it on every row of X, including rows that later end up in a test split, so
# test-set statistics influence the scaling.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Project the standardised features onto the first two principal components.
pca = PCA(n_components=2)
X_reduced_pca = pca.fit(X_scaled).transform(X_scaled)

In [None]:
# Two-component PCA embedding, coloured by the target label.
pca_data = pd.DataFrame(X_reduced_pca, columns=["p1", "p2"]).assign(target=y)
plt.figure(figsize=(14, 8))
sns.scatterplot(data=pca_data, x="p1", y="p2", hue="target")

In [None]:
# Stratified 80/20 split of the PCA-reduced data, using the same test_size and random_state as the earlier split.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_reduced_pca, y, test_size = 0.20, random_state = 3, stratify=y)

In [None]:
# Tune and evaluate KNN on the two PCA components.
grid_pca = KNN_best_params(X_train_pca, X_test_pca, y_train_pca, y_test_pca)

In [None]:
# NCA is supervised: fitting it on every row would let the labels of rows that are
# evaluated as "test" data afterwards shape the projection (label leakage).
# Fit on the training rows only. The indices are produced with the same
# test_size / random_state / stratify settings as the split applied to
# X_reduced_nca, so they select exactly the rows that end up in the training part.
nca_train_idx, _ = train_test_split(
    np.arange(len(X_scaled)), test_size=0.20, random_state=3, stratify=y
)
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca.fit(X_scaled[nca_train_idx], np.asarray(y)[nca_train_idx])
# Transform all rows so the output keeps the same shape and row order as before.
X_reduced_nca = nca.transform(X_scaled)

In [None]:
# Two-component NCA embedding, coloured by the target label.
nca_data = pd.DataFrame(X_reduced_nca, columns=["p1", "p2"]).assign(target=y)
plt.figure(figsize=(14, 8))
sns.scatterplot(data=nca_data, x="p1", y="p2", hue="target")

In [None]:
# Stratified 80/20 split of the NCA-reduced data, using the same test_size and random_state as the earlier splits.
X_train_nca, X_test_nca, y_train_nca, y_test_nca = train_test_split(X_reduced_nca, y, test_size = 0.20, random_state = 3, stratify=y)

In [None]:
# Tune and evaluate KNN on the two NCA components.
grid_nca = KNN_best_params(X_train_nca, X_test_nca, y_train_nca, y_test_nca)

# As a result, we obtain an accuracy score of 0.99