In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif

# **Data Loading**

In [None]:
# Location of the diagnosis CSV. NOTE(review): this looks like a Colab session path —
# the file must exist there (or the path be adjusted) before this cell can run.
DATA_PATH = "/content/BCD_data.csv"
data = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '/content/BCD_data.csv'

In [None]:
# Report the (rows, columns) dimensions of the loaded frame.
dataset_shape = data.shape
print("Dataset shape:", dataset_shape)

In [None]:
# Descriptive statistics of the columns, rendered as a rich table.
summary_stats = data.describe()
display(summary_stats)

In [None]:
# Print column names, non-null counts and dtypes of the loaded frame.
data.info()

In [None]:
# Materialise the column index as a plain list so it prints compactly.
column_names = list(data.columns)
print("Column names:", column_names)

In [None]:
# Columns stored with the generic `object` dtype (typically strings) need encoding later.
object_columns = data.select_dtypes(include="object").columns.tolist()
print("Object type columns:", object_columns)


In [None]:
# Count missing entries per column.
missing_counts = data.isna().sum()
print("Missing values:\n", missing_counts)

## **Data Preprocessing**

In [None]:
# Remove the 'Unnamed: 32' column (presumably an empty artifact of the CSV export — confirm).
# errors='ignore' keeps the cell idempotent: re-running it after the column is gone
# no longer raises KeyError.
data = data.drop(columns='Unnamed: 32', errors='ignore')

### Dealing with categorical data

In [None]:
# Distinct labels present in the target column.
diagnosis_labels = data['diagnosis'].unique()
diagnosis_labels

In [None]:
# One-hot encode the target; drop_first=True keeps only the indicator for the
# second category, so a two-class label becomes a single column.
data = pd.get_dummies(data, columns=['diagnosis'], drop_first=True)

In [None]:
# Give the indicator column its original, shorter name.
# Reassigning (instead of inplace=True) keeps the step explicit.
data = data.rename(columns={'diagnosis_M': 'diagnosis'})

In [None]:
# The dummy column may come back as boolean; store it as an integer 0/1 target.
data = data.astype({'diagnosis': int})

In [None]:
# Show the first four rows to verify the preprocessing steps.
print("\nData after preprocessing:")
preprocessed_preview = data.head(n=4)
display(preprocessed_preview)

# **Visualizations**

### Countplot of the target variable

In [None]:
# Bar chart of class frequencies, followed by the exact counts per class.
sns.countplot(data=data, x='diagnosis', palette='Set2')
plt.show()

malignant_total = data['diagnosis'].eq(1).sum()
benign_total = data['diagnosis'].eq(0).sum()
print(f"Total Malignant (1) cases: {malignant_total}")
print(f"Total Benign (0) cases: {benign_total}")

### Correlation matrix and heatmap

In [None]:
# Correlate every remaining column with the target and plot the coefficients.
# Note: 'diagnosis' itself is still in data_corr, so its own bar is 1.0.
data_corr = data.drop(columns='id')
target_correlations = data_corr.corrwith(data_corr['diagnosis'])

ax = target_correlations.plot.bar(
    figsize=(20, 10),
    title='Correlation with Diagnosis (1: Malignant)',
    rot=45,
    grid=True,
)
ax.set_ylabel('Correlation Coefficient')
plt.show()

In [None]:
# Pairwise Pearson correlations between all columns of data_corr (target included).
corr = data_corr.corr(method='pearson')
corr

In [None]:
# Annotated heatmap of the full correlation matrix.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Features')
plt.show()

## Drop highly correlated features (multicollinearity)

In [None]:
# Absolute pairwise correlations between features only (identifier and target excluded).
corr_matrix = data.drop(columns=['id', 'diagnosis']).corr().abs()

# Keep only the strict upper triangle so each feature pair is inspected once;
# everything on or below the diagonal becomes NaN.
upper_triangle_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_triangle_mask)

# A column is flagged when it correlates above 0.9 with any earlier column
# (NaN comparisons evaluate to False, so masked cells never trigger a drop).
exceeds_threshold = (upper > 0.9).any(axis=0)
to_drop_multicollinearity = upper.columns[exceeds_threshold].tolist()

print("Features to drop due to high multicollinearity:")
print(to_drop_multicollinearity)

# New frame without the flagged features; `data` itself is left untouched.
data_dropped_multicollinearity = data.drop(columns=to_drop_multicollinearity)

print("\nShape of data after dropping highly correlated features:")
print(data_dropped_multicollinearity.shape)

In [None]:
# Correlation matrix of the features that survived the multicollinearity filter.
# Both non-feature columns are excluded: the target ('diagnosis') and the row
# identifier ('id'), which is still present in data_dropped_multicollinearity and
# would otherwise show up in a heatmap that is titled as a feature matrix.
corr_matrix_dropped = (
    data_dropped_multicollinearity
    .drop(columns=['id', 'diagnosis'])
    .corr()
)

# Plot the correlation heatmap of the remaining features
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix_dropped, annot=False, cmap='coolwarm')
plt.title('Correlation Matrix of Features After Dropping Highly Correlated Features')
plt.show()

# **Applying ML Techniques**


In [None]:
# Separate the target from the predictors.
# 'id' is an identifier rather than a measurement, so it is excluded from X.
y = data['diagnosis']
X = data.drop(columns=['id', 'diagnosis'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

train_shape, test_shape = X_train.shape, X_test.shape
print("\nShape of training data:", train_shape)
print("Shape of testing data:", test_shape)

# **Feature Scaling**

In [None]:
# Create the scaler used to standardise the features; it is fitted on the training split in the next cell.
sc = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [None]:
# The scaler returns a bare array; re-attach the column names for a readable preview.
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)
display(X_test_scaled_df.head())

# **Logistic Regression**

In [None]:
# Baseline logistic regression: fit on the scaled training split, predict the scaled test split.
LC = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)
y_pred_lc = LC.predict(X_test_scaled)

# **Random Forest**

In [None]:
# Random forest with default hyperparameters and a fixed seed, trained on the scaled split.
RF = RandomForestClassifier(random_state=0).fit(X_train_scaled, y_train)
y_pred_rf = RF.predict(X_test_scaled)

# **SVM**

In [None]:
# Support vector classifier with default settings, trained on the scaled split.
svm_model = SVC(random_state=0).fit(X_train_scaled, y_train)
y_pred_svm = svm_model.predict(X_test_scaled)

# **KNN**

In [None]:
# k-nearest-neighbours classifier with default settings, trained on the scaled split.
knn_model = KNeighborsClassifier().fit(X_train_scaled, y_train)
y_pred_knn = knn_model.predict(X_test_scaled)

# **Naive Bayes**

In [None]:
# Gaussian naive Bayes classifier, trained on the scaled split.
nb_model = GaussianNB().fit(X_train_scaled, y_train)
y_pred_nb = nb_model.predict(X_test_scaled)

# **Model Evaluation**

In [None]:
# Score every model's test-set predictions with the same four metrics and
# collect one row per model.
models = ['Logistic Regression', 'Random Forest', 'SVM', 'KNN', 'Naive Bayes']
predictions = [y_pred_lc, y_pred_rf, y_pred_svm, y_pred_knn, y_pred_nb]

# Order matters: it must match the metric columns of result_df below.
metric_functions = (accuracy_score, f1_score, precision_score, recall_score)

results_list = [
    [model_name, *(metric(y_test, y_pred) for metric in metric_functions)]
    for model_name, y_pred in zip(models, predictions)
]

result_df = pd.DataFrame(
    results_list,
    columns=['Model', 'Accuracy', 'F1 score', 'Precision score', 'Recall score'],
)

In [None]:
# Render the per-model metrics table (one row per model, one column per metric).
display(result_df)

In [None]:
# Confusion matrix of the baseline logistic regression on the test split
# (rows are true classes, columns are predicted classes).
cm_lc = confusion_matrix(y_test, y_pred_lc)
print("\nConfusion Matrix for Logistic Regression:")
print(cm_lc)

In [None]:
# 10-fold cross-validation of each model on the scaled training split,
# reporting mean accuracy and its spread across folds.
estimators = (LC, RF, svm_model, knn_model, nb_model)

print("\nCross Validation Results:")
for model_name, model in zip(models, estimators):
    accuracies = cross_val_score(model, X_train_scaled, y_train, cv=10)
    mean_pct = accuracies.mean() * 100
    std_pct = accuracies.std() * 100
    print(f"{model_name} - Accuracy : {mean_pct:.2f}%")
    print(f"{model_name} - Std Deviation : {std_pct:.2f}%")

# **Hyperparameter Tuning**

In [None]:
# Search space for tuning the logistic regression.
# (RandomizedSearchCV is already imported in the notebook's import cell, so the
# redundant mid-notebook import was removed.)
parameters = {
    'penalty': ['l1', 'l2'],                            # regularisation type
    'C': [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2],    # inverse regularisation strength
    'solver': ['liblinear', 'saga'],                    # solvers paired with both penalties above
}

In [None]:
# Randomised search over `parameters` for the logistic regression:
# 10 sampled candidates, each scored by ROC AUC with 10-fold cross-validation.
random_search_lr = RandomizedSearchCV(
    estimator=LC,
    param_distributions=parameters,
    n_iter=10,
    scoring='roc_auc',
    n_jobs=-1,        # use all available cores
    cv=10,
    verbose=3,
    random_state=0,   # reproducible candidate sampling
)

In [None]:
# Run the hyperparameter search on the scaled training split only (the test split stays untouched).
random_search_lr.fit(X_train_scaled, y_train)

In [None]:
# Show the estimator configured with the winning hyperparameters.
best_lr_estimator = random_search_lr.best_estimator_
print("\nBest estimator from Randomized Search (Logistic Regression):")
display(best_lr_estimator)

In [None]:
# Cross-validated score of the best candidate (ROC AUC, per the search's scoring).
best_lr_score = random_search_lr.best_score_
print("\nBest ROC AUC score from Randomized Search (Logistic Regression):")
print(best_lr_score)

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
best_lr_params = random_search_lr.best_params_
print("\nBest parameters from Randomized Search (Logistic Regression):")
print(best_lr_params)

# **Final Model (Logistic Regression) - Train with best parameters**

In [None]:
# Build the final logistic regression from the tuned hyperparameters
# (the searched keys are exactly 'penalty', 'C' and 'solver') and train it
# on the scaled training split.
tuned_params = random_search_lr.best_params_
LRF = LogisticRegression(**tuned_params, random_state=0)

LRF.fit(X_train_scaled, y_train)

In [None]:
# Score the tuned model on the held-out test split with the same four metrics
# used for the baseline models.
y_pred_lrf = LRF.predict(X_test_scaled)

acc_lrf, f1_lrf, prec_lrf, rec_lrf = (
    scorer(y_test, y_pred_lrf)
    for scorer in (accuracy_score, f1_score, precision_score, recall_score)
)

In [None]:
# Add the tuned model's metrics to the comparison table.
# Any existing row for the tuned model is removed first, so re-running this cell
# replaces the row instead of appending a duplicate each time.
final_model_label = "Final Model LR (Tuned)"
final_model_result = pd.DataFrame(
    [[final_model_label, acc_lrf, f1_lrf, prec_lrf, rec_lrf]],
    columns=['Model', 'Accuracy', 'F1 score', 'Precision score', 'Recall score'],
)
result_df = pd.concat(
    [result_df[result_df['Model'] != final_model_label], final_model_result],
    ignore_index=True,
)

In [None]:
# Render the comparison table again, now including the row for the tuned logistic regression.
display(result_df)

# **Cross validation**

In [None]:
# 10-fold cross-validation of the tuned logistic regression on the scaled training split.
# Output uses the same two-decimal percentage format as the per-model
# cross-validation report earlier in the notebook.
print("\nCross Validation Results for Final Tuned Logistic Regression Model:")
accuracies_lrf = cross_val_score(estimator=LRF, X=X_train_scaled, y=y_train, cv=10)
print(f"Accuracy : {accuracies_lrf.mean()*100:.2f}%")
print(f"Std Deviation : {accuracies_lrf.std()*100:.2f}%")

## Feature Selection

In [None]:
# Rank features by the ANOVA F-statistic on the training split and keep the 10 best;
# the fitted selector is then applied unchanged to both splits.
selector = SelectKBest(score_func=f_classif, k=10)  # adjust k to keep more or fewer features
selector.fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

In [None]:
# Map the selector's boolean mask back onto the original column names.
selected_mask = selector.get_support()
selected_features = X.columns[selected_mask]
print("\nSelected features after applying SelectKBest on scaled data:")
print(selected_features)

### Single Observations

In [None]:
# One hand-entered observation (a single row of 30 raw, unscaled values) used to
# demonstrate prediction with the final tuned Logistic Regression model.
# NOTE(review): values are assumed to follow the column order of X — confirm against X.columns.
single_obs = [[17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189]]


In [None]:
# Scale the single observation with the scaler fitted on the training data.
# The scaler was fitted on a DataFrame, so the observation is wrapped in a
# DataFrame carrying the same column names: this avoids scikit-learn's
# "X does not have valid feature names" warning and fails fast (ValueError)
# if the number of values does not match the training features.
single_obs_df = pd.DataFrame(single_obs, columns=X_train.columns)
single_obs_scaled = sc.transform(single_obs_df)

In [None]:
# Classify the scaled observation with the tuned model; the result is an array with one label.
predicted_diagnosis = LRF.predict(single_obs_scaled)

# **Interpret the prediction**


In [None]:
# Translate the numeric label into a readable message (1 = malignant, anything else = benign).
prediction_messages = {
    True: "\nPrediction for the single observation: Malignant (1)",
    False: "\nPrediction for the single observation: Benign (0)",
}
is_malignant = bool(predicted_diagnosis[0] == 1)
print(prediction_messages[is_malignant])