## Project 3
 ### Team:3
    
    Matt Keeley
    Jenny Yang
    Shay Masood
    Shreyansh Saraiya
    Fatma Butun



# OVERVIEW

### Here we aim to predict whether a breast tumor is benign or malignant, based on certain cell features, using a machine learning algorithm.
### The result will be published on Heroku.

In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.utils import to_categorical

# Read csv and do data cleaning

In [None]:
# Load the raw dataset from the CSV file in the working directory
breast_cancer_df = pd.read_csv("breast_cancer_data.csv")

In [None]:
# breast_cancer_df.tail()

In [None]:
# Display the column labels of the loaded frame (shown as the cell output)
breast_cancer_df.columns

In [None]:
# Data cleaning in one pass: first discard columns that are entirely null,
# then discard every row that still contains at least one null
breast_cancer_df = (
    breast_cancer_df
    .dropna(axis=1, how='all')
    .dropna(axis=0, how='any')
)
# show (rows, columns) remaining after cleaning
breast_cancer_df.shape

In [None]:
# Subset used for the pairplot: the columns at positions 1 through 10.
# NOTE(review): presumably 'diagnosis' plus the leading *_mean columns --
# confirm against the CSV header that no *_mean column is left out.
df_mean = breast_cancer_df.loc[:, breast_cancer_df.columns[1:11]]
df_mean

In [None]:

# Pairwise scatter plots of the df_mean columns, coloured by diagnosis;
# corner=True draws only the lower triangle of the grid
pair_grid = sns.pairplot(data=df_mean, hue='diagnosis', corner=True)
pair_grid.fig.suptitle('Pairplot for Mean Features', fontsize=20)

# savefig does not create missing folders, so make sure the target exists
os.makedirs('static/images', exist_ok=True)
plt.savefig('static/images/sns_scatter.jpg')

In [None]:
# The model output: the diagnosis column of the cleaned frame
target = breast_cancer_df.loc[:, "diagnosis"]
# target.head()

## Group 1: Select every single input column as feature

In [None]:
# define input, begin with all inputs:
# every base measurement crossed with the three statistic suffixes
base_measurements = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]
# suffix-major order: all *_mean columns, then *_se, then *_worst
all_feature_columns = [
    f'{measurement}_{suffix}'
    for suffix in ('mean', 'se', 'worst')
    for measurement in base_measurements
]
features = breast_cancer_df[all_feature_columns]

In [None]:
## make synthetic data

# # Fit a kernel density model using GridSearchCV to determine the best parameter for bandwidth
# from sklearn.model_selection import GridSearchCV
# from sklearn.neighbors import KernelDensity
# bandwidth_params = {'bandwidth': np.arange(0.01,1,0.05)}
# grid_search = GridSearchCV(KernelDensity(), bandwidth_params)
# grid_search.fit(features)
# kde = grid_search.best_estimator_


In [None]:
# # Generate 100 new sample from this dataset
# synthetic_data = kde.sample(100, random_state=42)

# synthetic_data = pd.DataFrame(synthetic_data, columns = ['radius_mean', 'texture_mean', 'perimeter_mean',
#        'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
#        'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
#        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
#        'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
#        'fractal_dimension_se', 'radius_worst', 'texture_worst',
#        'perimeter_worst', 'area_worst', 'smoothness_worst',
#        'compactness_worst', 'concavity_worst', 'concave points_worst',
#        'symmetry_worst', 'fractal_dimension_worst'])

In [None]:
# Partition the full feature set and the labels into train and test parts;
# the fixed seed keeps the partition identical from run to run
(
    X_train, X_test,
    y_train, y_test,
) = train_test_split(features, target, random_state=42)

In [None]:
# Fit the standardiser on the training partition only, so no statistics
# from the test partition leak into the scaling
X_scaler = StandardScaler()
X_scaler = X_scaler.fit(X_train)

In [None]:
# # save the scaler
# filename = 'scaler_allfeatures.sav'
# joblib.dump(X_scaler, filename)

In [None]:
# save the all-features scaler
filename = 'scaler/scaler_allfeatures.sav'
# joblib.dump does not create missing folders; create it up front
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(X_scaler, filename)

In [None]:
# Apply the fitted all-features scaler to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [None]:
# y_train

In [None]:
# Build the label encoder from the training labels
label_encoder = LabelEncoder()
label_encoder.fit(y=y_train)

In [None]:
# Map the train and test labels through the fitted encoder
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)
# show the encoded training labels
encoded_y_train

In [None]:
# Convert encoded labels to one-hot-encoding.
# Pin the width to the encoder's class count: left to itself,
# to_categorical sizes each matrix from the largest label it is given, so
# a partition that happens to miss the highest class would come out narrower.
n_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_classes)

In [None]:
# test with one model - here, a Random Forest classifier.
# The seed is fixed (same value as the train/test split) so the saved
# model and its reported score are reproducible from run to run.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train_scaled, encoded_y_train)

# persist the fitted model
filename = 'rf.sav'
joblib.dump(rf, filename)

In [None]:
# Score the fitted forest on the scaled test partition (cell output)
rf.score(X=X_test_scaled, y=encoded_y_test)

In [None]:
# # check the importance of the features and remove the least important ones ( result was very similar so I did not do it)
# sorted(zip(rf.feature_importances_, features), reverse=True)

## Group 2: Select features with the SelectKBest function

In [None]:
# SelectKBest (we will have to change the dataframes that feed into X and y):
from sklearn.feature_selection import SelectKBest, chi2, f_classif
X = features  # independent columns
y = target    # target column, i.e. the diagnosis
# Score every feature against the diagnosis. f_classif is SelectKBest's
# default score function; it is named explicitly because chi2 is imported
# above but is not the function in use. Only the per-feature scores are
# read below, so the selector's k setting does not affect the result.
bestfeatures = SelectKBest(score_func=f_classif)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
print(featureScores.nlargest(7, 'Score'))  # print the 7 best features

In [None]:
# Seven-column input for the SelectKBest feature group.
# NOTE(review): presumably the seven top-scoring columns from the printed
# ranking -- confirm the list still matches if the data changes.
features_sk = breast_cancer_df.loc[:, [
    'concave points_worst',
    'perimeter_worst',
    'concave points_mean',
    'radius_worst',
    'perimeter_mean',
    'area_worst',
    'radius_mean',
]]

In [None]:
# Partition the SelectKBest feature group. The target and seed are the
# same as in the all-features split; y_train / y_test are re-assigned here.
(
    X_train_sk, X_test_sk,
    y_train, y_test,
) = train_test_split(features_sk, target, random_state=42)

In [None]:
# Fit a separate standardiser on the SelectKBest training partition
X_scaler_sk = StandardScaler()
X_scaler_sk = X_scaler_sk.fit(X_train_sk)

In [None]:
# # save the scaler_sk
# filename = 'scaler_selectBestFeatures.sav'
# joblib.dump(X_scaler_sk, filename)

In [None]:
# save the scaler_sk
filename = 'scaler/scaler_selectBestFeatures.sav'
# joblib.dump does not create missing folders; create it up front
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(X_scaler_sk, filename)

In [None]:
# Apply the fitted SelectKBest-group scaler to both partitions
X_train_scaled_sk, X_test_scaled_sk = (
    X_scaler_sk.transform(partition) for partition in (X_train_sk, X_test_sk)
)

## Group 3: Select features based on their correlation with each other and with the diagnosis, as determined from the correlation graph in Tableau

In [None]:
# Seven-column input for the correlation-based feature group
correlation_columns = [
    'perimeter_worst', 'concave points_worst', 'symmetry_worst',
    'smoothness_worst', 'compactness_worst', 'texture_worst',
    'fractal_dimension_worst',
]
features_cs = breast_cancer_df[correlation_columns]

In [None]:
# Partition the correlation-based feature group. The target and seed are
# the same as in the earlier splits; y_train / y_test are re-assigned here.
(
    X_train_cs, X_test_cs,
    y_train, y_test,
) = train_test_split(features_cs, target, random_state=42)

In [None]:
# Fit a separate standardiser on the correlation-based training partition
X_scaler_cs = StandardScaler()
X_scaler_cs = X_scaler_cs.fit(X_train_cs)

In [None]:
# # save the scaler_sk
# filename = 'scaler_correlationFeatures.sav'
# joblib.dump(X_scaler_cs, filename)

In [None]:
# save the scaler_cs (correlation-based feature group)
filename = 'scaler/scaler_correlationFeatures.sav'
# joblib.dump does not create missing folders; create it up front
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(X_scaler_cs, filename)

In [None]:
# Apply the fitted correlation-group scaler to both partitions
X_train_scaled_cs, X_test_scaled_cs = (
    X_scaler_cs.transform(partition) for partition in (X_train_cs, X_test_cs)
)

In [None]:
# Uncomment this code to load the scaler with all features
# X_scaler = joblib.load("scaler/scaler_allfeatures.sav")

In [None]:
# Uncomment this code to load the scaler with some features removed 
# X_scaler_r = joblib.load("scaler_features_removed.sav")

In [None]:
# save train test data set for input with all features, selectBest features and correlation based features as csv file
from numpy import asarray
from numpy import savetxt

# savetxt does not create missing folders; create it up front
csv_output_dir = 'test_train_data'
os.makedirs(csv_output_dir, exist_ok=True)

# file stem -> array; each is written to <csv_output_dir>/<stem>.csv
csv_exports = {
    'X_train_scaled': X_train_scaled,
    'X_test_scaled': X_test_scaled,
    'y_train_categorical': y_train_categorical,
    'y_test_categorical': y_test_categorical,
    'encoded_y_train': encoded_y_train,
    'encoded_y_test': encoded_y_test,
    'X_train_scaled_sk': X_train_scaled_sk,
    'X_test_scaled_sk': X_test_scaled_sk,
    'X_train_scaled_cs': X_train_scaled_cs,
    'X_test_scaled_cs': X_test_scaled_cs,
}
for csv_stem, csv_array in csv_exports.items():
    savetxt(f'{csv_output_dir}/{csv_stem}.csv', csv_array, delimiter=',')