In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import datasets, linear_model, model_selection, metrics, ensemble
import matplotlib.pylab as plt
import seaborn as sns

In [None]:
#Fetch the breast cancer dataset from sklearn.datasets and assemble a single
#dataframe: the 'diagnosis' target column first, followed by the feature columns.
cancer_ds = datasets.load_breast_cancer()

target = pd.DataFrame({'diagnosis': cancer_ds.target})
features = pd.DataFrame(cancer_ds.data, columns = cancer_ds.feature_names)
df = pd.concat([target, features], axis = 'columns')

In [None]:
#Display the dataframe's column labels as a plain numpy array
np.asarray(df.columns)

In [None]:
#Summarise the combined dataframe: DataFrame.info() reports each column's dtype and non-null count
df.info()

In [None]:
#Normalise the column labels: every space becomes an underscore.
#regex = False makes this a literal substring replacement.
df.columns = df.columns.str.replace(' ', '_', regex = False)

In [None]:
#Heatmap of the pairwise correlations between all columns except the first
#(column 0 is the diagnosis target, so only the features are compared).
feature_cols = df.columns[1:]
tick_positions = np.arange(len(feature_cols))
corr_matrix = df[feature_cols].corr()

figure = plt.figure(figsize = [7, 7])
#imshow returns the image artist; it is kept under the name 'ax' as before.
#vmin/vmax pin the colour scale to the full correlation range [-1, 1].
ax = plt.imshow(corr_matrix,
                cmap = 'RdYlBu_r',
                interpolation = 'None',
                vmin = -1.0,
                vmax = 1.0)

#One tick per feature on both axes, labelled with the column name
plt.xticks(tick_positions, feature_cols, rotation = 90);
plt.yticks(tick_positions, feature_cols);
plt.colorbar();
plt.grid();

In [None]:
#Hold out 20% of the rows for testing; the remaining 80% form the training set.
#random_state is fixed so the split is the same on every run.
train, test = model_selection.train_test_split(df, random_state = 101, train_size = 0.8)

#Report the shape of each partition
for message, subset in [('Training data shape = {}', train),
                        ('Testing data shape = {}', test)]:
    print(message.format(subset.shape))

In [None]:
#Training data: the label series, plus the features with every column z-scored
y_train = train['diagnosis']

train_features = train.drop(columns = ['diagnosis'])
x_train = train_features.apply(stats.zscore)

In [None]:
#Testing data
#The test features are standardised with the TRAINING set's per-column mean and
#standard deviation, so they pass through exactly the same transform as the
#z-scored training features the models are fitted on. Z-scoring the test set
#with its own statistics would shift and rescale it differently from the
#training data and let test-set information into the preprocessing.
y_test = test['diagnosis']

train_features = train.drop(labels = ['diagnosis'], axis = 1)
train_mean = train_features.mean()
train_std = train_features.std(ddof = 0)   #ddof = 0 matches the stats.zscore default

#Subtraction/division align on column labels, so each feature uses its own statistics
x_test = test.drop(labels = ['diagnosis'], axis = 1)
x_test = (x_test - train_mean) / train_std

In [None]:
#Classify with a Random Forest
#random_state is fixed so the fitted forest (and therefore the scores and
#importances below) is identical on every run; 101 mirrors the seed used for
#the train/test split.
rf = ensemble.RandomForestClassifier(random_state = 101)
rf_model = rf.fit(x_train, y_train)
y_hat = rf_model.predict(x_test)

#Show model accuracy and the confusion matrix on the held-out test set
print(metrics.accuracy_score(y_test, y_hat))
print(metrics.confusion_matrix(y_test, y_hat))

#Plot feature importances (one bar per feature, labelled with the column name)
importances = rf_model.feature_importances_
bar_positions = np.arange(0, importances.shape[0])

plt.figure()
ax = sns.barplot(x = bar_positions, y = importances);
plt.xticks(bar_positions, x_test.columns, rotation = 90);
plt.ylabel('Feature importance');

In [None]:
#Repeat with standard logistic regression (default hyperparameters)
log_reg = linear_model.LogisticRegression()
log_model = log_reg.fit(x_train, y_train)
y_hat = log_model.predict(x_test)

#Show model accuracy and the confusion matrix on the held-out test set
print(metrics.accuracy_score(y_test, y_hat))
print(metrics.confusion_matrix(y_test, y_hat))

#coef_ is indexed at row 0 to obtain a 1-D vector with one weight per feature
coef = log_model.coef_[0]
bar_positions = np.arange(0, coef.shape[0])

#Plot coefficients (one bar per feature, labelled with the column name)
plt.figure()
ax = sns.barplot(x = bar_positions, y = coef);
plt.xticks(bar_positions, x_test.columns, rotation = 90);
plt.ylabel('Coefficient');