# Exploratory Data Analysis with Seaborn

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time

In [None]:
# Read the breast-cancer CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv")

## Exploratory Data Analysis

In [None]:
# Show the first rows as a quick sanity check of the loaded columns.
data.head()

In [None]:
# Keep the column index in `col` and print it to list every available field.
col = data.columns
print(col)

In [None]:
# Target: the diagnosis label of every sample.
y = data['diagnosis']

# Columns that must not be used as features: the target itself, the row
# identifier, and 'Unnamed: 32' (presumably an empty artifact column produced
# when the CSV was parsed -- verify against the raw file).
drop_cols = [
    'Unnamed: 32',
    'id',
    'diagnosis',
]

# Feature matrix: everything that is left after removing those columns.
x = data.drop(columns=drop_cols)
x.head()

In [None]:
# Bar chart of how many samples fall in each diagnosis class. The series is
# passed by keyword because current seaborn releases no longer accept the
# plotted variable positionally.
ax = sns.countplot(x=y, label="count")

# Look the counts up by label instead of unpacking value_counts(), whose order
# depends on which class happens to be more frequent.
# Assumes the diagnosis labels are 'B' (benign) and 'M' (malignant).
class_counts = y.value_counts()
B = class_counts['B']
M = class_counts['M']
print('number of benign Tumors', B)
print('number of malignant Tumors', M)

In [None]:
# Summary statistics per feature column, useful for spotting scale differences.
x.describe()

## Visualizing Standardized Data with Seaborn

##### Standardizing data 

In [None]:
# From here on `data` refers to the feature frame (same object as `x`).
data = x

# Standardize every column: subtract its mean, then divide by its standard
# deviation, so all features share a comparable scale for plotting.
data_std = data.sub(data.mean()).div(data.std())

# NOTE(review): this previews the unstandardized frame; `data_std.head()` may
# have been the intent -- confirm.
data.head()

In [None]:
# Put the diagnosis label next to standardized features 0-9. That frame is in
# wide format (one column per feature); melting converts it to long format
# (one row per sample/feature pair) so seaborn can group by feature name.
data = (
    pd.concat([y, data_std.iloc[:, 0:10]], axis=1)
    .melt(id_vars='diagnosis', var_name='features', value_name='value')
)

plt.figure(figsize=(20, 10))
# One violin per feature, split by diagnosis, with quartile lines inside.
sns.violinplot(
    data=data,
    x='features',
    y='value',
    hue='diagnosis',
    split=True,
    inner='quart',
)
plt.xticks(rotation=45);

In [None]:
# Put the diagnosis label next to standardized features 10-19. That frame is in
# wide format (one column per feature); melting converts it to long format
# (one row per sample/feature pair) so seaborn can group by feature name.
data = (
    pd.concat([y, data_std.iloc[:, 10:20]], axis=1)
    .melt(id_vars='diagnosis', var_name='features', value_name='value')
)

plt.figure(figsize=(20, 10))
# One violin per feature, split by diagnosis, with quartile lines inside.
sns.violinplot(
    data=data,
    x='features',
    y='value',
    hue='diagnosis',
    split=True,
    inner='quart',
)
plt.xticks(rotation=45);

In [None]:
# Put the diagnosis label next to standardized features 20-29. That frame is in
# wide format (one column per feature); melting converts it to long format
# (one row per sample/feature pair) so seaborn can group by feature name.
data = (
    pd.concat([y, data_std.iloc[:, 20:30]], axis=1)
    .melt(id_vars='diagnosis', var_name='features', value_name='value')
)

plt.figure(figsize=(20, 10))
# One violin per feature, split by diagnosis, with quartile lines inside.
sns.violinplot(
    data=data,
    x='features',
    y='value',
    hue='diagnosis',
    split=True,
    inner='quart',
)
plt.xticks(rotation=45);

In [None]:
# Put the diagnosis label next to standardized features 0-29 (wide format, one
# column per feature), then melt to long format (one row per sample/feature
# pair) so every feature can be drawn on a shared axis.
data = (
    pd.concat([y, data_std.iloc[:, 0:30]], axis=1)
    .melt(id_vars='diagnosis', var_name='features', value_name='value')
)

# Wider canvas than the violin plots because all selected features share it.
plt.figure(figsize=(30, 10))
# Side-by-side boxes per feature, one box per diagnosis class.
sns.boxplot(
    data=data,
    x='features',
    y='value',
    hue='diagnosis',
)
plt.xticks(rotation=45);

## Using a Joint Plot for Feature Comparison

In [None]:
# Scatter plot with a fitted regression line plus marginal distributions for
# two of the "worst" features, to inspect how strongly they move together.
# Fixes: `kind` must be 'reg' ('regg' is not a valid kind), and the variables
# are passed by keyword because current seaborn does not accept them
# positionally.
sns.jointplot(
    data=x,
    x='concavity_worst',
    y='concave points_worst',
    kind='reg',
    color='#ce1414',
);

###### These two features are highly correlated

## Observing with SwarmPlots

##### A swarm plot shows every individual data point

In [None]:
# Plot styling for this figure: white grid background, muted colour palette.
sns.set(style='whitegrid', palette='muted')

# Diagnosis label plus standardized features 0-9 form a wide frame (one column
# per feature); melt it to long format (one row per sample/feature pair).
data = (
    pd.concat([y, data_std.iloc[:, 0:10]], axis=1)
    .melt(id_vars='diagnosis', var_name='features', value_name='value')
)

plt.figure(figsize=(20, 10))
# Every sample is drawn as its own point, coloured by diagnosis.
sns.swarmplot(
    data=data,
    x='features',
    y='value',
    hue='diagnosis',
)
plt.xticks(rotation=45);

In [None]:
# Plot styling for this figure: white grid background, muted colour palette.
sns.set(style='whitegrid', palette='muted')

# Diagnosis label plus standardized features 10-19 form a wide frame (one
# column per feature); melt it to long format (one row per sample/feature pair).
data = (
    pd.concat([y, data_std.iloc[:, 10:20]], axis=1)
    .melt(id_vars='diagnosis', var_name='features', value_name='value')
)

plt.figure(figsize=(20, 10))
# Every sample is drawn as its own point, coloured by diagnosis.
sns.swarmplot(
    data=data,
    x='features',
    y='value',
    hue='diagnosis',
)
plt.xticks(rotation=45);

In [None]:
# Plot styling for this figure: white grid background, muted colour palette.
sns.set(style='whitegrid', palette='muted')

# Diagnosis label plus standardized features 20-29 form a wide frame (one
# column per feature); melt it to long format (one row per sample/feature pair).
data = (
    pd.concat([y, data_std.iloc[:, 20:30]], axis=1)
    .melt(id_vars='diagnosis', var_name='features', value_name='value')
)

plt.figure(figsize=(20, 10))
# Every sample is drawn as its own point, coloured by diagnosis.
sns.swarmplot(
    data=data,
    x='features',
    y='value',
    hue='diagnosis',
)
plt.xticks(rotation=45);

## Correlation Matrix

In [None]:
# Large canvas so the annotated cells of the full feature matrix stay legible.
f, ax = plt.subplots(figsize=(25, 16))

# Pairwise correlation of all features, annotated to one decimal place.
sns.heatmap(
    x.corr(),
    ax=ax,
    annot=True,
    fmt='.1f',
    linewidth=.5,
);

## Feature Selection

#### Dropping highly correlated features, keeping one feature from each highly correlated group

In [None]:
# Features removed from the model input; per the correlation heatmap these are
# treated as redundant with a feature that is kept.
drop_cols = [
    'radius_mean',
    'perimeter_mean',
    'compactness_mean',
    'concave points_mean',
    'radius_se',
    'perimeter_se',
    'radius_worst',
    'perimeter_worst',
    'compactness_worst',
    'concave points_worst',
    'compactness_se',
    'concave points_se',
    'texture_worst',
    'area_worst',
]

# Reduced feature frame used by the classification cells.
df = x.drop(columns=drop_cols)
df.head()

In [None]:
# Re-draw the correlation heatmap on the reduced feature set to check that the
# strongest pairwise correlations are gone.
f, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(
    df.corr(),
    ax=ax,
    annot=True,
    fmt='.1f',
    linewidth=.5,
);

## Classification using XGBoost

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Recent XGBoost releases reject string class labels and expect integer codes
# 0..n_classes-1, so encode the diagnosis before splitting. Categories are
# sorted, so the codes follow the alphabetical order of the labels.
y_encoded = y.astype('category').cat.codes

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df, y_encoded, test_size=0.3, random_state=42
)

# Baseline gradient-boosted classifier trained on the reduced feature set.
clf_XGB = xgb.XGBClassifier(random_state=42)
clf_XGB = clf_XGB.fit(x_train, y_train)

In [None]:
# Predict once on the held-out set and reuse the result for both metrics.
y_pred = clf_XGB.predict(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))

# Confusion matrix (rows: true class, columns: predicted class) as a heatmap
# with integer cell annotations.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d');


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2 

In [None]:
# Score every training feature against the label with the chi-squared test and
# keep the ten highest-scoring ones.
select_feature = SelectKBest(score_func=chi2, k=10)
select_feature.fit(x_train, y_train)

# Scores are listed in the same order as the columns printed below.
print('score List: ', select_feature.scores_)
print('Feature List: ', x_train.columns)

In [None]:
# Reduce both splits to the columns chosen by the fitted selector.
x_train2 = select_feature.transform(x_train)
x_test2 = select_feature.transform(x_test)

# Same seed as the first classifier so the two models are configured alike.
clf_XGB2 = xgb.XGBClassifier(random_state=42).fit(x_train2, y_train)

# Predict once and reuse the result for both metrics.
y_pred2 = clf_XGB2.predict(x_test2)
print('Accuracy is :', accuracy_score(y_test, y_pred2))

# Plot the confusion matrix of THIS model (cm2); the previous code re-plotted
# the first model's matrix by mistake.
cm2 = confusion_matrix(y_test, y_pred2)
sns.heatmap(cm2, annot=True, fmt='d');

## Feature Extraction using PCA

#### Features should be normalized before performing PCA

In [None]:
# Split the full (un-reduced) feature frame; PCA does its own dimensionality
# reduction, so no columns are dropped beforehand.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Mean-normalize with statistics computed on the training split only.
train_mean = x_train.mean()
train_range = x_train.max() - x_train.min()

x_train_norm = (x_train - train_mean) / train_range
# The test split is scaled with the TRAINING statistics so both splits live in
# the same coordinate system and no test-set information leaks into the
# preprocessing.
x_test_norm = (x_test - train_mean) / train_range

In [None]:
from sklearn.decomposition import PCA

# Fit PCA with all components on the normalized training features.
pca = PCA()
pca.fit(x_train_norm)

# Running total of the variance explained by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Component counts start at 1 so the x-axis matches its label; plotting the
# bare array would place the first component at x = 0.
n_components = np.arange(1, len(cumulative_variance) + 1)

plt.figure(1, figsize=(10,8))
sns.lineplot(x=n_components, y=cumulative_variance)
plt.xlabel('no. of components')
plt.ylabel('cumulative explained variance')