***

# 1. Importing Packages and Reading Data  

***


## Importing Packages

**Below is the steps we would go through to load, view and visualize any csv data into a pandas dataframe.**  

In [1]:
import pandas as pd                                 # required to load and read data and put in dataframe.

**We add python packages we require.**  

In [2]:
import numpy as np                                  # apply math functions to arrays
import matplotlib.pyplot as plt                     # features to help with plotting figures
import seaborn as sns                               # data visualization (eg. heatmap)
from IPython.display import display, HTML 
#import wget

## Reading Data 

**To read data in the form of .csv or comma seperated file, you need pd.read_csv(). Delimiter can be set as well whether header or index column is required.**  

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
#cleveland_data = wget.download(url)
df = pd.read_csv(url)
#df = pd.read_csv(url, header = None, index_col = False, delimiter = ',')
#df.columns = ['Age','Sex','Chest_Pain','RestBP','Chol','FBS', 'RestECG', 'MaxHR', 'Exang', 'Oldpeak', 'Slope_ST_seg', 'Ca', 'Thal', 'Num']

df.head()

Unnamed: 0,63.0,1.0,1.0.1,145.0,233.0,1.0.2,2.0,150.0,0.0,2.3,3.0,0.0.1,6.0,0
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0


* * *

# 2. Gathering Basic Info  

* * *

## A:   .info()
**To get basic info from the dataset, we use .info()**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 14 columns):
63.0     302 non-null float64
1.0      302 non-null float64
1.0.1    302 non-null float64
145.0    302 non-null float64
233.0    302 non-null float64
1.0.2    302 non-null float64
2.0      302 non-null float64
150.0    302 non-null float64
0.0      302 non-null float64
2.3      302 non-null float64
3.0      302 non-null float64
0.0.1    302 non-null object
6.0      302 non-null object
0        302 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.1+ KB


## B:   .dtypes()
**To get datatypes of each column, we can use .dtypes**  

In [5]:
print(df.dtypes)                                            #not working

63.0     float64
1.0      float64
1.0.1    float64
145.0    float64
233.0    float64
1.0.2    float64
2.0      float64
150.0    float64
0.0      float64
2.3      float64
3.0      float64
0.0.1     object
6.0       object
0          int64
dtype: object


## C:   .describe()

**To get more details about each column, we can use .describe()**  
  

In [6]:
df.describe()

Unnamed: 0,63.0,1.0,1.0.1,145.0,233.0,1.0.2,2.0,150.0,0.0,2.3,3.0,0
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.410596,0.678808,3.165563,131.645695,246.738411,0.145695,0.986755,149.60596,0.327815,1.03543,1.596026,0.940397
std,9.040163,0.467709,0.953612,17.612202,51.856829,0.353386,0.994916,22.912959,0.470196,1.160723,0.611939,1.229384
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0
50%,55.5,1.0,3.0,130.0,241.5,0.0,0.5,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


## D:   .isnull().sum()
   
**We can check the number of null/empty values a column has by using .isnull().sum()**  


In [7]:
print (df.isnull().sum())

63.0     0
1.0      0
1.0.1    0
145.0    0
233.0    0
1.0.2    0
2.0      0
150.0    0
0.0      0
2.3      0
3.0      0
0.0.1    0
6.0      0
0        0
dtype: int64


## E:   .nunique()
  
**It is useful to see the number of unique values in each column using .nunique()**  

In [8]:
#unique counts per catogory -- issues with num column?
#print(df.apply(lambda x: x.nunique()))                          #this method works for nunique             
print (df.nunique())                                            #working after adjusting panda, conda, switched to py2

63.0      41
1.0        2
1.0.1      4
145.0     50
233.0    152
1.0.2      2
2.0        3
150.0     91
0.0        2
2.3       40
3.0        3
0.0.1      5
6.0        4
0          5
dtype: int64


## F:   .value_counts()


In [9]:
print df.Age.value_counts()

AttributeError: 'DataFrame' object has no attribute 'Age'

***

# 3. Cleaning Data into Readable Format  

***
    
## A.   CSV cleaning


In [None]:
df = pd.read_csv(url, header = None, index_col = False, delimiter = ',')
df.columns = ['Age','Sex','Chest_Pain','RestBP','Chol','FBS', 'RestECG', 'MaxHR', 'Exang', 'Oldpeak', 'Slope_ST_seg', 'Ca', 'Thal', 'Num']


for j in range(len(df['Ca'])):
    if df.iloc[j]['Ca']=='?':
        df.set_value(j,'Ca','0.0')
df['Ca']=df['Ca'].astype(float)

#df[df['Ca']=='?'].Ca

for j in range(len(df['Thal'])):
    if df.iloc[j]['Thal']=='?':
        df.set_value(j,'Thal','0.0')
df['Thal']=df['Thal'].astype(float)
df.head()

## B1:   .dtypes()
    
**Now after cleaning, lets look at the new data types and the mean, std, min, max etc. of all the columns again**

In [None]:
df.dtypes

## B2:   .describe()

In [None]:
display(HTML(df.describe().to_html()))

***

# 4. Manipulating Data  

***

- ## Groupby

**Find trends by grouping data according to 'Num' output (0 - no heart disease, 4 - severe heart disease).** 

In [None]:
group_by_num= df.groupby(["Num","Sex"])
print group_by_num.Age.agg(np.max) #gives mean values of all categories based on chest pain
print group_by_num
for key, item in group_by_num:
    print key
    #display(HTML(group_by_num.get_group(key).to_html()))
    #print group_by_num.get_group(key)

**From .nunique() and data provided able to figure out that predicted heart disease outcome ('Num') has 5 categories (0,1,2,3,4). Group each of the 'Num' outputs and find the means for each category.**

In [None]:
group2 = df.groupby("Num")
print (group2.mean())

**To access information from a specific 'Num' category (eg. 4), use get_group(4)**

In [None]:
group2 = df.groupby("Num")
print(group2.Age.get_group(4), "\n\n")


***

# 5. Data Visualization  

***

**Using matplotlib.pyplot to make bar charts**

## A:   **General Plot**  
  
**Let us try to visualize all the data at once**  

In [None]:
df.plot()
plt.title('Plot of All Data', fontweight='bold')
plt.xlabel('Index')
plt.ylabel('Value')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)    #place legend box to right of plot
plt.savefig('images/all_plot.png', bbox_inches="tight")
plt.show()

## B:   **Log Plot**  

**Plot data on a log scale to identify the scale range of outputs.**

In [None]:
df.plot()
plt.yscale('log')                                                 # makes plot log scale 


plt.title('Log Plot of All Data', fontweight='bold')              # figure features
plt.xlabel('Index')
plt.ylabel('Log Scale')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 
plt.savefig('images/log_plot.png', bbox_inches="tight")
plt.show()

## C:   **Histogram**  

**Histogram to assess resting blood pressure distribution**

In [None]:
#RestBP distribution of study
x = df['RestBP']
num_bins = 5
n, bins, patches = plt.hist(x, num_bins, alpha=0.5)
plt.title('RestBP', fontweight='bold')
plt.xlabel('RestBP')
plt.ylabel('Count')
plt.savefig('images/histogram.png')
plt.show()

In [None]:
#normalize distribution -- random
age_hist = np.random.normal(size = 1000)
plt.hist(age_hist, bins=30)
plt.ylabel('Random')
plt.show()

## D:   **Scatterplot**  

**Creating scatterplot of maximum heart rate vs. rest blood pressure**

In [None]:
#making markers that vary in size and color; multi-dimensional plot

#scatterplot
x = df['MaxHR']
y = df['RestBP']
plt.title('Rest BP vs. Max HR', fontweight='bold')
plt.xlabel('MaxHR')
plt.ylabel('RestBP')
#plt.errorbar(x,y,linestyle= 'None', marker='s')
plt.scatter(x, y, alpha=0.5)
plt.savefig('images/scatterplot.png')

## E:   **Bar Graph**  

**Plot and organize the different outputs of 'Num' relative to Age. 5 output graphs correlating to 'Num' category 0,1,2,3,4**

In [None]:
for i in range(5):
    group1=group2.Age.get_group(i)
    group1= group1.value_counts().sort_index()
    age_pos = np.arange(len(group1.index))
    plt.figure(figsize=(7,3))
    plt.xlabel('Age')
    plt.ylabel('Count')
    #stri = 'Num output ' +str(i)
    #plt.title(stri)
    plt.title('Age Distribution for Num')
    plt.bar(age_pos,group1.values)
    plt.xticks(age_pos,group1.index, rotation =90)
    plt.show()


## F:   **Heatmap**  

**Create heatmap using .corr() to visualize correlation of all categories. Diagonal should all be 1.0 because comparing against itself.**

In [None]:
#heatmap
f,ax = plt.subplots(figsize=(18, 16))
sns.heatmap(df.corr(), annot=True, linewidths=.8, fmt= '.1f',ax=ax)
plt.savefig('images/heatmap.png')
plt.show()