# <center> DATA PROCESSING AND VISUALIZATION IN PYTHON <br/><br/> CSCAR WORKSHOP <br/><br/> 06/07/2017
## <center> Marcio Duarte Albasini Mourao

# <center> Setup for Anaconda / Jupyter Notebook

<ul>
    <li>Go to the page https://marcio-mourao.github.io/</li>
    <li>Download the materials under "Blablabla" to your "username/Documents"</li><br/>
    
    <li>Click the Windows button (Bottom Left Corner)</li>
    <li>Click "All apps"</li>
    <li>Click "Anaconda3 (64-bit)"</li>
    <li>Click "Anaconda Prompt" </li>
    <ul>
        <li>Enter "conda update pandas"</li>
        <li>Enter "conda update matplotlib"</li>
        <li>Enter "conda update scikit-learn"</li>
    </ul><br/>
    
    <li>Click the Windows button (Bottom Left Corner)</li>
    <li>Click "All apps"</li>
    <li>Click "Anaconda3 (64-bit)"</li>
    <li>Click "Jupyter Notebook" </li><br/>
    <li>Upload 'hospital.csv' (may not be necessary)</li>
    <li>Click "Workshop.ipynb" (this should open a new tab in the browser)</li>
</ul>

# <center> Introduction

<ul>
  <li>Please sign the sign-up sheet!</li>
  <li>Don't forget to visit http://cscar.research.umich.edu/ to see what we're offering!</li>
  <li>For any questions or feedback, you can send an email to <a href="mailto:mdam@umich.edu" target="_top">Marcio</a>.</li>
</ul>

# <center> Summary of this workshop

<ul>
  <li>Summary of Python Data Types</li>
  <li>Pandas Dataframes</li>
  <ul>
      <li>Indexing and Slicing Data</li>
      <li>Applying Functions, Grouping and Merging Data</li>
      <li>Visualizing Data</li>
  </ul>
  <li>Machine Learning (if we have the time) </li>
</ul>



# <center> References

<ul>
  <li>https://www.continuum.io/anaconda-overview</li>
  <li>http://www.numpy.org/</li>
  <li>http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html</li>
  <li>http://matplotlib.org/</li>
  <li>http://pandas.pydata.org/pandas-docs/stable/10min.html</li>
</ul>

## Import the relevant packages for this session

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# <center> Summary of Python Data Types

## Python Simple Data Types
##### Integers
##### Floats
##### Strings
##### Booleans

## Python Data Structures

### Lists

In [None]:
#Lists are ordered and mutable, and may mix element types (including nested lists)
example_list = [2, 4, 'fg', 8, [3, 4]]
print(
    example_list,       #the whole list
    example_list[0],    #first element
    example_list[2:4],  #slice with the elements at positions 2 and 3
    example_list[-2],   #second element counting from the end
    sep='\n',
)
#Replaces the element at position 2 in place
example_list[2] = 20
#Shows the modified list and the first element of the nested list
print(example_list, example_list[4][0], sep='\n')

### Tuples

In [None]:
#Tuples are indexed like lists but cannot be modified after creation
example_tuple = (2, 4, 6, 8, 10)
print(example_tuple, example_tuple[1], sep='\n')
#example_tuple[2]=20 #this should produce an error

### Dictionaries

In [None]:
#Dictionaries map keys to values; values are read and updated through their key
example_dictionary = {'A': 20, 'B': 40, 'C': 60}
print(example_dictionary, example_dictionary['B'], sep='\n')
#Overwrites the value stored under the key 'C'
example_dictionary['C'] = 100
print(
    example_dictionary,
    example_dictionary.keys(),
    example_dictionary.values(),
    sep='\n',
)
#Keeps the view of the dictionary values under the name 'aux'
aux = example_dictionary.values()

### Numpy arrays

In [None]:
#Numpy arrays hold a single data type: mixing numbers with the string '4'
#makes every element of the array a string
example_array = np.array([2, 4, '4', 8, 10])
print(
    example_array,
    example_array[0],
    example_array[2:4],
    example_array[-2],
    sep='\n',
)
#The assigned integer is stored as the string '20' to match the array's data type
example_array[2] = 20
print(example_array)

### Pandas Series
#### A one dimensional labeled array

In [None]:
#Builds a Series from a dictionary: the keys become the index labels
example_dictionary = {'A':20,'B':40,'C':60}
example_series = pd.Series(example_dictionary)
print(example_series)
#Position-based access uses .iloc; plain [] with an integer on a label-indexed
#Series is deprecated in recent pandas versions
print(example_series.iloc[0])
#Label-based slice: from label 'B' to the end
print(example_series['B':])

### Pandas Dataframes, a two-dimensional labeled data structure with columns of potentially different types

In [None]:
#Row and column labels for the dataframe
row_labels = ['Row1', 'Row2', 'Row3']
column_labels = ['Column1', 'Column2']

#Each inner list is one row: a string followed by a number
d = [['df', 1.0], ['as', 3], ['bq', 5]]
print(type(d))

#Builds the dataframe from the list of rows
#(note that the name 'example_series' now refers to a DataFrame, not a Series)
example_series = pd.DataFrame(data=d, index=row_labels, columns=column_labels)
print(example_series)
#Displays the data type of each column
example_series.dtypes

# <center> Pandas dataframes

In [None]:
#Displays the help (signature and documentation) of the function; the leading '?' is IPython syntax
?pd.read_csv

In [None]:
#Defines the column names to read from the hospital.csv file and creates a dataframe named 'patients' with that data
#(usecols only selects which columns are read; the last line displays the dataframe)
col_names=['id','name','sex','age','wgt','smoke','sys','dia']
patients = pd.read_csv('hospital.csv',usecols=col_names)
patients

In [None]:
#Obtains the dimensions of the dataframe as a (number of rows, number of columns) tuple
patients.shape

In [None]:
#Obtains the data type of each column of the dataframe
patients.dtypes

In [None]:
#Displays the first lines of the dataframe (head() shows five rows when no number is given)
patients.head()

In [None]:
#Displays the last three lines of the dataframe
patients.tail(3)

In [None]:
#Returns a numpy array with the row index labels
patients.index.values

In [None]:
#Returns a numpy array with the column names
patients.columns.values

In [None]:
#Returns the values of the first five rows of the dataframe as a two-dimensional numpy array
patients.values[0:5]

In [None]:
#Provides a statistical summary of the numerical columns of the patients data
patients.describe()

In [None]:
#Provides a summary of all the columns of the patients data, including the non-numerical ones
patients.describe(include='all')

In [None]:
#Sorts the columns by their labels (axis=1) and displays the first lines
patients.sort_index(axis=1).head()

In [None]:
#Sorts the rows by age and then by sex, both in descending order, and displays the first ten lines
patients.sort_values(by=['age','sex'],ascending=[False,False]).head(10)

In [None]:
#De-identifies the data by removing the 'name' column from the dataframe
patients=patients.drop(['name'],axis=1)
patients.head()

In [None]:
#Sets the row index of the dataframe equal to the values in the 'id' column
#(the values are passed as an array, so the 'id' column itself stays in the dataframe)
patients=patients.set_index(np.array(patients['id']))
patients.head()

In [None]:
#Removes the 'id' column from the dataframe
patients=patients.drop('id',axis=1)
patients.head()

In [None]:
#Obtains the data type of each column of the dataframe
patients.dtypes

In [None]:
#Casts the 'smoke' column to the generic 'object' dtype
#(this changes the column's dtype only; the stored values are not turned into strings)
patients['smoke']=patients['smoke'].astype('object')
patients.head()

In [None]:
#Obtains the data type of each column of the dataframe
patients.dtypes

In [None]:
#Summarizes just the 'smoke' column
patients['smoke'].describe()

In [None]:
#Displays the first values of the 'sex' column (selected with bracket notation)
patients['sex'].head()

In [None]:
#Displays the first values of the 'sex' column (selected with attribute notation)
patients.sex.head()

## Indexing and Slicing

In [None]:
#Displays the first three lines of the dataframe (a slice inside [] selects rows)
patients[:3]

In [None]:
#Displays the first two rows and the columns 'sex' and 'smoke' in two ways:
#loc selects by label (row labels are looked up from their positions),
#iloc selects by position (column positions are looked up from their labels)
print(patients.loc[patients.index[[0,1]],['sex','smoke']])
print(patients.iloc[[0,1],patients.columns.get_indexer(['sex','smoke'])])

In [None]:
#Displays the rows at positions 2,3,4 and the columns at positions 0,1 of the dataframe
patients.iloc[2:5,:2]

In [None]:
#Checks which patients are over the age of 48 (the resulting boolean Series can be used as an index)
patients.age>48

In [None]:
#Creates a boolean index and uses it to select the rows of the patients with age greater than 48
patients.loc[patients.age>48,:]

## Applying Functions, Grouping and Merging Data


In [None]:
#Displays the first few lines of the dataframe as a reminder of its current columns
patients.head()

In [None]:
#Summarizes all the columns of the dataframe, including the non-numerical ones
patients.describe(include='all')

In [None]:
#Obtains the mean of each one of the numerical columns on the dataframe
#(numeric_only=True skips the object columns, such as 'sex', which cannot be averaged)
patients.mean(axis=0, numeric_only=True)

In [None]:
#Obtains the mean of each column from position 2 onwards by applying np.mean column by column
#NOTE(review): assumes every column from position 2 onwards holds numerical values - confirm against hospital.csv
patients.iloc[:,2:].apply(np.mean,axis=0)

In [None]:
#Obtains the cumulative sum down each column (from position 2 onwards) and displays the first lines
patients.iloc[:,2:].apply(np.cumsum).head()

In [None]:
#Obtains the difference between the max and min of each column from position 2 onwards
patients.iloc[:,2:].apply(lambda x: x.max() - x.min())

In [None]:
#Computes the difference between the max and min of the 'age' column directly, as a check
patients.age.max()-patients.age.min()

In [None]:
#Groups the data by 'sex' and obtains the mean of each column for each group
patients.groupby('sex').mean()

In [None]:
#Groups the data by 'smoke' and then 'sex' and obtains the standard deviation of each column for each group
patients.groupby(['smoke','sex']).std()

In [None]:
#Groups the data by 'smoke' and then 'sex' and applies a function to each group to obtain its column means
#(numeric_only=True keeps the non-numerical columns of a group from breaking the mean)
patients.groupby(['smoke','sex']).apply(lambda group: group.mean(numeric_only=True))

In [None]:
#Creates an array with three randomly drawn row labels from the dataframe (the same row may be drawn more than once)
#np.random.randint excludes its upper bound, so len(patients) is needed for the last row to be selectable
sample_index=patients.index.values[np.random.randint(0,len(patients),size=3)]
sample_index

In [None]:
#Creates an array with three string elements, one of which ('T') is repeated
new_var=np.array(['T','S','T'])
new_var

In [None]:
#Creates a new dataframe with two categorical columns: 'A' holds the sampled row labels and 'B' the new variable
new_patients = pd.DataFrame({'A':pd.Categorical(sample_index),'B':pd.Categorical(new_var)})
new_patients

In [None]:
#Obtains the data type of each column of the new dataframe
new_patients.dtypes

In [None]:
#Stacks the new dataframe on top of itself, so that every one of its rows appears twice
new_patients=pd.concat([new_patients,new_patients])
new_patients

In [None]:
#Displays the help (signature and documentation) of the function merge; the leading '?' is IPython syntax
?pd.merge

In [None]:
#Merges the original dataframe (left, matched on its row index) with the new dataframe (right, matched on column 'A')
#how='left' keeps every row of the original dataframe, whether or not it has a match on the right
patients2=pd.merge(patients,new_patients,how='left',left_index=True,right_on='A')
print(patients2.shape)
patients2.head()

In [None]:
#Writes the new merged dataframe to a csv file and to an excel file (in the sheet named 'Sheet1')
patients2.to_csv('newData.csv')
patients2.to_excel('newData.xlsx', sheet_name='Sheet1')

## Visualizing the data

In [None]:
#Creates a boxplot for each one of the numerical columns on the dataframe
patients.boxplot()

In [None]:
#Creates the boxplots separately for each group of 'sex'
patients.groupby('sex').boxplot()

In [None]:
#Creates a histogram of the numerical data on the dataframe patients, without grid lines
#(grid expects a boolean; a non-empty string such as 'off' counts as True and would draw the grid)
patients.hist(grid=False)

In [None]:
#Creates a histogram of the numerical data on the dataframe 'patients' grouped by 'smoke', without grid lines
#(grid expects a boolean; a non-empty string such as 'off' counts as True and would draw the grid)
patients.groupby('smoke').hist(grid=False)

In [None]:
#Displays the histograms of weight ('wgt') grouped by 'smoke' in the same plot
plt.rcParams.update({'font.size': 20})
plt.figure()
#One histogram per 'smoke' group, drawn on the same axes
patients.groupby('smoke').wgt.hist()
plt.legend(labels=['Smoke:No','Smoke:Yes'],loc='best')
plt.xlabel('Weight')
plt.ylabel('Frequency')
#Switches the grid off; a boolean is required because the string 'off' counts as True
plt.grid(False)

In [None]:
#Creates a scatter matrix of the numerical data in the dataframe, with histograms on the diagonal
from pandas.plotting import scatter_matrix
scatter_matrix(patients, alpha=1, figsize=(10, 10), diagonal='hist')

# <center> Machine Learning

In [None]:
#Import modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [None]:
#Print the data types in the dataframe before the encoding
print(patients.dtypes)

#Encode the two object variables ('sex' and 'smoke') as integer labels, stored in two new columns
sex_values=patients.sex.values
smoke_values=patients.smoke.values
patients['sex_enc']=LabelEncoder().fit(sex_values).transform(sex_values)
patients['smoke_enc']=LabelEncoder().fit(smoke_values).transform(smoke_values)

#Print the data types in the dataframe again, now including the two encoded columns
print(patients.dtypes)

In [None]:
#Define the covariates in X and the dependent variable (the encoded smoking status) in y
X = patients[['sex_enc','age','wgt','sys','dia']]
y = patients.smoke_enc

In [None]:
#Split the data into a training set and a test set (10% of the rows are held out for testing)
#random_state fixes the seed of the split so that it is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,random_state=13)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
#Creates a random forest classifier object with 10 trees and the Gini splitting criterion
model = RandomForestClassifier(n_estimators=10, criterion='gini')

#Fit to the training data
model.fit(X_train, y_train)

In [None]:
#Obtain the predicted (encoded) smoking status for the test set
y_pred = model.predict(X_test)
print('Predicted smoking status: \n', y_pred)

In [None]:
#Obtains the accuracy score of the predictions against the true values of the test set
print('Model score: ', metrics.accuracy_score(y_test, y_pred))

In [None]:
#Capture feature importance from the RF model (one value per column of X)
feature_imp=model.feature_importances_

#Create a horizontal bar plot of feature importance
#The bar positions and labels are derived from the data, so the plot stays in sync with the columns of X
positions = np.arange(len(feature_imp))
plt.barh(positions, feature_imp, align='center')
plt.xlabel("Feature Importances")
plt.ylabel("Features")
plt.yticks(positions, list(X.columns))
plt.grid(True)

In [None]:
#Splits the data into 10 shuffled folds and reports the mean cross validation score across the folds
kf = KFold(n_splits=10, shuffle=True)
print('Cross validation score: ', cross_val_score(model, X, y, cv=kf).mean())