<a href="https://colab.research.google.com/github/mverschoor-phd/Modeling/blob/main/Python_Plotting_Template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Project Name]

* Author: 
* Date created: 
* Project: 
* Study: 
* Related files: 

In [None]:
#Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.express as px
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()

In [None]:
#Read excel data file
#"[file path]" is a template placeholder: replace it with the path to the workbook before running
df = pd.read_excel("[file path]")

In [None]:
#Look at first rows of the raw dataframe to confirm the file loaded as expected
df.head()

In [None]:
#Drop the non-numeric variables ('exp', 'tpp_pass'); df itself is left unchanged
df_map = df.drop(columns=['exp', 'tpp_pass'])

In [None]:
#Look at first 5 rows of new map dataframe
df_map.head()

In [None]:
#Descriptive stats on map dataframe (one summary column per variable in df_map)
df_map.describe()

In [None]:
# Interactive plotly bar chart of [var-y].
# Repeat this cell for each variable as needed (or write a function and plot each variable in a loop).
fig = px.bar(
    df,
    x="[category-var]",
    y="[var-y]",
    # Axis/legend titles shown instead of the raw column names
    labels={'[category-var]': 'Category', '[var-y]': 'Variable y'},
    # Bars are coloured by the same variable that sets their height
    color='[var-y]',
)

# Render the plot
fig.show()

In [None]:
# Pandas pivot with multiple variables.
# With `values` + `columns` and no `index`, the result has one row per listed
# variable and one column per distinct category value (cells are aggregated
# with pivot_table's default aggregation).
# NOTE(review): the category column must still be present in df_map; if it was
# dropped as non-numeric when df_map was built, pivot from df instead - confirm.
heatmap_data = pd.pivot_table(
    df_map,
    values=['[var-1]', '[var-2]', '[var-3]'],  # list is now closed (was a SyntaxError)
    columns='[category-var]',  # same placeholder spelling as the bar-chart cell
)
heatmap_data.head()

In [None]:
#Give the heatmap rows readable labels (raw SQL column names don't look good in plots)
#Index labels not listed in the mapping are left as they are
heatmap_data = heatmap_data.rename(
    index={
        '[var-1]': '[var-1 name]',
        '[var-2]': '[var-2 name]',
    }
)

In [None]:
#Heatmap with dendrogram of key donor screen variables with z-score standardization
#z_score must be 0 (standardize each row) or 1 (standardize each column);
#the string "int" is the parameter's type from the docs, not a valid value.
#Rows of heatmap_data are the variables, so 0 puts each variable on a common scale.
sns.clustermap(heatmap_data, z_score=0, cmap="Blues", metric="correlation")

In [None]:
# Pairwise correlation between the columns of heatmap_data
corr = heatmap_data.corr()

# Boolean mask that is True on and above the diagonal, so only the lower
# triangle is drawn (the matrix is symmetric)
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes that will hold the heatmap
f, ax = plt.subplots(figsize=(11, 9))

# Custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the masked heatmap with square cells on the axes created above
# NOTE(review): vmax=0.3 caps the colour scale, so every correlation above 0.3
# gets the same colour - confirm this is intended for your data
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=0.3,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)

In [None]:
#Normalize features for machine learning
#StandardScaler is fitted on df_map and applied to it in one step; x holds the scaled values
x = StandardScaler().fit_transform(df_map)

In [None]:
#Check normalized data mean and standard deviation
#No axis is given, so both statistics are computed over all values of x together, not per feature
np.mean(x),np.std(x)

In [None]:
#Generic column names (feature0, feature1, ...) - one per column of the normalized array
feat_cols = [f'feature{i}' for i in range(x.shape[1])]

In [None]:
#Convert the normalized array to a dataframe using the generated feature names
normalised_data = pd.DataFrame(x,columns=feat_cols)

In [None]:
#Look at first rows of the normalized dataframe
normalised_data.head()

In [None]:
#Run PCA
#Keep two principal components
pca_data = PCA(n_components=2)

#Fit the PCA on the normalized features x and project x onto the two components
princ_comp_data = pca_data.fit_transform(x)

In [None]:
#Wrap the PCA output in a dataframe with one named column per component
principal_Df = pd.DataFrame(
    princ_comp_data,
    columns=['Principal_Component_1', 'Principal_Component_2'],
)

In [None]:
#Look at first rows of the principal component dataframe
principal_Df.head()

In [None]:
#Share of the total variance captured by each of the fitted components
print(f'Explained variation per principal component: {pca_data.explained_variance_ratio_}')

In [None]:
#Scatter plot of components from PCA, one colour per tpp_pass value
#Create the figure once: an extra bare plt.figure() call would leave an empty figure behind
plt.figure(figsize=(10,10))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1',fontsize=20)
plt.ylabel('Principal Component - 2',fontsize=20)
plt.title("Principal Component Analysis",fontsize=20)
targets = ['Y', 'N']
colors = ['b', 'g']
for target, color in zip(targets,colors):
    #Positional boolean mask: principal_Df is built from a plain array and has a
    #fresh 0..n-1 index, so a Series mask carrying df's index would fail to align
    #(or misalign) whenever df does not have the default index.
    #Rows of principal_Df are in the same order as the rows of df.
    indicesToKeep = (df['tpp_pass'] == target).to_numpy()
    plt.scatter(principal_Df.loc[indicesToKeep, 'Principal_Component_1'],
                principal_Df.loc[indicesToKeep, 'Principal_Component_2'],
                c=color, s=50, label=target)

#Legend entries come from the label given to each scatter call, so they cannot
#drift out of step with the plotted groups
plt.legend(prop={'size': 15})