# Exploratory Data Analysis

* Author: 
* Date created: 
* Project: 
* Study: 
* Associated Files:

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats

### Load data file

In [None]:
# Ask for a file through google.colab's upload helper. The result is kept in
# `uploaded`, which the next cell indexes by file name to read the CSV.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the uploaded CSV into a DataFrame. The entry of `uploaded` is wrapped in
# io.BytesIO so pandas can read it like a file.
# 'Your File name.csv' is a placeholder: it has to match the uploaded file's name.
import io

df = pd.read_csv(io.BytesIO(uploaded['Your File name.csv']))

### Examine the dataset

In [None]:
# Preview the first rows of the dataset (head() returns 5 rows by default)
df.head()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max);
# with default arguments describe() summarises the numeric columns
df.describe()

In [None]:
# Show the data type pandas assigned to each column
df.dtypes

In [None]:
# Count the null values in each column (isnull() flags them, sum() adds up the flags per column)
df.isnull().sum()

If there are missing values:
* Drop the column if more than 25% of its values are missing and you can't impute them
* If imputation is an option:
  * Use the median
  * Use the mean
  * Impute a random value from within the range of values in the dataset

# Optional code block for imputation of null values with a random sample
## Function - Impute na

In [None]:
def impute_na(data, variable):
  """Fill the null values of one column with a random sample of its observed values.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame holding the column; it is not modified.
  variable : str
      Name of the column to impute.

  Returns
  -------
  pandas.Series
      Copy of ``data[variable]``, named ``variable + '_random'``, in which
      every null entry is replaced by a value drawn from the column's
      non-null entries.

  Raises
  ------
  ValueError
      If the column has null values but no observed value to sample from.
  """
  # Only the target column is copied; the caller's frame stays untouched.
  imputed = data[variable].copy().rename(variable + '_random')

  null_mask = imputed.isnull().to_numpy()
  n_missing = int(null_mask.sum())
  if n_missing == 0:
    return imputed

  observed = imputed.dropna()
  if observed.empty:
    raise ValueError(
      "Cannot impute '{}': the column has no non-null values to sample from".format(variable)
    )

  # Fixed seed keeps the imputation reproducible. Sampling is done without
  # replacement whenever enough observed values exist; replacement is only
  # switched on when there are more gaps than observed values, a case that
  # would otherwise make sample() raise.
  random_sample = observed.sample(
    n_missing, replace=n_missing > len(observed), random_state=0
  )

  # Assign by position rather than by index label so that frames with
  # duplicated index labels are filled correctly as well.
  imputed.loc[null_mask] = random_sample.to_numpy()

  return imputed

## Function - Diagnostic plots

In [None]:
def diagnostic_plots(df, variable):
  """Plot a histogram and a normal Q-Q plot side by side, then print skew and kurtosis.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the data; it is not modified.
  variable : str
      Name of the column to inspect.

  Returns
  -------
  None
      The figure is shown with ``plt.show()`` and the skew and kurtosis are
      printed below it.
  """
  # Null values are removed once so both panels describe the same
  # observations: Series.hist() skips them, but they would otherwise be
  # handed to stats.probplot as data.
  values = df[variable].dropna()

  _, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(15, 6))
  values.hist(ax=ax_hist)
  stats.probplot(values, dist="norm", plot=ax_qq)

  plt.show()

  # Print the skew and kurtosis of the distribution below the plots; change the
  # precision as needed. The builtin round() is used because it works whether
  # pandas hands back a NumPy scalar or a plain Python float.
  print('Skew is', round(values.skew(), 3), 'and kurtosis is', round(values.kurtosis(), 3))

## Run Diagnostic Plot function on all numeric variables

In [None]:
# 'Your variable' is a placeholder for a column name of df; the second argument
# is used as df[variable] inside diagnostic_plots, so it must be an existing column.
diagnostic_plots(df,'Your variable')

# Transformations for non-normal distributions

In [4]:
def _skew_kurtosis_label(series):
  # Build the axis caption reporting skew and kurtosis rounded to 3 decimals.
  return 'Skew is {} and Kurtosis is {}'.format(round(series.skew(), 3), round(series.kurtosis(), 3))


def transformation_plots(df, variable):
  """Add four transformed copies of a column to ``df`` and plot them for comparison.

  The columns ``variable + '_log'`` (log(x + 1)), ``variable + '_sqrt'``,
  ``variable + '_cbrt'`` and ``variable + '_boxcox'`` are written into ``df``
  in place. A 3x2 figure then shows the original histogram, a normal Q-Q plot
  of the original data and one histogram per transformation, each captioned
  with its skew and kurtosis.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the data; it is modified in place (four columns added).
  variable : str
      Name of the column to transform.

  Returns
  -------
  None

  Raises
  ------
  ValueError
      Propagated from ``stats.boxcox`` when it rejects the data. In that case
      no column has been added to ``df``.
  """
  original = df[variable]

  # Box-Cox is computed before any column is written so that a failure there
  # does not leave df with only part of the transformed columns.
  boxcox_values, boxcox_lambda = stats.boxcox(original)

  df[variable+'_log'] = np.log(original+1)
  df[variable+'_sqrt'] = original**(1/2)
  df[variable+'_cbrt'] = original**(1/3)
  df[variable+'_boxcox'] = boxcox_values

  fig, axes = plt.subplots(3, 2, figsize=(16,19))
  fig.suptitle('Transformation Plots for {}'.format(variable), fontsize=18, y=0.93)
  (ax_original, ax_qq), (ax_log, ax_sqrt), (ax_cbrt, ax_boxcox) = axes

  # (axes, column to plot, panel title) for every histogram panel
  histogram_panels = [
    (ax_original, variable, 'Original Data'),
    (ax_log, variable+'_log', 'Log Transform'),
    (ax_sqrt, variable+'_sqrt', 'Square Root Transform'),
    (ax_cbrt, variable+'_cbrt', 'Cube Root Transform'),
    (ax_boxcox, variable+'_boxcox',
     'Box Cox Transformation with lambda {}'.format(round(float(boxcox_lambda), 3))),
  ]
  for ax, column, title in histogram_panels:
    # Nulls are dropped before plotting so every panel treats them the same way.
    series = df[column].dropna()
    ax.hist(series)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(_skew_kurtosis_label(series), fontsize=12)

  # Q-Q plot of the untransformed data against a normal distribution
  stats.probplot(original.dropna(), dist="norm", plot=ax_qq)
  ax_qq.set_title('Q-Q Plot', fontsize=14)

  # Shown explicitly, as diagnostic_plots does, so the figure also appears
  # outside of an inline notebook backend.
  plt.show()

In [None]:
# 'Your variable' is a placeholder for a column name of df. The call adds the
# '_log', '_sqrt', '_cbrt' and '_boxcox' versions of that column to df.
transformation_plots(df, 'Your variable')

## Remove unnecessary transformations from dataframe

In [None]:
# List the columns of df, including the transformation columns added by transformation_plots
df.columns

In [None]:
# Put the names of the unnecessary transformation columns inside the square brackets.
# The transformed columns live in `df`, and `columns=` is needed because a bare
# list passed to drop() is read as row labels. drop() returns a new DataFrame,
# so `df` itself keeps every column.
df2_trans = df.drop(columns=[''])

In [None]:
# Look at the columns of the reduced dataframe again to make sure they are correct
df2_trans.columns

## Export transformed dataset

In [None]:
# index=False is important: otherwise the resulting csv gets an extra leading
# column holding the dataframe index values.
df2_trans.to_csv(r'your file name.csv', index=False, header=True)