> # **Normal Distribution**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Normal Distribution
def pdf(x, mean=None, std=None):
    """Evaluate the normal (Gaussian) probability density function at ``x``.

    Parameters
    ----------
    x : array_like
        Points at which the density is evaluated. Lists and tuples are
        converted to a float array; other inputs are used as given.
    mean : float, optional
        Mean of the distribution. When omitted it is estimated from ``x``
        itself with ``np.mean(x)`` (the original behaviour of this function).
    std : float, optional
        Standard deviation of the distribution. When omitted it is estimated
        from ``x`` with ``np.std(x)`` (NumPy's default ``ddof=0``).

    Returns
    -------
    Density values, one per element of ``x``.

    Raises
    ------
    ValueError
        If the standard deviation is not strictly positive (this also covers
        a NaN estimate from empty input and a zero estimate from constant or
        single-element input), because the density is undefined there.
    """
    if isinstance(x, (list, tuple)):
        # Plain sequences do not support element-wise arithmetic on their own.
        x = np.asarray(x, dtype=float)
    if mean is None:
        mean = np.mean(x)
    if std is None:
        std = np.std(x)
    # `not std > 0` (rather than `std <= 0`) also rejects NaN.
    if not std > 0:
        raise ValueError('std must be a positive number, got %r' % (std,))
    y_out = 1/(std * np.sqrt(2 * np.pi))* np.exp(-(x-mean)**2/(2 * std**2))
    return y_out

# To generate an array of x
x = np.arange(-2,2,0.1)
y = pdf(x)

# Plotting the normal_curve/bell_curve or gaussian distribution
# The Matplotlib style name 'seaborn' was deprecated in Matplotlib 3.6 and is
# no longer accepted by newer releases, so apply seaborn's theme directly.
sns.set_theme()
fig, ax = plt.subplots(figsize=(5,5))
ax.plot(x,y,color='blue',linestyle='dashed')

# Overlay the individual evaluation points on the dashed curve.
ax.scatter(x,y,marker='o',s=25,color='red')
ax.set(title='Normal distribution (PDF)', xlabel='x', ylabel='Density')
# Render explicitly so the cell output is the figure, not an artist repr.
plt.show()

## Normal distribution and its tests

1. Import a dataset
2. Subset the dataset
3. Visual tests for normality
   1. Histogram
   2. Q-Q (quantile-quantile) normal plot
4. Statistical tests
   1. Shapiro-Wilk test
   2. D'Agostino's K^2 test
   3. Anderson-Darling test

In [None]:
# Load seaborn's 'titanic' example dataset and keep only the three columns
# used by the normality checks in the following cells.
df = (
    sns.load_dataset('titanic')
    .loc[:, ['sex','age','fare']]
)
# Preview the first rows of the subset.
df.head()

In [None]:
# Histogram test
# Each column gets its own axes: drawing all three on one Axes overlays a
# categorical variable and two numeric variables with unrelated scales,
# which makes none of the distributions readable.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, column in zip(axes, ['sex', 'age', 'fare']):
    sns.histplot(df[column], ax=ax)
    ax.set_title('Histogram of ' + column)
fig.tight_layout()
plt.show()

In [None]:
# qq-plot
# pip install statsmodels
from statsmodels.graphics.gofplots import qqplot

# q-q norm plot
# Remove missing ages first so NaNs cannot distort the sample quantiles
# (dropna() is a no-op when the column has no missing values).
# line='s' adds a standardized reference line: points that follow it closely
# indicate an approximately normal sample.
qqplot(df['age'].dropna(), line='s')
# Show the figure explicitly; leaving the returned Figure as the cell's last
# expression can render the plot twice in a notebook.
plt.show()

# Normality Tests
### There are many statistical tests that we can use to quantify whether a sample of data looks as though it was drawn from a Gaussian distribution.
### Each test makes different assumptions and considers different aspects of the data.
### We will look at 3 commonly used tests in this section that you can apply to your own data samples.
1. Shapiro-Wilk Test  (usually sufficient on its own)
2. D'Agostino's K^2 Test
3. Anderson-Darling Test
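### H0 (null hypothesis): the sample was drawn from a normal distribution
### p <= alpha : Reject H0, the data is not normal
### p >  alpha : Fail to reject H0, the data is consistent with a normal distribution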

## 1- Shapiro-Wilk Test

In [None]:
from scipy.stats import shapiro

# Missing values must be removed before testing: with NaNs in the sample the
# statistic and p-value come back as NaN, and `nan > alpha` is False, so the
# cell would print "not normal" regardless of the actual data.
age = df['age'].dropna()
# Significance level for the decision rule (p > alpha: fail to reject H0).
alpha = 0.05

stat,p = shapiro(age)
print('stat = ', stat)
print('p = ', p)
if p > alpha:
    print('The data is normal')
else:
    print('The data is not normal')


## 2- D'Agostino's K^2 Test

In [None]:
from scipy.stats import normaltest

# normaltest propagates NaNs by default, so missing values are dropped first;
# otherwise p is NaN, `nan > alpha` is False, and the cell always reports
# "not normal".
age = df['age'].dropna()
# Significance level for the decision rule (p > alpha: fail to reject H0).
alpha = 0.05

stat,p = normaltest(age)
print('stat = ', stat)
print('p = ', p)
if p > alpha:
    print('The data is normal')
else:
    print('The data is not normal')


## 3- Anderson-Darling Test

In [None]:
# NOTE(review): `S` is not used anywhere in this cell; it looks like an
# accidental editor auto-import - confirm nothing else needs it and remove.
from re import S
from scipy.stats import anderson

# Drop missing values first: a NaN statistic makes every `statistic < cv`
# comparison False, so each level would be reported as "not normal".
result = anderson(df['age'].dropna())
print('stat=%.3f'% (result.statistic))
# Each significance level (in percent) is paired with its critical value;
# a statistic below the critical value means H0 is not rejected at that level.
for s1, cv in zip(result.significance_level, result.critical_values):
    if result.statistic < cv:
        print('The data is normal at the %.1f%% level'% (s1))
    else:
        print('The data is not normal at the %.1f%% level'% (s1))