In [94]:
import pandas as pd 
import csv
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 

'''Stats'''
import scipy
from scipy import stats
from scipy.stats import ttest_ind

import random

In [2]:
df = pd.read_excel('D:\\Bank Data running case study_Advanced Hypothesis Testing.xlsx', sheet_name=0)

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,Random,Select
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,17752.0,Yes
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,6704.0,No
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,43303.0,Yes
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,30188.0,Yes
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,5018.0,No


In [5]:
#Converting target labels into numerical binary representaion
cat = {'Yes': 1, 'No': 0}
df['Select'] = df['Select'].map(cat)

## Inferential Statistics

### Question 1 
What is the probability that a Random customer will end up signing for a deposit?

In [111]:
p_yes = df['y'].value_counts()
p_yes = p_yes[1]

signed_customer = round(100 * (p_yes/len(df)), 2)
print('The probablity that a random customer will end up signing for a deposit is {} %'.format(signed_customer))

The probablity that a random customer will end up signing for a deposit is 11.7 %


### Question 2

What is the probability that a random call will last for less than 200 seconds?


In [110]:
calls_less_than = df[df['duration'] <= 200]
calls_less_than = round(100 * (len(calls_less_than)/len(df)),2)
print('The probability that a random call will last for less than 200 seconds is {} %'.format(calls_less_than))

The probability that a random call will last for less than 200 seconds is 55.01 %


### Question 3 

What is the probability that a random customer will have both a housing loan and a personal loan? 


In [109]:
#First, lets find out the percentage of people who 
personnel_loan = df['loan'].value_counts()
personnel_loan = round(100 * (personnel_loan[1]/len(df)), 2)
print('The probability that a random customer will have a personnel loan is {}%'.format(personnel_loan))

#Do the same with house loan
house_loan = df['housing'].value_counts()
house_loan = round(100 * (house_loan[0]/len(df)), 2)
print('The probability that a random customer will have a housing loan is {}%'.format(house_loan))

#The total number of people who have both of them
personnel_and_house_loan = round((house_loan * personnel_loan)/100, 2)
print('The total percentage for both types is {}%'.format(personnel_and_house_loan))

The probability that a random customer will have a personnel loan is 16.02%
The probability that a random customer will have a housing loan is 55.58%
The total percentage for both types is 8.9%


### Question 4

What is the probability that a random customer will have a personal loan given that he already has a housing loan? 

In [108]:
#So first we need to find out the total number of people who already have housing loans and than filter out those people 
#who have personnel loan within that data 

customers_with_housing_loan = df[df['housing'] == 'yes']
perssonel_loans_under_hloan = customers_with_housing_loan[customers_with_housing_loan['loan'] == 'yes']

conditional_prob_personnel_house = round(100 * (len(perssonel_loans_under_hloan)/len(customers_with_housing_loan)),2)
print('Conditional probablity that a random customer will have a personal loan given that he already has a housing loan is {}%'.format(conditional_prob_personnel_house))
                                         

Conditional probablity that a random customer will have a personal loan given that he already has a housing loan is 17.38%


### Question 5

What is the probability that a customer will sign up for the term deposit given that he has responded negatively to the last campaign? 


In [55]:
df['default'].value_counts()

no     44396
yes      815
Name: default, dtype: int64

In [52]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,Random,Select
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,17752.0,1
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,6704.0,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,43303.0,1
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,30188.0,1
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,5018.0,0


## Hypothesis Testing 

We would like to know if the effects we see in the sample(observed data) are likely to occur in the population. 

The way classical hypothesis testing works is by conducting a statistical test to answer the following question:
> Given the sample and an effect, what is the probability of seeing that effect just by chance?

Here are the steps on how we would do this

1. Compute test statistic
2. Define null hypothesis
3. Compute p-value
4. Interpret the result

If p-value is very low(most often than now, below 0.05), the effect is considered statistically significant. That means that effect is unlikely to have occured by chance. The inference? The effect is likely to be seen in the population too. 

This process is very similar to the *proof by contradiction* paradigm. We first assume that the effect is false. That's the null hypothesis. Next step is to compute the probability of obtaining that effect (the p-value). If p-value is very low(<0.05 as a rule of thumb), we reject the null hypothesis. 

### Question 6

Is an observed sample mean statistically significantly different from an  expected population mean? Lets take 
the age feature as an example for this test. 

So, we want to know weather the average age in the sample is either different from the population or not.

In [107]:
t , p = stats.normaltest(sample_age)
alpha = 0.05
print('The p-value is {} %'.format(p))

if p < alpha:
    print('The null hypothesis can be rejected and we conclude that the sample data has been taken from a normal distributed data')
else:
    print('The null hypothesis cannot be rejected and we conclude that the sample data has not been taken from a normal distributed data')
    
#The p value is less than alpha value which is generally 0.05%, therefore we can reject the null hypothesis and conclude 
#that the sample data has been taken from a normal distributed data.

The p-value is 0.004007570915584058 %
The null hypothesis can be rejected and we conclude that the sample data has been taken from a normal distributed data


In [89]:
#Picking a random sample from the population
#the sample size is 200
#If you give it a bigger sample size like 2000 or more than we will end up with the same mean,
sample_age = df['age'].sample(frac=1).tolist()
sample_age = random.sample(sample_age, 100)

#find out the mean of both population and sample
sample_age_mean = np.mean(sample_age)
population_age_mean = df['age'].mean()

print('The average age of sample is {} %'.format(sample_age_mean))
print('The average age of population is {} %'.format(population_age_mean))

The average age of sample is 41.34 %
The average age of population is 40.93621021432837 %


<b>Null Hypothesis (H0) : </b> Sample age is same as the from population

<b>Alternate Hypothesis (H1) : </b> Sample age is different from population

Level of significance: 5%

The next step should be to perform T-test and determine the P-Value

In [100]:
#Prob of seeing a sample average of 41.31% or greater from a population with an avg of 40.94
t, p = stats.ttest_1samp(sample_age, population_age_mean)
print("t = ", t, ", p = ", p)

t =  0.38372430782843464 , p =  0.702005646628867


p-value is the probability that the effective size was by chance. And here, p-value is almost 0.70.

Therefore,we fail to reject Null Hypothesis and conclude that <b> sample age is same as the population age </b> 