In [1]:
#import the libraries
import pandas as pd
import numpy as np
from scipy import stats

# Super Market example
- Minimum Shopping amount required = $120

- Sample Mean = $130

- Sample size (n) = 80

- Sample Stdev = $40

Null Hypothesis - Average spending is not significantly higher than $130. Do not launch the program.

Alternate Hypothesis - The average spending is significantly higher than $130. We can go ahead and launch the loyalty program.

In [None]:
#t = (Sample Mean - Pop Mean) / Std Error
t=(130-120)/(40/np.sqrt(80))
t

2.23606797749979

In [None]:
#Cummulative Density Function
#The probability given as output is for the average spent of <=130
#Degrees of freedom (df) = Sample Size minus one = 80 - 1 = 79
stats.t.cdf(t,df=79)

0.9859156837229892

In [None]:
#To calculate the p-value of average spent of >=130
1-stats.t.cdf(t,df=79)

0.014084316277010789

Conclusion: Since we have just 0.01 chance of error, which is less than 0.05 significance level, for the average spend being >$130, the supermarket can go ahead with the launch of the loyalty program.

In this case, p-value < Significance level of 0.05

Hence Alternate Hypothesis is accepted

# Call Center example

Null Hypothesis : The average call duration is still at 4mins , & the variation in the sample is out of random error/chance

Alternate Hypothesis : The average call duration is significantly different than the assumed mean of 4 mins.

In [2]:
#Population_Mean = 4 mins
#Population_StDev= 3 mins
duration=pd.Series([3.7,4.1,3.5,4.2,3.9,4.1,4.2,3.8,3.7,4.6,3.7,4.6,4.0,4.2,3.8,4.4,5.3,6.1,7.2,6.5])

In [3]:
#Sample Size
len(duration)

20

In [4]:
#Sample Mean
duration.mean()

4.4799999999999995

In [5]:
#t=(Sample Mean - Population Mean)/(Std Dev / Sqrt(Sample Size))
t=(4.48 - 4)/(3/np.sqrt(20))
t

0.7155417527999334

In [9]:
LL = 4-t
UL = 4+t
print("Lower limit of the 95% Confidence Interval is", LL)
print("Upper limit of the 95% Confidence Interval is", UL)

Lower limit of the 95% Confidence Interval is 3.2844582472000665
Upper limit of the 95% Confidence Interval is 4.7155417527999335


In [6]:
p_value= 1 - stats.t.cdf(t, df=19)
p_value

0.24148792354598703

Null Hypothesis : Average Call Duration is 4 mins.

Alternate Hypothesis : Average Call Duration is not 4 mins. (Either significantly higher or significantly lower than 4 mins.)

Since the random chance probability (p-value) of call duration being greater than 4.48 mins is greater than 0.025, hence we shall conclude that the call duration has not significantly increased, and we fail to reject the Null Hypothesis, which says that Mean call duration is at 4mins.

The variation that we see in the sample, is merely due to random chance.

In [10]:
'''Assuming that a sample size of 50 calls, got an average of 5.1 mins per call,
Let's calculate the p-value'''
t= (5.1-4)/(3/np.sqrt(50))
print('t value is',t)
LL = 4-t
UL = 4+t
print("Lower limit of the 95% Confidence Interval is", LL)
print("Upper limit of the 95% Confidence Interval is", UL)

t value is 2.5927248643506733
Lower limit of the 95% Confidence Interval is 1.4072751356493267
Upper limit of the 95% Confidence Interval is 6.592724864350673


In [11]:
p=1-stats.t.cdf(t,df=49)
p

0.0062585271225307215

Since the p-value is lesser than 0.025, we shall conclude that Alternate Hypothesis will be accepted.

Average call duration has significantly increased, and some action must be taken.

Let's say, if I make 100 samples of 50 calls each, & in each of the 100 samples, I see that the mean duration is always greater than 4 mins, then I will be forced to believe that the mean has actually increased, and we must train our call centre executives.

But if in those 100 samples, some of them have mean of less than 4 mins, whereas others have greater than 4 mins, then we must not worry about the situation.

In [12]:
'''Testing the mean call duration when it is lesser than the pop mean,
take the direct probability, without subtracting from 1
'''
#Pop Mean = 4 mins
#Population Std Dev = 3 mins
#Sample Mean = 1.5 mins
#n = 20 calls

#Null Hypothesis - The average call duration has not decreased, any variation is due to random chance error.
#Alternate Hypothesis - The average call duration has decreased significantly.

t= (1.5-4)/(3/np.sqrt(20))
print(t)
stats.t.cdf(t,df=19)

#Since the p-value is less than 0.025, we shall conclude that average call duration has decreased significantly
#The call centre manager will have to check the process and take corrective actions.

-3.7267799624996494


0.0007148636846156863

In [None]:
#Testing the mean call duration when it is greater than the pop mean take probability subtracting from 1
t= (5.6-4)/(3/np.sqrt(50))
print(t)
1 - stats.t.cdf(t,df=49)

Since the p-value is lesser than 0.05, we shall conclude that the Alternate Hypothesis is true, and that the call duration has significantly increased.

In [None]:
#For a 2-tailed test, always compare the p-value calculated with half of the significance level
#So if we decide upon a significance of 5% = 0.05
0.05/2

Since the p-value>0.025 , we will conclude the Null Hypothesis to be true.

Assuming the significance level of 5%, we shall compare this p value with 0.025 instead of 0.05, since it is a 2 tailed test.

Assuming the significance level of 10%, we shall compare this p value with 0.05 instead of 0.10, since it is a 2 tailed test.

# One sample test

Is there evidence that the mean level of Salmonella in the ice cream is significantly greater than 0.3 MPN/g?

Null Hypothesis : The mean level of Salmonella in the ice-cream batches is not significantly higher than 0.3 MPN/g, therefore we must continue the production.

Alternate Hypothesis: The mean level of Salmonella in the ice-cream batches is significantly higher than 0.3 MPN/g, therefore we must stop the production.

In [14]:
from scipy import stats
import pandas as pd
data=pd.Series([0.593, 0.142, 0.329, 0.691, 0.231, 0.793, 0.519, 0.392, 0.418])

In [15]:
data.mean()

0.45644444444444443

In [17]:
#This gives one tail test p value
p=stats.ttest_1samp(data,0.3)
print(p)
p=stats.ttest_1samp(data,0.3)[1]
print(round(p,2))
#Since the p-value > 0.05, we may feel that the batch of ice-cream is safe,
#And that the amount of Salmonella is not significantly high.
#But we might need to test again since the p-value is too close to the significance level of 0.05

TtestResult(statistic=2.2050588385131595, pvalue=0.05853032968489765, df=8)
0.06


In [None]:
data.mean()

# Two Sample test

Null Hypothesis : Both the samples are similar

Alternate Hypothesis : Both samples are significantly different from each other

In [18]:
Control = pd.Series([91, 87, 99, 77, 88, 91])   #(Placebo)
Treat = pd.Series([101, 110, 103, 93, 99, 104]) #(Drug)

In [19]:
#T-Test of Independence - To understand difference between 2 different samples/groups
stats.ttest_ind(Control,Treat)

TtestResult(statistic=-3.4456126735364876, pvalue=0.006272124350809803, df=10.0)

In [20]:
stats.ttest_ind(Control,Treat)[1]

0.006272124350809803

Since the p-value<significance level of 5%, we shall conclude alternate hypothesis to be true.

In [None]:
Process_A=pd.Series([89.7,81.4,84.5,84.8,87.3,79.7,85.1,81.7,83.7,84.5])
Process_B=pd.Series([84.7,86.1,83.2,91.9,86.3,79.3,82.6,89.1,83.7,88.5])

In [None]:
print(Process_A.mean())
print(Process_B.mean())

84.24000000000001
85.54


In [None]:
stats.ttest_ind(Process_A,Process_B)

TtestResult(statistic=-0.8815771351054407, pvalue=0.38962608764018414, df=18.0)

Null Hypothesis: Change in Factor X does not help in increasing the yield.

Alternate Hypothesis : The change in factor X helps in increasing the yield significantly.

Since the p-value > 0.05, we shall conclude that the change in factor X does not increase the yield significantly.

# 2 Proportion Test

In [None]:
import numpy as np

In [None]:
'''Data:
Null Hypothesis - Both the populations are the same
Alternate Hypothesis - There is a statistically significant difference between the 2 populations.
'''
n1 = 247
p1 = .368
n2 = 308
p2 = .389

In [None]:
#population1 = np.random.binomial(1, p1, n1)
#population2 = np.random.binomial(1, p2, n2)

population1 = np.random.binomial(1, 0.368, 247)
population2 = np.random.binomial(1, 0.389, 308)

In [None]:
round(0.368*247)

In [None]:
round(0.389*308)

In [None]:
population1.mean()

In [None]:
population2.mean()

In [None]:
population1

In [None]:
pd.Series(population1).value_counts()

In [None]:
population2

In [None]:
pd.Series(population2).value_counts()

In [None]:
print(len(population1))
print(len(population2))

In [None]:
#import statsmodels.api as sm

In [None]:
#sm.stats.ttest_ind(population1, population2)

In [None]:
#Test of Independence
stats.ttest_ind(population1, population2)

Since the p-value is < 0.05, we reject the null, which means, both the populations are statistically significantly different from each other

In [None]:
students=pd.read_csv("https://userpage.fu-berlin.de/soga/200/2010_data_sets/students.csv")

In [None]:
https://brainly.in/question/18672587