# Hypothesis Testing
**Does education level affect income or not?**

## Preview Data

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st

In [2]:
# Load the tab-separated marketing campaign file and preview its first rows
raw_df = pd.read_table('marketing_campaign.csv')
raw_df.head(5)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


In [3]:
# Keep only the two columns the analysis needs: education level and income
df = raw_df.loc[:, ['Education', 'Income']]
df.head(5)

Unnamed: 0,Education,Income
0,Graduation,58138.0
1,Graduation,46344.0
2,Graduation,71613.0
3,Graduation,26646.0
4,PhD,58293.0


In [9]:
# Spread check: variance of Income within each education level
df.groupby('Education').agg('var')

Unnamed: 0_level_0,Income
Education,Unnamed: 1_level_1
2n Cycle,489253800.0
Basic,38876060.0
Graduation,793954200.0
Master,406336400.0
PhD,424894900.0


In [10]:
# Group sizes: number of non-missing Income values per education level
df.groupby('Education')[['Income']].count()

Unnamed: 0_level_0,Income
Education,Unnamed: 1_level_1
2n Cycle,200
Basic,54
Graduation,1116
Master,365
PhD,481


## Descriptive Statistics

In [11]:
# Mean Income per education level
df.groupby('Education')[['Income']].mean()

Unnamed: 0_level_0,Income
Education,Unnamed: 1_level_1
2n Cycle,47633.19
Basic,20306.259259
Graduation,52720.373656
Master,52917.534247
PhD,56145.313929


In [12]:
# Median Income per education level
df.groupby('Education').agg('median')

Unnamed: 0_level_0,Income
Education,Unnamed: 1_level_1
2n Cycle,46805.0
Basic,20744.0
Graduation,52028.5
Master,50943.0
PhD,55212.0


In [13]:
# Most frequent Income value(s) per education level; a level with several
# equally frequent values shows all of them
df.groupby('Education').aggregate(pd.Series.mode)

Unnamed: 0_level_0,Income
Education,Unnamed: 1_level_1
2n Cycle,7500.0
Basic,"[7500.0, 20425.0, 22634.0, 24594.0, 24882.0, 2..."
Graduation,7500.0
Master,"[46098.0, 63841.0]"
PhD,35860.0


In [14]:
# Standard deviation of Income per education level
df.groupby('Education')[['Income']].std()

Unnamed: 0_level_0,Income
Education,Unnamed: 1_level_1
2n Cycle,22119.081838
Basic,6235.066773
Graduation,28177.192681
Master,20157.788029
PhD,20612.979997


## Hypothesis Testing

In [15]:
# Split the data into one sub-frame per education level
df_2n_Cycle = df.loc[df['Education'].eq('2n Cycle')]
df_Basic = df.loc[df['Education'].eq('Basic')]
df_Graduation = df.loc[df['Education'].eq('Graduation')]
df_Master = df.loc[df['Education'].eq('Master')]
df_PhD = df.loc[df['Education'].eq('PhD')]

In [16]:
# Drop rows whose Income is missing so that every sample handed to the
# ANOVA below is complete. dropna(subset=...) keeps exactly the rows the
# former `isna() == False` mask kept, without comparing against False.
df_2n_Cycle = df_2n_Cycle.dropna(subset=['Income'])
df_Basic = df_Basic.dropna(subset=['Income'])
df_Graduation = df_Graduation.dropna(subset=['Income'])
df_Master = df_Master.dropna(subset=['Income'])
df_PhD = df_PhD.dropna(subset=['Income'])

In [23]:
# Null (H_0) and alternative (H_1) hypotheses, kept as printable statements
H_0 = 'Variations in education level do not affect income.'
H_1 = 'Variations in education level affect income.'

In [24]:
# One-way ANOVA comparing Income across the five education groups.
# NOTE(review): f_oneway assumes roughly equal group variances; the variance
# table computed earlier shows Basic (~3.9e7) far below Graduation (~7.9e8),
# so consider confirming with a test that does not rely on that assumption.
anova = st.f_oneway(*(
    group['Income']
    for group in (df_2n_Cycle, df_Basic, df_Graduation, df_Master, df_PhD)
))

In [25]:
# Read the p-value off the ANOVA result and show it as the cell output
p_value = anova.pvalue
p_value

1.6677281316366318e-22

### Decision-making

In [26]:
# Significance level used for the decision rule
ALPHA = 0.05

# Report H_1 only on positive evidence (p <= ALPHA). For every real p-value
# this matches the previous `p_value > 0.05` rule, boundary included. The
# difference is a NaN p-value (test could not be computed): any comparison
# with NaN is False, so it now falls through to H_0 instead of being
# reported as a significant result.
if p_value <= ALPHA:
    print(H_1)
else:
    print(H_0)

Variations in education level affect income.


**So the level of education has a statistically significant effect on income.** Using the one-way ANOVA test, the p-value (≈ 1.67e-22) is far below the 0.05 significance level, so we reject H0 in favor of H1.