In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import IPython as ip

In [2]:
# Notebook-wide plotting defaults.
mpl.style.use('ggplot')
mpl.rc('figure', figsize=(8.0, 5.5))
# NOTE(review): this font has to be installed on the machine running the
# notebook -- confirm on the target environment.
mpl.rc('font', family='Noto Sans CJK TC')
# Render inline figures as SVG. `IPython.display.set_matplotlib_formats` is
# deprecated since IPython 7.23, so set the InlineBackend option instead.
# This is the plain-Python spelling of `%config InlineBackend.figure_format = 'svg'`.
ip.get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'svg'")

In [3]:
# Print the dataset's citation (SOURCE) followed by its description of the
# variables (NOTE), so the column codes used below can be looked up here.
print(sm.datasets.fair.SOURCE,
      sm.datasets.fair.NOTE)


Fair, Ray. 1978. "A Theory of Extramarital Affairs," `Journal of Political
Economy`, February, 45-61.

The data is available at http://fairmodel.econ.yale.edu/rayfair/pdf/2011b.htm
 ::

    Number of observations: 6366
    Number of variables: 9
    Variable name definitions:

        rate_marriage   : How rate marriage, 1 = very poor, 2 = poor, 3 = fair,
                        4 = good, 5 = very good
        age             : Age
        yrs_married     : No. years married. Interval approximations. See
                        original paper for detailed explanation.
        children        : No. children
        religious       : How relgious, 1 = not, 2 = mildly, 3 = fairly,
                        4 = strongly
        educ            : Level of education, 9 = grade school, 12 = high
                        school, 14 = some college, 16 = college graduate,
                        17 = some graduate school, 20 = advanced degree
        occupation      : 1 = student, 2 = farming, agr

In [4]:
# Load the Fair dataset through statsmodels' pandas loader and keep its
# `.data` attribute (a pandas DataFrame) as `df_fair` for the cells below.
df_fair = sm.datasets.fair.load_pandas().data

In [5]:
df = df_fair

# Split respondents by marriage rating, keeping only the `affairs` column.
df_a = df.loc[df.rate_marriage <= 2, ['affairs']]  # rated poor or worse
df_b = df.loc[df.rate_marriage > 2, ['affairs']]   # rated fair or better

# Summary statistics of both groups, one row per group.
display(df_a.describe().T, df_b.describe().T)

# equal_var=False selects Welch's t-test (unequal variances);
# the default, equal_var=True, would be Student's t-test.
print(
    'p-value:',
    sp.stats.ttest_ind(df_a['affairs'], df_b['affairs'], equal_var=False).pvalue,
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
affairs,447.0,1.524038,3.015937,0.0,0.0,0.532609,1.507691,26.87999


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
affairs,5919.0,0.643549,2.116982,0.0,0.0,0.0,0.347826,57.599991


p-value: 2.7446844166802127e-09


In [6]:
df = df_fair

# Compare two occupation codes, keeping only the `affairs` column.
df_a = df.loc[df.occupation == 2, ['affairs']]  # farming-like
df_b = df.loc[df.occupation == 3, ['affairs']]  # white-collar

# Summary statistics of both groups, one row per group.
display(df_a.describe().T, df_b.describe().T)

# Welch's t-test (equal_var=False): does not assume equal group variances.
print(
    'p-value:',
    sp.stats.ttest_ind(df_a['affairs'], df_b['affairs'], equal_var=False).pvalue,
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
affairs,859.0,0.719556,2.375644,0.0,0.0,0.0,0.212121,26.87999


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
affairs,2783.0,0.755248,2.305594,0.0,0.0,0.0,0.583333,57.599991


p-value: 0.698381462473247
