# Sampling Method

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

In [2]:
# Read the headerless, whitespace-delimited housing file from the UCI archive
# and attach explicit column names.
housing = pd.read_table(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
    header=None,
    # Raw string: in a plain literal '\s' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    sep=r'\s+',
    names=["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS",
           "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"],
)
# Show (rows, columns) as a quick sanity check of the load.
housing.shape

(506, 14)

In [3]:
# Preview only the first rows instead of rendering the whole frame.
housing.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311.0,15.2,395.60,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311.0,15.2,396.90,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311.0,15.2,386.71,17.10,18.9


In [4]:
from scipy.stats import skew

# Summary statistics with one row per column of `housing`, plus a `skew`
# column holding scipy's skewness of each column.
stats = housing.describe().T.assign(skew=skew(housing))
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,skew
CRIM,506.0,3.613524,8.601545,0.00632,0.082045,0.25651,3.677082,88.9762,5.207652
ZN,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0,2.219063
INDUS,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74,0.294146
CHAS,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0,3.395799
NOX,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871,0.727144
RM,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78,0.402415
AGE,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0,-0.597186
DIS,506.0,3.795043,2.10571,1.1296,2.100175,3.20745,5.188425,12.1265,1.008779
RAD,506.0,9.549407,8.707259,1.0,4.0,5.0,24.0,24.0,1.001833
TAX,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0,0.667968


## Taking Samples of Various Sizes and Comparing them to the Original

In [5]:
import random

# Seed numpy's global generator, which DataFrame.sample draws from when no
# random_state is given.
np.random.seed(12)

# Draw 5 rows and compare their column means with the full-data statistics.
sample1pct = housing.sample(n=5)
sample1pct_mean = sample1pct.mean()
sample1pct_diff = sample1pct_mean - stats['mean']

sample1pctstats = pd.DataFrame({
    'Sample Mean': sample1pct_mean,
    'Diff from Mean': sample1pct_diff,
    # Difference expressed in units of the full-data standard deviation.
    'Norm diff fr M': sample1pct_diff / stats['std'],
})
sample1pctstats

Unnamed: 0,Diff from Mean,Norm diff fr M,Sample Mean
CRIM,7.485732,0.870278,11.099256
ZN,-4.763636,-0.204251,6.6
INDUS,-1.182779,-0.172408,9.954
CHAS,0.13083,0.515091,0.2
NOX,0.021305,0.183857,0.576
RM,-0.043034,-0.061249,6.2416
AGE,4.505099,0.160046,73.08
DIS,-0.359323,-0.170642,3.43572
RAD,4.050593,0.465197,13.6
TAX,8.762846,0.051994,417.0


In [6]:
# Draw 25 rows and compare their column means with the full-data statistics.
sample5pct = housing.sample(n=25)
sample5pct_mean = sample5pct.mean()
sample5pct_diff = sample5pct_mean - stats['mean']

sample5pctstats = pd.DataFrame({
    'Sample Mean': sample5pct_mean,
    'Diff from Mean': sample5pct_diff,
    # Difference expressed in units of the full-data standard deviation.
    'Norm diff fr M': sample5pct_diff / stats['std'],
})
sample5pctstats

Unnamed: 0,Diff from Mean,Norm diff fr M,Sample Mean
CRIM,2.440832,0.283767,6.054355
ZN,-6.623636,-0.284003,4.74
INDUS,1.276421,0.186058,12.4132
CHAS,0.05083,0.200123,0.12
NOX,0.041665,0.35956,0.59636
RM,-0.245634,-0.349599,6.039
AGE,3.533099,0.125515,72.108
DIS,-0.472399,-0.224342,3.322644
RAD,1.250593,0.143626,10.8
TAX,22.882846,0.135773,431.12


In [7]:
# Draw 50 rows and compare their column means with the full-data statistics.
sample10pct = housing.sample(n=50)
sample10pct_mean = sample10pct.mean()
sample10pct_diff = sample10pct_mean - stats['mean']

sample10pctstats = pd.DataFrame({
    'Sample Mean': sample10pct_mean,
    'Diff from Mean': sample10pct_diff,
    # Difference expressed in units of the full-data standard deviation.
    'Norm diff fr M': sample10pct_diff / stats['std'],
})
sample10pctstats

Unnamed: 0,Diff from Mean,Norm diff fr M,Sample Mean
CRIM,0.430527,0.050052,4.04405
ZN,-3.613636,-0.154942,7.75
INDUS,-0.193179,-0.028159,10.9436
CHAS,0.03083,0.121381,0.1
NOX,-0.010455,-0.090225,0.54424
RM,0.007226,0.010284,6.29186
AGE,1.747099,0.062066,70.322
DIS,-0.081623,-0.038763,3.71342
RAD,1.030593,0.11836,10.58
TAX,12.262846,0.072761,420.5


In [8]:
# Draw 100 rows and compare their column means with the full-data statistics.
sample20pct = housing.sample(n=100)
sample20pct_mean = sample20pct.mean()
sample20pct_diff = sample20pct_mean - stats['mean']

sample20pctstats = pd.DataFrame({
    'Sample Mean': sample20pct_mean,
    'Diff from Mean': sample20pct_diff,
    # Difference expressed in units of the full-data standard deviation.
    'Norm diff fr M': sample20pct_diff / stats['std'],
})
sample20pctstats

Unnamed: 0,Diff from Mean,Norm diff fr M,Sample Mean
CRIM,0.198125,0.023034,3.811648
ZN,2.261364,0.096961,13.625
INDUS,-0.169579,-0.024719,10.9672
CHAS,0.01083,0.042639,0.08
NOX,-0.007264,-0.062687,0.547431
RM,0.001966,0.002798,6.2866
AGE,-4.691901,-0.166682,63.883
DIS,0.232004,0.110179,4.027047
RAD,0.580593,0.066679,10.13
TAX,7.172846,0.042559,415.41


In [9]:
# Draw 150 rows and compare their column means with the full-data statistics.
sample30pct = housing.sample(n=150)
sample30pct_mean = sample30pct.mean()
sample30pct_diff = sample30pct_mean - stats['mean']

sample30pctstats = pd.DataFrame({
    'Sample Mean': sample30pct_mean,
    'Diff from Mean': sample30pct_diff,
    # Difference expressed in units of the full-data standard deviation.
    'Norm diff fr M': sample30pct_diff / stats['std'],
})
sample30pctstats

Unnamed: 0,Diff from Mean,Norm diff fr M,Sample Mean
CRIM,-0.353152,-0.041057,3.260372
ZN,1.939697,0.083169,13.303333
INDUS,-0.147979,-0.02157,10.9888
CHAS,0.024163,0.095134,0.093333
NOX,-0.012022,-0.103751,0.542673
RM,-0.011568,-0.016464,6.273067
AGE,-1.681568,-0.059738,66.893333
DIS,0.233608,0.11094,4.028651
RAD,-0.096074,-0.011034,9.453333
TAX,-0.743821,-0.004413,407.493333


## Taking a Sample of 5 Multiple Times
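Averaging the means of many small samples gives values that approximate the mean of the full data set: by the law of large numbers the average of the sample means converges to the true mean, and by the Central Limit Theorem the sample means are approximately normally distributed around it.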

In [10]:
# Column means of 10 separate 5-row samples, stacked into a 2-D array
# (one row per sample).
sample_means = np.array([housing.sample(n=5).mean() for _ in range(10)])

# Deviation of the averaged sample means from the full-data mean, in units of
# the full-data standard deviation.
(sample_means.mean(axis=0) - stats['mean']) / stats['std']

CRIM       0.107380
ZN        -0.229120
INDUS      0.318237
CHAS       0.042639
NOX        0.213371
RM         0.156651
AGE        0.072440
DIS       -0.186401
RAD        0.189565
TAX        0.204838
PTRATIO    0.131397
B         -0.147089
LSTAT     -0.052942
MEDV       0.129083
dtype: float64

In [11]:
# Column means of 50 separate 5-row samples, stacked into a 2-D array
# (one row per sample).
sample_means = np.array([housing.sample(n=5).mean() for _ in range(50)])

# Deviation of the averaged sample means from the full-data mean, in units of
# the full-data standard deviation.
(sample_means.mean(axis=0) - stats['mean']) / stats['std']

CRIM       0.008109
ZN         0.013565
INDUS     -0.032217
CHAS      -0.036103
NOX       -0.112531
RM        -0.144486
AGE       -0.070074
DIS        0.090802
RAD       -0.040128
TAX       -0.068906
PTRATIO    0.029038
B          0.112598
LSTAT     -0.004188
MEDV      -0.003219
dtype: float64

In [12]:
# Column means of 100 separate 5-row samples, stacked into a 2-D array
# (one row per sample).
sample_means = np.array([housing.sample(n=5).mean() for _ in range(100)])

# Deviation of the averaged sample means from the full-data mean, in units of
# the full-data standard deviation.
(sample_means.mean(axis=0) - stats['mean']) / stats['std']

CRIM      -0.029921
ZN         0.034317
INDUS     -0.009233
CHAS      -0.028229
NOX        0.008044
RM         0.056870
AGE       -0.037561
DIS        0.013912
RAD       -0.042195
TAX        0.005238
PTRATIO   -0.072766
B         -0.016299
LSTAT     -0.084332
MEDV       0.034945
dtype: float64

## Discussion
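To describe this data set, I would either take a single sample of 20-30% of the rows or take a 1% sample (5 rows) 50-100 times and average the sample means. As you can see, taking the small sample repeatedly actually gets us closer to the mean of the original data, although 50-100 draws of 5 rows use 250-500 rows in total, which is more than the 100-150 rows in a single 20-30% sample. Both of these approaches are sufficient to describe the dataset.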