<a href="https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch02_what_are_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modern statistics: Intuition, Math, Python, R
## Mike X Cohen (sincxpress.com)
#### https://www.amazon.com/dp/B0CQRGWGLY
#### Code for chapter 2 (What are data?)

---

# About this code file:

### This notebook will reproduce most of the figures in this chapter (some figures were made in Inkscape), and illustrate the statistical concepts explained in the text. The point of providing the code is not just for you to recreate the figures, but for you to modify, adapt, explore, and experiment with the code.

### Solutions to all exercises are at the bottom of the notebook.

#### This code was written in Google Colab. The notebook may require some modifications if you use a different IDE.

In [6]:
# import libraries and define global settings
import numpy as np
import matplotlib.pyplot as plt

# publication-style figure defaults, applied globally to every figure below
import matplotlib_inline.backend_inline

# have the inline backend render figures as SVG (vector format)
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# typography and export resolution
plt.rcParams['font.size'] = 14
plt.rcParams['savefig.dpi'] = 300

# titles are left-aligned by default
plt.rcParams['axes.titlelocation'] = 'left'

# drop the right and top spines so the axes have no bounding box
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False

# Figure 2.3: The "6" data, as an image and numbers

In [None]:
# download the MNIST dataset from OpenML
# (as_frame=False -> array output instead of a DataFrame; cache=False -> no local copy is kept)
from sklearn.datasets import fetch_openml
mnist = fetch_openml(
    name='mnist_784',
    as_frame=False,
    cache=False,
    parser='auto',
)

In [None]:
# two side-by-side panels: the digit as a picture (left) and as raw numbers (right)
_,axs = plt.subplots(nrows=1,ncols=2,figsize=(8,6))

# sample 18, reshaped to 28x28 and cropped to rows 2-23 and columns 8-20
I = mnist.data[18].reshape(28,28)[2:24,8:-7]

# left panel: the grayscale image
axs[0].imshow(I,cmap='gray')

# right panel: same array, but the color range [-1,0] saturates every
# value >= 0 to white, leaving a blank canvas for the text
# (presumably all pixel values are non-negative)
axs[1].imshow(I,cmap='gray',vmin=-1,vmax=0)

# neither panel needs axis decorations
for ax in axs:
  ax.axis('off')

# write each pixel's integer value at its (column,row) position
for (i,j),pixval in np.ndenumerate(I):
  axs[1].text(j,i,int(pixval),fontsize=8,ha='center',va='center')

plt.tight_layout()
plt.savefig('whatR_mnist.png')
plt.show()

# Figure 2.5: Margin figure with noisy data

In [None]:
# simulate a variable and two noisy copies of it
n = 30
x = np.random.randn(n)
y1 = x + np.random.randn(n)/10 # noise scaled down by a factor of 10
y2 = x + np.random.randn(n)    # unscaled noise

_,axs = plt.subplots(2,1,figsize=(2,4))

# one panel per dataset: first-order polynomial fit line underneath the data markers
for panel,yvals,panel_title in zip(axs,(y1,y2),('Less noise','More noise')):

  # fit a line to the data and evaluate it at the observed x values
  fit_coefs = np.polyfit(x,yvals,1)
  panel.plot(x,np.polyval(fit_coefs,x),color='gray')

  # the data: white squares with black outlines
  panel.plot(x,yvals,'ws',markeredgecolor='k')
  panel.set_title(panel_title,loc='center')

  # label the axes but hide the tick marks
  panel.set(xticks=[],xlabel='x',yticks=[],ylabel='y')

plt.tight_layout()
plt.savefig('whatR_noisyData.png')
plt.show()

# Figure 2.6: Margin figure with outlier

In [None]:
# a dozen random values, one of which is overwritten with a much larger value
X = np.random.randn(12)
outlier_idx = 6
X[outlier_idx] = 2*np.pi

fig,ax = plt.subplots(figsize=(4,2))

# all points in black, then the outlier redrawn on top with a gray face
ax.plot(X,'ko',markersize=10)
ax.plot(outlier_idx,X[outlier_idx],'ko',markersize=10,markerfacecolor=(.7,.7,.7))

# hide the tick marks and pad the y-axis range around the data
ax.set_xticks([])
ax.set_yticks([])
ax.set_ylim([X.min()-.6,X.max()+.6])
ax.set_xlabel('Data index')
ax.set_ylabel('Data value')

plt.tight_layout()
plt.savefig('whatR_outlier.png')
plt.show()