In [1]:
import numpy as np

# Create a 4 x 4 array of ones
c = np.ones((4, 4))

# Element-wise (Hadamard) product: `*` multiplies matching entries, so
# every entry stays 1. This is NOT a cross product (np.cross does that).
print(c * c)

# Matrix product: each entry is a row-by-column dot product, so every
# entry sums four 1*1 terms and equals 4.
print(c @ c)

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
[[4. 4. 4. 4.]
 [4. 4. 4. 4.]
 [4. 4. 4. 4.]
 [4. 4. 4. 4.]]


# Stacking
Stacking merges two arrays. For 2-dimensional arrays, think of **vstack** as appending rows and **hstack** as appending columns.


In [2]:
# Two 2-D arrays that share the same column count (5) but differ in rows.
x = np.arange(10).reshape(2, 5)  # 2 rows holding 0..9
y = np.arange(15).reshape(3, 5)  # 3 rows holding 0..14
print(x, y, sep="\n")

[[0 1 2 3 4]
 [5 6 7 8 9]]
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]


In [3]:
# Vertical stack: the rows of y are appended below the rows of x.
new_array = np.vstack([x, y])
print(new_array)

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]


In [4]:
# Two 2-D arrays that share the same row count (3) but differ in columns.
x = np.arange(6).reshape(3, 2)  # 2 columns holding 0..5
y = np.arange(9).reshape(3, 3)  # 3 columns holding 0..8
print(x, y, sep="\n")

[[0 1]
 [2 3]
 [4 5]]
[[0 1 2]
 [3 4 5]
 [6 7 8]]


In [5]:
# Horizontal stack: the columns of y are appended to the right of x.
new_array = np.hstack([x, y])
print(new_array)

[[0 1 0 1 2]
 [2 3 3 4 5]
 [4 5 6 7 8]]


### Example of stacking
Let's say we have a dataset of 100 rows and 10 columns (features) and we want to draw statistics per column. We can use stacking to summarize that information.

To understand the skewness and kurtosis measures, see: https://codeburst.io/2-important-statistics-terms-you-need-to-know-in-data-science-skewness-and-kurtosis-388fef94eeaa


In [6]:
from scipy import stats
# Seeded generator so the sample, and therefore every statistic printed
# below, is identical on each Restart & Run All.
rng = np.random.default_rng(42)
x = rng.random((100, 10))  # 100 rows x 10 columns, uniform on [0, 1)

# scipy's describe() derives all the statistics, one value per column
n, min_max, mean, var, skew, kurt = stats.describe(x)

# Now stack them vertically: one row per statistic, one column per feature
new_array = np.vstack((mean, var, skew, kurt, min_max[0], min_max[1]))

# Transpose so each printed row is one column of x, laid out as
# [mean, variance, skewness, kurtosis, min, max]
print(new_array.T)

[[ 4.84483879e-01  9.93595520e-02  8.11534949e-02 -1.31087330e+00
   2.80647394e-03  9.99395310e-01]
 [ 5.14920465e-01  8.61108178e-02 -2.42225754e-02 -1.16463700e+00
   2.51078418e-04  9.97589687e-01]
 [ 5.32700611e-01  8.99330967e-02 -1.34360397e-01 -1.14509174e+00
   7.69218770e-03  9.94483291e-01]
 [ 4.49757209e-01  8.24285904e-02  1.71605052e-01 -1.05414232e+00
   5.58584143e-03  9.97929751e-01]
 [ 5.06043018e-01  9.24241159e-02 -8.93389324e-02 -1.46059087e+00
   5.29137519e-03  9.86582052e-01]
 [ 4.94356429e-01  7.56801847e-02 -2.01703815e-02 -1.11908931e+00
   1.70669330e-02  9.81645263e-01]
 [ 4.84602740e-01  8.62668071e-02  5.74149277e-02 -1.22089777e+00
   7.79204559e-03  9.97468079e-01]
 [ 4.98738663e-01  8.46005913e-02  5.40350603e-02 -1.27496020e+00
   7.94756146e-03  9.82150542e-01]
 [ 4.91139140e-01  8.19377771e-02 -6.23904556e-02 -1.19373654e+00
   7.57957868e-03  9.87025647e-01]
 [ 4.84261534e-01  9.77259401e-02  1.02150146e-01 -1.31233259e+00
   2.94906903e-03  9.9302

### Mask Array Elements

In [8]:
import numpy.ma as ma
x = np.arange(6)  # the integers 0 through 5
print(x)
print(x.mean())

# A truthy mask entry hides the element at that position; only the first
# element (the 0) is masked here.
hide_first = [True, False, False, False, False, False]
masked_array = ma.masked_array(x, mask=hide_first)

# Mean over the unmasked elements only: (1 + 2 + 3 + 4 + 5) / 5
masked_array.mean()

[0 1 2 3 4 5]
2.5


3.0

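Masking is useful for replacing NaN values: masked entries are excluded from computations such as the mean, so the resulting statistic can be used to fill the gaps.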

In [9]:
# 5 x 5 float grid holding 0.0 .. 24.0
x = np.arange(25, dtype=float).reshape(5, 5)

# Boolean indexing: every entry below 5 (the whole first row) becomes NaN
below_five = x < 5
x[below_five] = np.nan
x

array([[nan, nan, nan, nan, nan],
       [ 5.,  6.,  7.,  8.,  9.],
       [10., 11., 12., 13., 14.],
       [15., 16., 17., 18., 19.],
       [20., 21., 22., 23., 24.]])

In [11]:
# np.where(condition, value_if_true, value_if_false) works like Excel's IF formula
nan_mask = np.isnan(x)

# Column means computed with the NaN entries masked out, so only the
# non-NaN values contribute
column_means = ma.array(x, mask=nan_mask).mean(axis=0)

# Wherever x is NaN take that column's mean; elsewhere keep x unchanged
np.where(nan_mask, column_means, x)

array([[12.5, 13.5, 14.5, 15.5, 16.5],
       [ 5. ,  6. ,  7. ,  8. ,  9. ],
       [10. , 11. , 12. , 13. , 14. ],
       [15. , 16. , 17. , 18. , 19. ],
       [20. , 21. , 22. , 23. , 24. ]])