# Numpy数组和矢量计算

Numpy(Numerical Python的简称)是高性能科学计算和数据分析的基础包。部分功能如下：
   - ndarray,一个具有矢量运算和复杂广播能力的快速且节省空间的多维数组
   - 用于对数组数据进行快速运算的标准数学函数
   - 线性代数、随机数生成
   
   而对于数据分析而言，关注的功能主要集中在：常用的数组算法（排序、聚合以及转换等）

## 1） Numpy的ndarray:一种多维数组对象

### 创建ndarray

In [6]:
import numpy as np

# Build a 1-D array from a flat list.
data1 = [1, 2, 3, 4, 5]
arr1 = np.array(data1)
print(arr1)

# Build a 2-D array from a nested list (each inner list becomes one row).
data2 = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12],
]
arr2 = np.array(data2)
# Print, one per line: the array, its number of dimensions, its shape as
# (rows, columns), and the dtype of its elements.
print(arr2, arr2.ndim, arr2.shape, arr2.dtype, sep="\n")

print(np.zeros(10))        # 1-D array holding ten zeros
print(np.zeros((3, 5)))    # 2-D array of zeros: 3 rows x 5 columns
# np.empty only allocates the 2x3x2 (3-D) array; its contents are whatever
# happened to be in memory and are not guaranteed to be zero.
print(np.empty((2, 3, 2)))

# np.ones works like np.zeros but fills the array with 1.
# np.arange is the array-returning counterpart of the built-in range.
print(np.arange(15))

[1 2 3 4 5]
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
2
(4, 3)
int32
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[[ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]]
[[[ 0.  0.]
  [ 0.  0.]
  [ 0.  0.]]

 [[ 0.  0.]
  [ 0.  0.]
  [ 0.  0.]]]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]


### ndarray的数据类型
numpy中的数据类型：
-     int8,uint8,int16,uint16,int32,uint32,int64,uint64,float16,float32,float64,float128,bool,object,string_等

In [8]:
# Request the element type explicitly instead of letting NumPy infer it.
arr1 = np.array([1, 2, 3], dtype=np.float64)
arr2 = np.array([1, 2, 3], dtype=np.int32)
# Show each array's dtype on its own line.
print(arr1.dtype, arr2.dtype, sep="\n")

float64
int32


In [10]:
# Dtype conversion: integer -> floating point.
arr = np.array([1, 2, 3, 4, 5])
print(arr)
print(arr.dtype)
# astype returns a new array; `arr` itself keeps its integer dtype.
# np.float64 is spelled out because the `np.float` alias was deprecated in
# NumPy 1.20 and removed in 1.24 (it raised AttributeError afterwards).
float_arr = arr.astype(np.float64)
print(float_arr)
print(float_arr.dtype)

[1 2 3 4 5]
int32
[ 1.  2.  3.  4.  5.]
float64


In [13]:
# Dtype conversion: floating point -> integer.
arr = np.array([3.7, 1.2, 5.0, 5.0])
# Print the float array, then its int16 conversion; the fractional part is
# discarded rather than rounded (3.7 becomes 3).
print(arr, arr.astype(np.int16), sep="\n")

[ 3.7  1.2  5.   5. ]
[3 1 5 5]


In [14]:
# When every string in the array spells a number, astype can convert the
# array to a numeric dtype.
# np.bytes_ is used because the np.string_ alias was removed in NumPy 2.0;
# both names refer to the same fixed-width byte-string type.
numeric_strings = np.array(['1.5', '2', '35', '44.25'], dtype=np.bytes_)
print(numeric_strings.astype(float))

[  1.5    2.    35.    44.25]


### 数组和标量之间的运算

In [18]:
# Arithmetic between arrays, and between an array and a scalar, is applied
# element by element.
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(arr * arr)    # element-wise product of the array with itself
print(1 / arr)      # scalar divided by every element (float result)
print(arr * 5)      # every element multiplied by the scalar

[[ 1  4  9]
 [16 25 36]]
[[ 1.          0.5         0.33333333]
 [ 0.25        0.2         0.16666667]]
[[ 5 10 15]
 [20 25 30]]


### 基本的索引和切片

In [20]:
# Basic indexing and slicing on a 1-D array.
arr = np.arange(10)
print(arr)
print(arr[6])     # single element at index 6
print(arr[:3])    # slice holding the first three elements
arr[0] = 155      # item assignment modifies the array in place
print(arr)

[0 1 2 3 4 5 6 7 8 9]
6
[0 1 2]
[155   1   2   3   4   5   6   7   8   9]


In [23]:
# Indexing a 2-D array.
arr2d = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
print(arr2d)
print(arr2d[1])          # the row at index 1
print(arr2d[1][0])       # chained indexing: row 1, then its element 0
print(arr2d[:][1, 2])    # slice covering every row, then (row 1, column 2)

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[4 5 6]
4
6


### 数组转置和轴对换

In [27]:
# Reshape the integers 0..14 into 3 rows x 5 columns, then print the array
# followed by its transpose (5 rows x 3 columns).
arr1 = np.arange(15).reshape(3, 5)
print(arr1, arr1.T, sep="\n")

# Matrix product of the transpose (3x2) with the original (2x3): a 3x3 result.
arr2 = np.array([
    [1, 2, 3],
    [1, 2, 3],
])
print(arr2.T.dot(arr2))

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]
[[ 0  5 10]
 [ 1  6 11]
 [ 2  7 12]
 [ 3  8 13]
 [ 4  9 14]]
[[ 2  4  6]
 [ 4  8 12]
 [ 6 12 18]]


In [5]:
"""
Difference between rand and randn.

numpy.random.randn(d0, d1, ..., dn) returns one or more samples drawn from
the standard normal distribution.
numpy.random.rand(d0, d1, ..., dn) returns random samples lying in [0, 1).
"""
import numpy as np

# Alternative import form: from numpy.random import randn, rand

# Five standard-normal draws, keyed 0..4.
data = {key: np.random.randn() for key in range(5)}
print(data)

# Five draws from [0, 1), keyed 0..4; this rebinds `data`.
data = {key: np.random.rand() for key in range(5)}
print(data)

{0: -1.913068244905515, 1: 0.18465530938289765, 2: 0.5536824836516111, 3: -0.9844002294197887, 4: 0.4547512178268144}
{0: 0.1878539478041038, 1: 0.12540885445255578, 2: 0.36758518442152943, 3: 0.5805947098370959, 4: 0.544021642310213}


In [6]:
# Show the current working directory (IPython magic command).
%pwd

'G:\\github\\dataAnalyiseBook\\chapter4'

In [13]:
b = [1,2,3]
b?
# `b?` asks IPython for usage/help information about b; `??` returns more
# detailed information about a function.

In [19]:
# np.*load*?   # lists the names in np that contain "load"

In [3]:
import numpy as np
# 100x100 matrix of samples drawn with np.random.randn.
a = np.random.randn(100,100)
%time np.dot(a,a)   # total time taken by one execution of the statement

Wall time: 510 ms


array([[ -4.39199305, -16.06132641,  -6.81388896, ...,  -0.79891336,
         -5.73936725,  -1.63301216],
       [  5.53221255,  -4.20729112,   8.53277075, ..., -18.9793878 ,
         -1.27689202,   3.90205513],
       [  5.28982285,  -5.94705066,  21.2750335 , ...,  -0.48327212,
         -7.22612995,   2.04786143],
       ..., 
       [  6.43084178,  21.40960355,  13.00338552, ...,  -5.80430076,
         -3.53148021, -11.39944941],
       [  2.14207432,  -6.55984694,  21.07029032, ...,  -5.59399282,
          3.66479254,   4.5023155 ],
       [  7.66987406,  -3.44535068,  -7.75636457, ...,  -7.71485194,
        -14.68241974,  18.30309364]])

In [4]:
%timeit np.dot(a,a)  # per-loop time, measured over many repeated runs

The slowest run took 19.06 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 52 µs per loop
