### NumPy (short form for Numerical Python) is the most fundamental package designed for scientific computing and data analysis. Most of the other packages such as pandas, statsmodels are built on top of it, and is an important package to know and learn about. At the heart of NumPy is a data structure called **ndarray**. ndarray is a basically a multi-dimensional array that is built specifically for the purpose of numerical data analysis. Python also has array capabilities, but they are more generic. <b>The advantage of using ndarray is that processing is extremely efficient and fast.</b> 


In [2]:
import numpy as np
my_list = [1,2,3]
np.array(my_list)

array([1, 2, 3])

## Arange
Return evenly spaced values within a given interval.

In [3]:
np.arange(0,10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

## zeros and ones

In [8]:
print(np.zeros(3))
np.zeros((5,5))

[0. 0. 0.]


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

## linspace

In [11]:
np.linspace(0,10,5)

array([ 0. ,  2.5,  5. ,  7.5, 10. ])

In [13]:
np.eye(3)   ## Matirx with all Zeror and 1 in digonal

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [15]:
np.diag(np.array([1,3,5,3,4,5]))        # Populate the disgonal elements only

array([[1, 0, 0, 0, 0, 0],
       [0, 3, 0, 0, 0, 0],
       [0, 0, 5, 0, 0, 0],
       [0, 0, 0, 3, 0, 0],
       [0, 0, 0, 0, 4, 0],
       [0, 0, 0, 0, 0, 5]])

## randn 
1.Return a sample (or samples) from the "standard normal" distribution. Unlike rand which is uniform:

In [20]:
print(np.random.randint(1,100))  # 1 Element between 1 and 100
print(np.random.randint(1,100,10))  # 10 Random Element between 1 and 100 values will change everytime you run.

1
[86 73 25 59 90 75 80 13 60 62]


## Arrange

In [25]:
arr = np.arange(5)  # This will create no in range from 0 till max -1 
ranarr = np.random.randint(0,50,5) # Random Number between 0 and 50 , 5 Numbers
print(arr)
print(ranarr)

[0 1 2 3 4]
[48 35 44 14 47]


## Max,Min,Argmax,Argmin

1. argmax , argmin is used for range list
1. max , min is used in random int max and min

In [47]:
arr = np.arange(10)  # This will create no in range from 0 till max -1 
ranarr = np.random.randint(0,50,10)
print("List Arr is ",arr)
print("List Ranarr is ",ranarr)

print("Min of arr :",arr.argmin()," Max of arr :",arr.argmax())
print("Min of Ranarr :",ranarr.min()," Max of Ranarr :",ranarr.max()) ## Please note in cose or random variable its max and not argmax 


List Arr is  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
List Ranarr is  [49 22 27 49  0 28 16  5 25 17]
Min of arr : 0  Max of arr : 29
Min of Ranarr : 0  Max of Ranarr : 49


## Shape
1. Shape is an attribute that arrays have (not a method):
1. You can get the dimension of matrix by counting the no of [in the Beggining and end ]

In [50]:
print(arr)
arr.shape
arr.reshape(2,5,3)  ## Reshape Works with x * y or Matrix should be divsible by new matrix x * y

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]


array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11],
        [12, 13, 14]],

       [[15, 16, 17],
        [18, 19, 20],
        [21, 22, 23],
        [24, 25, 26],
        [27, 28, 29]]])

## Indexing Slicing of Array
1. All the Indexing of an Array is same of List 
1. You can get index of Item of an aray as well 


In [57]:
#Get a value at an index
print(ranarr)
ranarr[ranarr.max()]  # Identify the error here 

[49 22 27 49  0 28 16  5 25 17]


IndexError: index 49 is out of bounds for axis 0 with size 10

In [60]:
print(arr)
arr[8]  # Identify the error here 

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]


8

In [61]:
#Get values in a range
arr[1:5]

array([1, 2, 3, 4])

In [71]:
## Answer to Question earlier there can be multiple method of doing 
## ** is to give value else it will return array[0]
print(ranarr)
print(ranarr.max())
i=np.where(ranarr.max())
print(*i)

[49 22 27 49  0 28 16  5 25 17]
49
[0]


In [67]:
list_numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
element = 3
list_numbers.index(element)

2

Orginal List  [19 16 29 24 45  6  8  1 42 11 48 10 10 23 26 46 37 28 10 28]
************************************************
Array Conversion of Above is :
************************************************
[[[19 16 29 24 45]
  [ 6  8  1 42 11]]

 [[48 10 10 23 26]
  [46 37 28 10 28]]]
Shape of Arraya is   : (2, 2, 5)
Datatype of Array is :  int32
20
[False  True False  True False  True  True False  True False  True  True
  True False  True  True False  True  True  True]
Orginal List  [19 16 29 24 45  6  8  1 42 11 48 10 10 23 26 46 37 28 10 28]
Even Numbers are : [16 24  6  8 42 48 10 10 26 46 28 10 28]
ODD Numbers are : [19 29 45  1 11 23 37]


## Slicing between Array

In [88]:
arr_2d = np.array(([1,3,5],[2,4,6],[3,6,9]))

In [90]:
print(arr_2d)
arr_2d[:2,1:]  ## Here we are going from 0 -2 (not 2 which row 1 and 2) column to end 

[[1 3 5]
 [2 4 6]
 [3 6 9]]


array([[3, 5],
       [4, 6]])

## Conditional Statement inside a array

In [96]:
print(data)
print(data[data %5 ==0])
print(data[data  >10]) ## Comparision Operator in Numpy is much used in Pandas

[19 16 29 24 45  6  8  1 42 11 48 10 10 23 26 46 37 28 10 28]
[45 10 10 10]
[19 16 29 24 45 42 11 48 23 26 46 37 28 28]


In [112]:
data = np.random.randint(0,50,20)
data2 = data.reshape(2,2,5)  # Shape must be divisible by X* Y of Matrix 
print("Orginal List ",data)
print ("************************************************")
print("Array Conversion of Above is :")
print ("************************************************")
print (data2)
print("Shape of Arraya is   :",data2.shape)
print("Datatype of Array is : ",data2.dtype)
print(data2.size)
even_data = (data % 2 == 0)
odd_data = (data % 2 != 0)
print (even_data)
print("Orginal List ",data)
print("Even Numbers are :",data[even_data])
print("ODD Numbers are :",data[odd_data])

Orginal List  [ 7  2 22 38 40 16 44  2  2  9 48 30 49 17  8 24  6 10 47 47]
************************************************
Array Conversion of Above is :
************************************************
[[[ 7  2 22 38 40]
  [16 44  2  2  9]]

 [[48 30 49 17  8]
  [24  6 10 47 47]]]
Shape of Arraya is   : (2, 2, 5)
Datatype of Array is :  int32
20
[False  True  True  True  True  True  True  True  True False  True  True
 False False  True  True  True  True False False]
Orginal List  [ 7  2 22 38 40 16 44  2  2  9 48 30 49 17  8 24  6 10 47 47]
Even Numbers are : [ 2 22 38 40 16 44  2  2 48 30  8 24  6 10]
ODD Numbers are : [ 7  9 49 17 47 47]


In [113]:
data3 = np.arange(30)
data3=data3.reshape(5,6)
print(data3)
## Finding 7 -19 rows and 2 -4 columns
data3[1:4,1:5]

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]]


array([[ 7,  8,  9, 10],
       [13, 14, 15, 16],
       [19, 20, 21, 22]])

### Pandas 

In [130]:
import pandas as pd 
from numpy.random import randn
np.random.seed(102)

In [138]:
np.random.randn(4,5)

array([[-0.06438783,  1.27497947,  0.91250325, -0.18641164,  0.05877293],
       [ 0.78948441,  0.97982507, -2.65083484, -0.30626969, -0.71972948],
       [ 0.11739439, -0.46244105, -0.94054334, -1.77898595,  0.52938724],
       [ 0.83135192, -0.25864548,  0.167274  , -0.59389158, -0.56307009]])

In [142]:
df = pd.DataFrame(np.random.randn(4,5),index='A B C D'.split(),columns='U W X Y Z'.split())

In [143]:
df

Unnamed: 0,U,W,X,Y,Z
A,1.315358,0.923098,0.164003,0.854101,-1.307599
B,-1.485952,0.870746,0.575038,0.246979,-0.601873
C,-0.465131,0.183339,0.82668,-1.362105,-0.216016
D,0.206353,-1.641117,-1.666657,0.034567,-0.329217


In [148]:
## Print One or More Rows in Data Frame.
print(df['W'])
print(df[['W','Z']])
## Selecting Row in Data Frame
df[3:]

A    0.923098
B    0.870746
C    0.183339
D   -1.641117
Name: W, dtype: float64
          W         Z
A  0.923098 -1.307599
B  0.870746 -0.601873
C  0.183339 -0.216016
D -1.641117 -0.329217


Unnamed: 0,U,W,X,Y,Z,new
D,0.206353,-1.641117,-1.666657,0.034567,-0.329217,-1.606549


In [145]:
type(df['W']) ## Every Column in Dataframe is just a series in Pandas

pandas.core.series.Series

## Adding Removing Columns in Dataframes

In [146]:
df['new'] = df['W'] + df['Y']
print(df)
df.drop('new',axis=1)

          U         W         X         Y         Z       new
A  1.315358  0.923098  0.164003  0.854101 -1.307599  1.777199
B -1.485952  0.870746  0.575038  0.246979 -0.601873  1.117725
C -0.465131  0.183339  0.826680 -1.362105 -0.216016 -1.178767
D  0.206353 -1.641117 -1.666657  0.034567 -0.329217 -1.606549


Unnamed: 0,U,W,X,Y,Z
A,1.315358,0.923098,0.164003,0.854101,-1.307599
B,-1.485952,0.870746,0.575038,0.246979,-0.601873
C,-0.465131,0.183339,0.82668,-1.362105,-0.216016
D,0.206353,-1.641117,-1.666657,0.034567,-0.329217


## Selecting Value from Index

1. loc works with Value of Index for exampl if D is index we search all columns of rows with index of D
2. iloc works with location for example 0 , 1 100 or so on . 

In [149]:
## Selecting Data Based Upon Index or Index Value 
print(df.loc['D'])
print(df.iloc[3])

U      0.206353
W     -1.641117
X     -1.666657
Y      0.034567
Z     -0.329217
new   -1.606549
Name: D, dtype: float64
U      0.206353
W     -1.641117
X     -1.666657
Y      0.034567
Z     -0.329217
new   -1.606549
Name: D, dtype: float64


## Conditional Select

In [158]:
print(df[df['W']>0][['Y','X']])
print(df[df >0])
print(df[df['W']<0][['Y','X']])  ## W >0 and Select Y and X 
print(df[(df['W']<0) & (df['Y'] < 1)]) # Using more than One Condition with and)
print(df[(df['W']<0) | (df['Y'] < 1)]) # Using Or Operator 

          Y         X
A  0.854101  0.164003
B  0.246979  0.575038
C -1.362105  0.826680
          U         W         X         Y   Z       new
A  1.315358  0.923098  0.164003  0.854101 NaN  1.777199
B       NaN  0.870746  0.575038  0.246979 NaN  1.117725
C       NaN  0.183339  0.826680       NaN NaN       NaN
D  0.206353       NaN       NaN  0.034567 NaN       NaN
          Y         X
D  0.034567 -1.666657
          U         W         X         Y         Z       new
D  0.206353 -1.641117 -1.666657  0.034567 -0.329217 -1.606549


## Dealing with Null Values 

In [160]:
df = pd.DataFrame({'A':[1,2,np.nan],
                  'B':[5,np.nan,np.nan],
                  'C':[1,2,3]})
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [166]:
print("*********************Orginal Data Fram *********************")
print(df)
print(df.dropna())## Drop all NA Values
print(df.dropna(axis=1)) ## This for Dropping Columns axis=0 is for rows
print(df.dropna(thresh=2)) ## Drop only if threshold is met i.e 2 Na values are there

*********************Orginal Data Fram *********************
     A    B  C
0  1.0  5.0  1
1  2.0  NaN  2
2  NaN  NaN  3
     A    B  C
0  1.0  5.0  1
   C
0  1
1  2
2  3
     A    B  C
0  1.0  5.0  1
1  2.0  NaN  2


## Filling Na Values

In [167]:
df.fillna(value='FILL VALUE')

Unnamed: 0,A,B,C
0,1,5,1
1,2,FILL VALUE,2
2,FILL VALUE,FILL VALUE,3


## Group By Function 

In [170]:
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]}
df1=pd.DataFrame(data)
df1

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [173]:
df1.groupby('Company') ## This will Create and Object for Group 
df2=df1.groupby('Company')

In [183]:
print(df1)
print(df2.mean()) 
print(df2.sum())
print(df2.count())
df2.describe() # Most Effective way of getting all 
df2.describe().transpose() ## All in One Command please aggregate will work only numeric fields

  Company   Person  Sales
0    GOOG      Sam    200
1    GOOG  Charlie    120
2    MSFT      Amy    340
3    MSFT  Vanessa    124
4      FB     Carl    243
5      FB    Sarah    350
         Sales
Company       
FB       296.5
GOOG     160.0
MSFT     232.0
         Sales
Company       
FB         593
GOOG       320
MSFT       464
         Person  Sales
Company               
FB            2      2
GOOG          2      2
MSFT          2      2


Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0


## Merge and Joins

1. Concatenation basically glues together DataFrames. Keep in mind that dimensions should match along the axis you are concatenating on. You can use pd.concat and pass in a list of DataFrames to concatenate together:
1. The merge function allows you to merge DataFrames together using a similar logic as merging SQL Tables together. For example:


In [184]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7]) 
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8, 9, 10, 11])

In [186]:
print(df1)
print(df2)
print(df3)

    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3
    A   B   C   D
4  A4  B4  C4  D4
5  A5  B5  C5  D5
6  A6  B6  C6  D6
7  A7  B7  C7  D7
      A    B    C    D
8    A8   B8   C8   D8
9    A9   B9   C9   D9
10  A10  B10  C10  D10
11  A11  B11  C11  D11


In [185]:
pd.concat([df1,df2,df3])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [188]:
pd.concat([df1,df2,df3],axis=1) # Here we are conact on Columns so it will created additional column filled with NA values

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


In [192]:
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})    
print(left)
print(right)

  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3


In [193]:
pd.merge(left,right,how='inner',on='key') # Here the Value of Key is the Same 

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


## Apply and Lambda Function 

In [198]:
df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [196]:
def times2(x):
    return x*2

In [201]:
df['col1'].apply(times2)
df['col2'].apply(times2)
df['col3'].apply(len)

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

## Importan Functions in Pandas

In [207]:
df[df['col3'].str.contains("e|i|o|u|a")]

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi


## String Function

In [None]:
df['col3'].str.split('@').str[-1].value_counts().head()