# Numpy & Pandas Introduction

<font color='steelblue'>

<span style="font-family:Arial; font-size:1.6em;">
    <b>Pandas and Numpy Examples</b><br><br>
    A number of examples of using the NumPy and pandas libraries<br><br>
</span>
<span style="font-family:Arial; font-size:1.4em;">
    <b>The following examples are included in this notebook:</b>
    <ol>
        <li>Using Numpy Arrays</li>
        <li>Pandas Series Object</li>
        <li>Pandas Dataframe</li>
        <li>Writing Dataframe to .csv file</li>
        <li>Reading .csv file into dataframe</li>
        <li>Exploring data in dataframe</li>
        <li>Basic Statistics on dataframe</li>
        <li>Applying a function to a column in DataFrame</li>
    </ol>    
</span>

</font>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Run `python --version` in a shell (the leading `!` is the notebook's shell escape)
# NOTE(review): this reports the `python` the shell finds, which may not be
# the interpreter the kernel runs on — confirm
!python --version

Python 3.7.3


# Numpy Examples

### Array of Rank 1

In [3]:
# Create a rank-1 (one-dimensional) array from a Python list
arr1 = np.array([1, 2, 3])

# f-strings are used for every message so the cell has one formatting style
print(f"type: {type(arr1)}")

# The shape of a rank-1 array is a one-element tuple
print(f"shape: {arr1.shape}")

# Individual elements are read with zero-based integer indexes
print(f"Elements in array: {arr1[0]} {arr1[1]} {arr1[2]}")

# Arrays are mutable: overwrite the value at index 0 in place
arr1[0] = 4
print(arr1)

type: <class 'numpy.ndarray'>
shape: (3,)
Elements in array: 1 2 3
[4 2 3]


### Array of Rank 2

In [4]:
# Build a rank-2 array (2 rows x 3 columns) from a nested list
arr2 = np.array([[1, 2, 3], [4, 5, 6]])

# Report the (rows, columns) shape
print("shape: {}".format(arr2.shape))

# Unpacking a rank-2 array iterates over its rows
first_row, second_row = arr2

# Show a few individual elements: row 0 col 0, row 0 col 1, row 1 col 0
print("specific elements: {} {} {}".format(first_row[0], first_row[1], second_row[0]))

# Show each row on its own
print("Row 0: {} Row 1: {}".format(first_row, second_row))

# A bare last expression displays the array's repr as the cell output
arr2

shape: (2, 3)
specific elements: 1 2 4
Row 0: [1 2 3] Row 1: [4 5 6]


array([[1, 2, 3],
       [4, 5, 6]])

### Create Numpy Arrays

In [5]:
# 2x2 array initialised with zeros
a = np.zeros((2, 2))
print("Array with zeros: \n{}\n".format(a))

# 1x2 array of all ones
b = np.ones((1, 2))
print("Array with ones: \n{}\n".format(b))

# 2x2 array in which every element is the constant 7
c = np.full((2, 2), 7)
print("Array with constant values: \n{}\n".format(c))

# 2x2 array filled with random floats
d = np.random.random((2, 2))
print("Array with random number: \n{}\n".format(d))

# randint(high, size=n) draws n integers from 0 up to, but not including, high
e = np.random.randint(2, size=5)
print("Random ints 0 and 1: {}".format(e))

# high=5 yields the values 0..4, so the label says "0 to 4" rather than "0 and 1"
f = np.random.randint(5, size=10)
print("Random ints 0 to 4: {}".format(f))

Array with zeros: 
[[0. 0.]
 [0. 0.]]

Array with ones: 
[[1. 1.]]

Array with constant values: 
[[7 7]
 [7 7]]

Array with random number: 
[[0.32578392 0.64820331]
 [0.18684268 0.3302388 ]]

Random ints 0 and 1: [1 1 1 0 1]
Random ints 0 and 1: [1 0 1 4 1 3 0 4 0 4]


### Numpy Array Indexing

In [6]:
# 3x4 array holding the integers 1..12, filled row by row
a = np.arange(1, 13).reshape(3, 4)
print("shape of a: {}".format(a.shape))

# Note use of f strings in python
print(f"a: \n{a}\n")

# Slice out rows 0-1 and columns 1-2 (slice ends are excluded).
# The result has shape (2, 2):
# [[2 3]
#  [6 7]]
b = a[:2, 1:3]

print(f"b: \n{b}\n")

# A slice is a view onto the same data, not a copy, so writing through b
# also changes a. Show the value in a before the write...
print(f"row 0 col 1 value: {a[0, 1]}")

# ...then write through the view: b[0, 0] and a[0, 1] are the same element
b[0, 0] = 77
print(f"for a row 0 col 1 value: {a[0, 1]}")

shape of a: (3, 4)
a: 
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]

b: 
[[2 3]
 [6 7]]

row 0 col 1 value: 2
for a row 0 col 1 value: 77


# Pandas Examples

## Series object

In [7]:
# Build a Series from a list; no index is given, so the rows are labelled 0..4
series_obj = pd.Series([10,20,30,40,50])
series_obj

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [8]:
# Index access: look up the element labelled 0. With the default 0..n-1
# labels this is also the first element.
series_obj[0]

10

### Element-wise operations

In [9]:
# Sample ages used by the element-wise and boolean-selection examples
series_ages = pd.Series([31,22,43,44,55])
series_ages

0    31
1    22
2    43
3    44
4    55
dtype: int64

In [10]:
# Adding two Series adds them element by element
series_ages + series_ages

0     62
1     44
2     86
3     88
4    110
dtype: int64

In [11]:
# Multiplying by a scalar applies the multiplication to every element
series_ages * 2

0     62
1     44
2     86
3     88
4    110
dtype: int64

In [12]:
# Adding a scalar is likewise applied to every element
series_ages + 100

0    131
1    122
2    143
3    144
4    155
dtype: int64

### Boolean selection

In [13]:
# A comparison yields a boolean Series: True where the age is greater than 40
series_ages > 40

0    False
1    False
2     True
3     True
4     True
dtype: bool

In [14]:
# Boolean access: indexing with a boolean Series keeps only the rows where it
# is True; the kept rows retain their original index labels
series_ages[series_ages > 40]

2    43
3    44
4    55
dtype: int64

## DataFrame object

In [15]:
# Create a DataFrame from a dictionary: each key becomes a column name and
# each list supplies that column's values, one per row.
# The dictionary is named `people` rather than `data` because `data` is used
# for a DataFrame later in the notebook.
people = {
    "Name": ["Tim Miller", "Ann Carter", "Ellen Lee", "Sam Carr",
             "Al Ball", "Carl Zee", "Sara Martin"],
    "Gender": ["Male", "Female", "Female", "Male",
               "Male", "Male", "Female"],
    "Age": [32, 44, 21, 19, 45, 27, 39],
}
df = pd.DataFrame(people)

# A bare last expression is rendered as an HTML table; print(df) would show
# the same data as plain text instead
df

Unnamed: 0,Name,Gender,Age
0,Tim Miller,Male,32
1,Ann Carter,Female,44
2,Ellen Lee,Female,21
3,Sam Carr,Male,19
4,Al Ball,Male,45
5,Carl Zee,Male,27
6,Sara Martin,Female,39


In [16]:
# print() shows the DataFrame as plain text rather than as an HTML table
print(df)

          Name  Gender  Age
0   Tim Miller    Male   32
1   Ann Carter  Female   44
2    Ellen Lee  Female   21
3     Sam Carr    Male   19
4      Al Ball    Male   45
5     Carl Zee    Male   27
6  Sara Martin  Female   39


### Dataframe operations

In [17]:
# Show the first 5 rows (the number head() returns when called with no argument)
df.head()

Unnamed: 0,Name,Gender,Age
0,Tim Miller,Male,32
1,Ann Carter,Female,44
2,Ellen Lee,Female,21
3,Sam Carr,Male,19
4,Al Ball,Male,45


In [18]:
# Show the last 5 rows; the rows keep their original index labels
df.tail()

Unnamed: 0,Name,Gender,Age
2,Ellen Lee,Female,21
3,Sam Carr,Male,19
4,Al Ball,Male,45
5,Carl Zee,Male,27
6,Sara Martin,Female,39


In [19]:
# Selecting a single column by name returns it as a Series, not a DataFrame
df['Name']     # dictionary-style (bracket) notation

0     Tim Miller
1     Ann Carter
2      Ellen Lee
3       Sam Carr
4        Al Ball
5       Carl Zee
6    Sara Martin
Name: Name, dtype: object

In [20]:
# Same column as df['Name'], reached through attribute access
df.Name     # attribute notation; supports Tab completion

0     Tim Miller
1     Ann Carter
2      Ellen Lee
3       Sam Carr
4        Al Ball
5       Carl Zee
6    Sara Martin
Name: Name, dtype: object

In [21]:
# Assigning to a column name that does not exist yet adds the column.
# A scalar on the right-hand side is repeated for every row.
df["Birth Year"] = 1999
df

Unnamed: 0,Name,Gender,Age,Birth Year
0,Tim Miller,Male,32,1999
1,Ann Carter,Female,44,1999
2,Ellen Lee,Female,21,1999
3,Sam Carr,Male,19,1999
4,Al Ball,Male,45,1999
5,Carl Zee,Male,27,1999
6,Sara Martin,Female,39,1999


In [22]:
# Add a column from a list: one value per row, in row order.
# The list length must match the number of rows in the DataFrame (7 here).
df["Married"] = ['Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No']     
df

Unnamed: 0,Name,Gender,Age,Birth Year,Married
0,Tim Miller,Male,32,1999,Yes
1,Ann Carter,Female,44,1999,Yes
2,Ellen Lee,Female,21,1999,No
3,Sam Carr,Male,19,1999,No
4,Al Ball,Male,45,1999,Yes
5,Carl Zee,Male,27,1999,Yes
6,Sara Martin,Female,39,1999,No


## Selection and Filtering
### Column selection

In [23]:
# Build a 10x10 array of the integers 0..99 and confirm its type
x = np.arange(100).reshape(10,10)
print(type(x))

<class 'numpy.ndarray'>


In [24]:
# Display the contents of the 10x10 array x
x

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [25]:
# Build a 10x10 DataFrame holding 0..99 whose columns are labelled 'a' to 'j'
column_labels = list("abcdefghij")
data = pd.DataFrame(np.arange(100).reshape(10, 10), columns=column_labels)
data

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [26]:
# A single column label returns that column as a Series
data['a']

0     0
1    10
2    20
3    30
4    40
5    50
6    60
7    70
8    80
9    90
Name: a, dtype: int64

In [27]:
# Passing a list of labels selects several columns and returns a DataFrame
data[["a", "e", "j"]]

Unnamed: 0,a,e,j
0,0,4,9
1,10,14,19
2,20,24,29
3,30,34,39
4,40,44,49
5,50,54,59
6,60,64,69
7,70,74,79
8,80,84,89
9,90,94,99


In [28]:
# The columns come back in the order given in the list, so this also reorders them
data[["j", "e", "a"]]

Unnamed: 0,j,e,a
0,9,4,0
1,19,14,10
2,29,24,20
3,39,34,30
4,49,44,40
5,59,54,50
6,69,64,60
7,79,74,70
8,89,84,80
9,99,94,90


### Row selection

In [29]:
# Slice syntax inside [] selects rows; [:1] returns just the first row
data[:1]

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,0,1,2,3,4,5,6,7,8,9


In [30]:
# Rows 5 through 8: the end of the slice (9) is not included
data[5:9]

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89


In [31]:
# Comparing a column with a scalar yields a boolean Series, one entry per row
data["j"] > 40

0    False
1    False
2    False
3    False
4     True
5     True
6     True
7     True
8     True
9     True
Name: j, dtype: bool

In [32]:
# Boolean selection: keep every row whose value in column j is greater than 40
data[data["j"] > 40]

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


### Row and Column selection with loc
<b>Allows you to select a subset of the rows and columns using the label/name of the row/column</b>

In [33]:
# Display the full frame again as a reference for the loc examples
data

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [34]:
# loc selects by the label (name) of the row and column.
# Unlike an ordinary Python slice, a loc slice includes its end label,
# so :5 returns the rows labelled 0 through 5.
data.loc[:5, "b"]

0     1
1    11
2    21
3    31
4    41
5    51
Name: b, dtype: int64

In [35]:
# Consecutive labels on both axes: rows from label 6 to the end and columns
# 'a' through 'e' (the end label 'e' is included)
data.loc[6:, 'a':'e']

Unnamed: 0,a,b,c,d,e
6,60,61,62,63,64
7,70,71,72,73,74
8,80,81,82,83,84
9,90,91,92,93,94


In [36]:
# Non-consecutive columns: pass a list of labels; ':' keeps every row
data.loc[:, ['c', 'f', 'i']] 

Unnamed: 0,c,f,i
0,2,5,8
1,12,15,18
2,22,25,28
3,32,35,38
4,42,45,48
5,52,55,58
6,62,65,68
7,72,75,78
8,82,85,88
9,92,95,98


### Row and Column selection with iloc
<b>Allows you to select a subset of the rows and columns using the integer/index position of the row/column</b>

In [37]:
# iloc selects by integer position. Slice ends are excluded, as in ordinary
# Python slicing: rows 0-4 and the columns at positions 2-4 ('c', 'd', 'e').
data.iloc[:5, 2:5]

Unnamed: 0,c,d,e
0,2,3,4
1,12,13,14
2,22,23,24
3,32,33,34
4,42,43,44


In [38]:
# A single integer returns the row at that position with all of its columns,
# as a Series indexed by the column names
data.iloc[4]

a    40
b    41
c    42
d    43
e    44
f    45
g    46
h    47
i    48
j    49
Name: 4, dtype: int64

In [39]:
# Lists of positions are returned in the order listed:
# rows 5, 0 and 3, and the columns at positions 9, 5 and 0 ('j', 'f', 'a')
data.iloc[[5, 0, 3], [9, 5, 0]]  

Unnamed: 0,j,f,a
5,59,55,50
0,9,5,0
3,39,35,30


## Write out the dataframe to .csv file

In [40]:
# Save the DataFrame to a CSV file; the relative path is resolved against the
# current working directory.
# With default arguments to_csv also writes the row index as an unnamed first
# column; pass index=False to leave it out.
df.to_csv("new_filename.csv")

## Dataframe from a file

In [41]:
# Load the iris CSV into a DataFrame.
# The file has no header row, so the column names are supplied explicitly.
iris_columns = ["sepal_l", "sepal_w", "petal_l", "petal_w", "class"]
iris_data = pd.read_csv("../data/iris.csv", names=iris_columns)

In [42]:
# With no argument, head() shows the first 5 rows
iris_data.head()

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [43]:
# Dimensions of the frame as (number of rows, number of columns)
iris_data.shape

(150, 5)

In [44]:
# Show the last 10 rows
iris_data.tail(10)

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w,class
140,6.7,3.1,5.6,2.4,Iris-virginica
141,6.9,3.1,5.1,2.3,Iris-virginica
142,5.8,2.7,5.1,1.9,Iris-virginica
143,6.8,3.2,5.9,2.3,Iris-virginica
144,6.7,3.3,5.7,2.5,Iris-virginica
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [45]:
# List the distinct values found in the class column
iris_data['class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [46]:
# Get the rows at positions 0, 50 and 100 (all columns)
iris_data.iloc[[0,50,100]]

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w,class
0,5.1,3.5,1.4,0.2,Iris-setosa
50,7.0,3.2,4.7,1.4,Iris-versicolor
100,6.3,3.3,6.0,2.5,Iris-virginica


In [47]:
# Keep the rows whose sepal length is strictly below 5 or strictly above 7,
# and show only the sepal_l, petal_l and class columns
sepal_length = iris_data["sepal_l"]
outside_range = (sepal_length > 7) | (sepal_length < 5)
iris_data.loc[outside_range, ['sepal_l', 'petal_l', 'class']]

Unnamed: 0,sepal_l,petal_l,class
1,4.9,1.4,Iris-setosa
2,4.7,1.3,Iris-setosa
3,4.6,1.5,Iris-setosa
6,4.6,1.4,Iris-setosa
8,4.4,1.4,Iris-setosa
9,4.9,1.5,Iris-setosa
11,4.8,1.6,Iris-setosa
12,4.8,1.4,Iris-setosa
13,4.3,1.1,Iris-setosa
22,4.6,1.0,Iris-setosa


In [48]:
# Count how many rows carry each distinct value of the class column
iris_data['class'].value_counts()

Iris-versicolor    50
Iris-virginica     50
Iris-setosa        50
Name: class, dtype: int64

In [49]:
# Count the non-missing values in each column
iris_data.count()

sepal_l    150
sepal_w    150
petal_l    150
petal_w    150
class      150
dtype: int64

# Basic Statistics

In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the text column 'class' does not appear in the result
iris_data.describe()

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [51]:
# The same statistics transposed: one row per column, one column per statistic
iris_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal_l,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal_w,150.0,3.054,0.433594,2.0,2.8,3.0,3.3,4.4
petal_l,150.0,3.758667,1.76442,1.0,1.6,4.35,5.1,6.9
petal_w,150.0,1.198667,0.763161,0.1,0.3,1.3,1.8,2.5


## Correlation Coefficients
<span style="font-family:times, serif; font-size:14pt; font-weight:bold">
<ul>
<li>0:  two variables have no linear correlation</li>
<li>-1: two variables have a perfect negative correlation</li>
<li>1:  two variables have a perfect positive correlation</li>
</ul>
</span>

In [52]:
# Pairwise correlation coefficients between the numeric columns.
# iris_data also contains the text column 'class'. pandas 2.0 and later no
# longer drop non-numeric columns from corr() automatically and raise an error
# instead, so the numeric columns are selected explicitly first. On older
# pandas the result is unchanged.
iris_data.select_dtypes(include="number").corr()

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w
sepal_l,1.0,-0.109369,0.871754,0.817954
sepal_w,-0.109369,1.0,-0.420516,-0.356544
petal_l,0.871754,-0.420516,1.0,0.962757
petal_w,0.817954,-0.356544,0.962757,1.0


## Convert the flower types to numbers

In [53]:
# Give each iris class name an integer code, numbered from 0 in listing order
class_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
mapping = {name: code for code, name in enumerate(class_names)}
mapping

{'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}

In [54]:
# Add a TypesNum column by translating each class name through `mapping`.
# A class name that is not a key of the dict would become NaN.
iris_data['TypesNum'] = iris_data['class'].map(mapping)

In [55]:
# Check that the new TypesNum column now appears alongside the original columns
iris_data.head()

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w,class,TypesNum
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0


In [56]:
# List the distinct codes present in the new column
iris_data['TypesNum'].unique()

array([0, 1, 2])

In [57]:
# Count the rows per code; these should match the per-class counts of 50 each
iris_data['TypesNum'].value_counts()

2    50
1    50
0    50
Name: TypesNum, dtype: int64