In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a mixed-type grocery list: quantities for some items, Yes/No flags for others.
# The dictionary keys become the index labels, in insertion order.
groceries = pd.Series({'eggs': 30, 'apples': 6, 'milk': 'Yes', 'bread': 'No'})

groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object

In [3]:
# Print the structural attributes of the Series: shape, ndim, and size.
# A Series is one-dimensional, so shape is a 1-tuple and size is the number of elements.

print('Groceries has shape:', groceries.shape)
print('Groceries has dimension:', groceries.ndim)
print('Groceries has a total of', groceries.size, 'elements')

Groceries has shape: (4,)
Groceries has dimension: 1
Groceries has a total of 4 elements


In [4]:
# Print the underlying data (.values) and the index labels (.index) of the Series

print('The data in Groceries is:', groceries.values)
print('The index of Groceries is:', groceries.index)

The data in Groceries is: [30 6 'Yes' 'No']
The index of Groceries is: Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')


In [5]:
# The `in` operator on a Series tests membership among the index labels
# (not among the stored values), so it answers "is this item on the list?"
x = 'bananas' in groceries
y = 'bread' in groceries

print('Is bananas an index label in Groceries:', x)
print('Is bread an index label in Groceries:', y)

Is bananas an index label in Groceries: False
Is bread an index label in Groceries: True


## Accessing and Deleting Elements in Pandas Series

In [6]:
# use a single index label

print('How many eggs do we need to buy:', groceries['eggs'])
print()

# access multiple index labels

print('Do we need milk and bread: \n', groceries[['milk','bread']])
print()

# use loc to access multiple index labels

print('How many eggs and apples do we need to buy:\n', groceries.loc[['eggs','apples']])
print()

# Positional (numerical) access must go through .iloc: using plain [] with
# integers on a label-indexed Series is deprecated since pandas 2.1 and is
# treated as a label lookup (KeyError here) in newer versions.

# multiple numerical indices
print('How many eggs and apples do we need to buy:\n', groceries.iloc[[0, 1]])
print()

# use a negative numerical index
print('Do we need bread:\n', groceries.iloc[[-1]])
print()

# use a single numerical index
print('How many eggs do we need to buy:', groceries.iloc[0])
print()

# use iloc to access multiple numerical indices
print('Do we need milk and bread:\n', groceries.iloc[[2, 3]])


How many eggs do we need to buy: 30

Do we need milk and bread: 
 milk     Yes
bread     No
dtype: object

How many eggs and apples do we need to buy:
 eggs      30
apples     6
dtype: object

How many eggs and apples do we need to buy:
 eggs      30
apples     6
dtype: object

Do we need bread:
 bread    No
dtype: object

How many eggs do we need to buy: 30

Do we need milk and bread:
 milk     Yes
bread     No
dtype: object


In [7]:
# Mutate elements using index labels

# show the grocery list before the change
print('Original Grocery List:\n', groceries)

# set the number of eggs to 2 through label-based assignment
groceries.loc['eggs'] = 2

# show the grocery list after the change
print()
print('Modified Grocery List:\n', groceries)

Original Grocery List:
 eggs       30
apples      6
milk      Yes
bread      No
dtype: object

Modified Grocery List:
 eggs        2
apples      6
milk      Yes
bread      No
dtype: object


In [8]:
# delete elements out of place using drop()
# drop() returns a new Series without the given label; the original Series
# is left untouched, as the last print below shows.

print('Original Grocery List:\n', groceries)

print('We remove apples (out of place):\n', groceries.drop('apples'))

print('Grocery List after removing apples out of place:\n', groceries)

Original Grocery List:
 eggs        2
apples      6
milk      Yes
bread      No
dtype: object
We remove apples (out of place):
 eggs       2
milk     Yes
bread     No
dtype: object
Grocery List after removing apples out of place:
 eggs        2
apples      6
milk      Yes
bread      No
dtype: object


In [9]:
# display the original grocery list
print('Original Grocery List:\n', groceries)

# remove apples from our grocery list in place by setting the inplace keyword to True
groceries.drop('apples', inplace = True)

# when elements are removed in place, the original Series itself is modified
# NOTE(review): this cell is not idempotent -- on a re-run 'apples' is already gone,
# and drop() presumably raises KeyError for the missing label (default errors='raise')

print('Grocery List after removing apples in place:\n', groceries)

Original Grocery List:
 eggs        2
apples      6
milk      Yes
bread      No
dtype: object
Grocery List after removing apples in place:
 eggs       2
milk     Yes
bread     No
dtype: object


## Arithmetic Operations

In [10]:
# An all-integer Series of fruit quantities; the dictionary keys become the index labels
fruits = pd.Series({'apples': 10, 'oranges': 6, 'bananas': 3})

fruits

apples     10
oranges     6
bananas     3
dtype: int64

In [11]:
# Element-wise basic arithmetic operations
# Each operator is applied to every element; division yields a float64 Series
# (see output), the other three keep the integer dtype.

print('Original grocery list of fruits:\n', fruits)

print()
print('fruits + 2:\n', fruits + 2) # add 2 to each item in fruits
print()
print('fruits - 2:\n', fruits - 2) # subtract 2 from each item in fruits
print()
print('fruits * 2 :\n', fruits * 2) # multiply each item in fruits by 2
print()
print('fruits / 2 :\n', fruits / 2) # divide each item in fruits by 2
print()

Original grocery list of fruits:
 apples     10
oranges     6
bananas     3
dtype: int64

fruits + 2:
 apples     12
oranges     8
bananas     5
dtype: int64

fruits - 2:
 apples     8
oranges    4
bananas    1
dtype: int64

fruits * 2 :
 apples     20
oranges    12
bananas     6
dtype: int64

fruits / 2 :
 apples     5.0
oranges    3.0
bananas    1.5
dtype: float64



In [12]:
# use mathematical functions from NumPy to operate on Series
# (numpy is imported as np in the import cell at the top of the notebook,
# so every dependency is visible in one place)

print('Original grocery list of fruits:\n', fruits)

# We apply different mathematical functions to all elements of fruits
print()
print('EXP(X) = \n', np.exp(fruits))
print()
print('SQRT(X) =\n', np.sqrt(fruits))
print()
print('POW(X,2) =\n',np.power(fruits,2)) # We raise all elements of fruits to the power of 2

Original grocery list of fruits:
 apples     10
oranges     6
bananas     3
dtype: int64

EXP(X) = 
 apples     22026.465795
oranges      403.428793
bananas       20.085537
dtype: float64

SQRT(X) =
 apples     3.162278
oranges    2.449490
bananas    1.732051
dtype: float64

POW(X,2) =
 apples     100
oranges     36
bananas      9
dtype: int64


In [13]:
# perform arithmetic operations on selected elements
# Selecting a single element gives a scalar; selecting a list of labels gives a
# sub-Series, and the arithmetic is then applied to each selected element.

print('Original grocery list of fruits:\n', fruits)
print()

# select by label, then add
print('Amount of bananas + 2 =', fruits['bananas'] + 2)
print()

# select by position (apples is the first element), then subtract
print('Amount of apples - 2 = ', fruits.iloc[0] - 2)
print()

# multiply apples and oranges by 2
print('We double the amount of apples and oranges:\n', fruits[['apples','oranges']] * 2)
print()

# divide apples and oranges by 2
print('We half the amount of apples and oranges:\n', fruits.loc[['apples', 'oranges']] / 2)

Original grocery list of fruits:
 apples     10
oranges     6
bananas     3
dtype: int64

Amount of bananas + 2 = 5

Amount of apples - 2 =  8

We double the amount of apples and oranges:
 apples     20
oranges    12
dtype: int64

We half the amount of apples and oranges:
 apples     5.0
oranges    3.0
dtype: float64


In [14]:
# perform multiplication on a Series having integer and string elements
# With mixed (object) data the operation is applied to each element according to
# its own type: the integer is doubled, while the strings are repeated twice
# ('Yes' -> 'YesYes'), as the output shows.

groceries * 2

eggs          4
milk     YesYes
bread      NoNo
dtype: object

## Creating Pandas Dataframes

In [15]:
# Create a DataFrame manually
# (pandas is already imported as pd in the first cell, so it is not re-imported here)

# create a dictionary of Pandas Series: one Series per person, indexed by item name
items = {
    'Bob': pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
    'Alice': pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants']),
}

print(type(items))

<class 'dict'>


In [16]:
# create a DataFrame using a dictionary of Series
# The dictionary keys become the column labels. The row labels are the union of
# the index labels of both Series; an item a person does not have shows up as NaN.

shopping_carts = pd.DataFrame(items)
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [17]:
# create a dictionary of Pandas Series without indexes
# (each Series falls back to the default 0, 1, 2, ... integer index, which
# then becomes the row index of the DataFrame)

data = dict(
    Bob = pd.Series([245, 25, 55]),
    Alice = pd.Series([40, 110, 500, 45]),
)

df = pd.DataFrame(data = data)

df

Unnamed: 0,Bob,Alice
0,245.0,40
1,25.0,110
2,55.0,500
3,,45


In [18]:
# print some information about shopping_carts:
# its shape (rows, columns), number of dimensions, total element count,
# the raw data array, and the row and column index objects

print('shopping_carts has shape:', shopping_carts.shape)
print('shopping_carts has dimension:', shopping_carts.ndim)
print('shopping_carts has a total of:', shopping_carts.size, 'elements')
print()
print('The data in shopping_carts is:\n', shopping_carts.values)
print()
print('The row index in shopping_carts is:', shopping_carts.index)
print()
print('The column index in shopping_carts is:', shopping_carts.columns)

shopping_carts has shape: (5, 2)
shopping_carts has dimension: 2
shopping_carts has a total of: 10 elements

The data in shopping_carts is:
 [[245. 500.]
 [ nan  40.]
 [ nan 110.]
 [ 25.  45.]
 [ 55.  nan]]

The row index in shopping_carts is: Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')

The column index in shopping_carts is: Index(['Bob', 'Alice'], dtype='object')


In [19]:
# create a DataFrame that only has Bob's data
# The columns keyword selects which dictionary keys to use; only Bob's own
# items appear as rows (see output).

bob_shopping_cart = pd.DataFrame(items, columns=['Bob'])

bob_shopping_cart

Unnamed: 0,Bob
bike,245
pants,25
watch,55


In [20]:
# selecting specific rows of a DataFrame

# create a DataFrame that only has selected items for both Alice and Bob
# The index keyword picks the row labels to keep, in the order given.
sel_shopping_cart = pd.DataFrame(items,index = ['pants', 'book'])

sel_shopping_cart

Unnamed: 0,Bob,Alice
pants,25.0,45
book,,40


In [21]:
# create a DataFrame that only has selected items for Alice
# index and columns can be combined to pick specific rows and a specific column at once
alice_sel_shopping_cart = pd.DataFrame(items, index = ['glasses','bike'], columns = ['Alice'])

alice_sel_shopping_cart

Unnamed: 0,Alice
glasses,110
bike,500


In [22]:
# Create dictionary of lists (array)
# All lists have the same length; each key becomes a column label

data = dict(Integers = [1, 2, 3], Floats = [4.5, 8.2, 9.6])

df = pd.DataFrame(data = data)

df

Unnamed: 0,Integers,Floats
0,1,4.5
1,2,8.2
2,3,9.6


In [23]:
# create the row index
# Rebuild the DataFrame from the same dictionary, this time supplying custom
# row labels through the index keyword.
df = pd.DataFrame(data, index = ['label 1', 'label 2', 'label 3'])

df

Unnamed: 0,Integers,Floats
label 1,1,4.5
label 2,2,8.2
label 3,3,9.6


In [24]:
# create a list of Python dictionaries (one dictionary per store)
# A key missing from a store's dictionary ends up as NaN in that store's row.

items2 = [{'bikes' : 20, 'pants': 30, 'watches': 35},
          {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5}]

# build the DataFrame once, supplying the row index up front
# (previously the frame was built twice and the first, unlabelled result was discarded)
store_items = pd.DataFrame(items2, index = ['store 1', 'store 2'])


store_items

Unnamed: 0,bikes,pants,watches,glasses
store 1,20,30,35,
store 2,15,5,10,50.0


## Accessing Elements in Pandas DataFrames


In [25]:
# Column selection with a list of labels returns a DataFrame; row selection
# by label goes through .loc.
print()
print('How many bikes are in each store:\n', store_items[['bikes']])
print()
print('How many bikes and pants are in each store:\n', store_items[['bikes', 'pants']])
print()
print('What items are in Store 1:\n', store_items.loc[['store 1']])
print()
# single-cell lookup: one .loc[row, column] call instead of chained
# [column][row] indexing, which performs two separate lookups
print('How many bikes are in Store 2:', store_items.loc['store 2', 'bikes'])


How many bikes are in each store:
          bikes
store 1     20
store 2     15

How many bikes and pants are in each store:
          bikes  pants
store 1     20     30
store 2     15      5

What items are in Store 1:
          bikes  pants  watches  glasses
store 1     20     30       35      NaN

How many bikes are in Store 2: 15


In [26]:
# add a column to an existing DataFrame
# The list supplies one value per row, in row order (store 1, store 2);
# the assignment modifies store_items in place.

store_items['shirts'] = [15,2]

store_items


Unnamed: 0,bikes,pants,watches,glasses,shirts
store 1,20,30,35,,15
store 2,15,5,10,50.0,2


In [27]:
# add a new column called suits by adding the number of shirts and pants
# The addition is element-wise, row by row, between the two existing columns.

store_items['suits'] = store_items['pants'] + store_items['shirts']
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
store 1,20,30,35,,15,45
store 2,15,5,10,50.0,2,7


In [28]:
# create a new items dictionary

new_items = [{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4}]

new_store = pd.DataFrame(new_items, index = ['store 3'])


# append the row to the DataFrame
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames row-wise. Columns missing from new_store (shirts, suits) become NaN.

store_items = pd.concat([store_items, new_store])
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
store 1,20,30,35,,15.0,45.0
store 2,15,5,10,50.0,2.0,7.0
store 3,20,30,35,4.0,,


In [29]:
# add a new column built from particular rows of the watches column:
# every row from position 1 onward. The assignment aligns on the row index,
# so the skipped first row is left as NaN in the new column.

store_items['new watches'] = store_items['watches'].iloc[1:]

store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits,new watches
store 1,20,30,35,,15.0,45.0,
store 2,15,5,10,50.0,2.0,7.0,10.0
store 3,20,30,35,4.0,,,35.0


In [30]:
# dataframe.insert(loc, label, data) allows us to insert a new column in the
# dataframe at location loc, with the given column label, and given data.
# insert() modifies store_items in place (no reassignment is needed); here the
# new column lands at position 4, right after glasses.

store_items.insert(4, 'shoes', [8,5,0])

store_items

Unnamed: 0,bikes,pants,watches,glasses,shoes,shirts,suits,new watches
store 1,20,30,35,,8,15.0,45.0,
store 2,15,5,10,50.0,5,2.0,7.0,10.0
store 3,20,30,35,4.0,0,,,35.0


In [31]:
# delete one column from a DataFrame
# pop() removes the column from store_items in place; its return value
# (the removed column) is not used here.

store_items.pop('new watches')

store_items

Unnamed: 0,bikes,pants,watches,glasses,shoes,shirts,suits
store 1,20,30,35,,8,15.0,45.0
store 2,15,5,10,50.0,5,2.0,7.0
store 3,20,30,35,4.0,0,,


In [32]:
# delete multiple columns at once: the watches and shoes columns
# drop() returns a new DataFrame, so the result is assigned back to store_items

store_items = store_items.drop(columns = ['watches', 'shoes'])

store_items

Unnamed: 0,bikes,pants,glasses,shirts,suits
store 1,20,30,,15.0,45.0
store 2,15,5,50.0,2.0,7.0
store 3,20,30,4.0,,


In [33]:
# delete rows from a DataFrame by their row labels
# drop() returns a new DataFrame, so the result is assigned back to store_items

store_items = store_items.drop(index = ['store 2', 'store 1'])

store_items

Unnamed: 0,bikes,pants,glasses,shirts,suits
store 3,20,30,4.0,,


In [35]:
# modify the column label
# rename() takes an {old label: new label} mapping; the result is assigned
# back to store_items.

store_items = store_items.rename(columns = {'bikes':'hats'})

store_items

Unnamed: 0,hats,pants,glasses,shirts,suits
store 3,20,30,4.0,,


In [36]:
# change the row label from store 3 to last store
# the same rename() method works on row labels through the index keyword

store_items = store_items.rename(index = {'store 3': 'last store'})

store_items

Unnamed: 0,hats,pants,glasses,shirts,suits
last store,20,30,4.0,,


In [37]:
# change the row index to be the data in the pants column
# The pants column is moved out of the columns and becomes the row index
# (see output: the row is now labelled 30 instead of 'last store').

store_items = store_items.set_index('pants')

store_items

Unnamed: 0_level_0,hats,glasses,shirts,suits
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30,20,4.0,,


## Dealing with NaN

While any given dataset can have many types of bad data, such as outliers or incorrect values, the type of bad data we encounter most often is missing values. 

In [38]:
# Create a DataFrame

# One dictionary per store; an item that is absent from a store's dictionary
# ends up as NaN in that store's row.
items2 = [
    {'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes':8, 'suits':45},
    {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5, 'shirts': 2, 'shoes':5, 'suits':7},
    {'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes':10},
]

# Row labels are given in the same order as the dictionaries above
store_items = pd.DataFrame(data = items2, index = ['store 1', 'store 2', 'store 3'])

store_items

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,,10,,4.0


In [39]:
# count the number of NaN values in store_items
x = store_items.isnull().sum().sum()

print('Number of NaN values in our DataFrame:', x)

# The .isnull() method returns a Boolean DataFrame of the same size as store_items
# and marks with True the elements that are NaN.

# To count the True values we use the .sum() method twice:
# the first .sum() returns a Pandas Series with the number of True values
# in each column, and the second .sum() adds those per-column counts together.

Number of NaN values in our DataFrame: 3


In [40]:
# Boolean mask with the same shape as store_items: True wherever a value is NaN
store_items.isnull()

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,False,False,False,False,False,False,True
store 2,False,False,False,False,False,False,False
store 3,False,False,False,True,False,True,False


In [41]:
# Summing the Boolean mask counts the True values, giving the number of NaN values per column
store_items.isnull().sum()

bikes      0
pants      0
watches    0
shirts     1
shoes      0
suits      1
glasses    1
dtype: int64

In [42]:
# .count() is the complement of the NaN count: the number of non-NaN values in each column
print()
print('Number of non-NaN values in the columns of our DataFrame:\n', store_items.count())


Number of non-NaN values in the columns of our DataFrame:
 bikes      3
pants      3
watches    3
shirts     2
shoes      3
suits      2
glasses    2
dtype: int64


### Eliminating NaN values
In general, we have two options, we can either delete or replace the NaN values. 

We will start by learning how to eliminate rows or columns from our DataFrame that contain any NaN values. 

The .dropna(axis) method eliminates any rows with NaN values when axis = 0 is used and will eliminate any columns with NaN values when axis = 1 is used. 

In [43]:
# Drop rows having NaN values
# The result is only displayed, not assigned, so store_items itself keeps all its rows
store_items.dropna(axis = 0)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 2,15,5,10,2.0,5,7.0,50.0


In [45]:
# Drop any columns with NaN values
# The result is only displayed, not assigned, so store_items itself keeps all its columns
store_items.dropna(axis = 1)

Unnamed: 0,bikes,pants,watches,shoes
store 1,20,30,35,8
store 2,15,5,10,5
store 3,20,30,35,10


In [46]:
# Substitute every NaN value with 0.
# The result is only displayed, not assigned, so store_items still contains its NaN values.

store_items.fillna(0)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,0.0
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,0.0,10,0.0,4.0


When replacing NaN values with forward filling, we can use previous values taken from columns or rows. 

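The .fillna(method = 'ffill', axis) call uses the forward filling (ffill) method to replace each NaN value with the previous known value along the given axis. Recent pandas versions deprecate the `method` argument of .fillna() in favour of the equivalent .ffill(axis) method. 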

In [47]:
# forward fill NaN values down (axis = 0) the dataframe:
# each NaN is replaced with the previous value in its column.
# .ffill() is the supported spelling; fillna(method = 'ffill') is deprecated
# since pandas 2.1.

store_items.ffill(axis = 0)

# NaN value in store 1 didn't get replaced because there are no previous values
# in this column.

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,2.0,10,7.0,4.0


In [48]:
# forward fill NaN values across (axis = 1) the dataframe

# replace NaN values with the previous value in the row
# (.ffill() replaces the deprecated fillna(method = 'ffill') spelling)
store_items.ffill(axis = 1)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20.0,30.0,35.0,15.0,8.0,45.0,45.0
store 2,15.0,5.0,10.0,2.0,5.0,7.0,50.0
store 3,20.0,30.0,35.0,35.0,10.0,10.0,4.0


In [50]:
# Backward fill NaN values down (axis = 0) the dataframe
# replace NaN values with the next value in the column
# (.bfill() replaces the deprecated fillna(method = 'backfill') spelling)

store_items.bfill(axis = 0)

# NaN values in store 3 didn't get replaced. That's because there are no
# next values in these columns, since these NaN values are the last values
# in those columns

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,50.0
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,,10,,4.0


In [51]:
# backward fill NaN values across (axis = 1) the dataframe:
# each NaN is replaced with the next value in its row
# (.bfill() replaces the deprecated fillna(method = 'backfill') spelling)
store_items.bfill(axis = 1)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20.0,30.0,35.0,15.0,8.0,45.0,
store 2,15.0,5.0,10.0,2.0,5.0,7.0,50.0
store 3,20.0,30.0,35.0,10.0,10.0,4.0,4.0


In [52]:
# interpolate (estimate) NaN values down (axis = 0) the dataframe
store_items.interpolate(method = 'linear', axis = 0)

# The NaN in store 1 didn't get replaced. That's because the NaN value is the first
# value in that column, and since there is no data before it,
# the interpolation function can't calculate a value.
# The NaN values in store 3 (shirts, suits) are the last values in their columns;
# in the output they come out equal to the last known value above them.

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,2.0,10,7.0,4.0


In [53]:
# interpolate (estimate) NaN values across (axis = 1) the dataframe:
# each NaN is estimated from its neighbours in the same row
# (e.g. store 3 shirts = 22.5, midway between watches = 35 and shoes = 10)
store_items.interpolate(method = 'linear', axis = 1)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20.0,30.0,35.0,15.0,8.0,45.0,45.0
store 2,15.0,5.0,10.0,2.0,5.0,7.0,50.0
store 3,20.0,30.0,35.0,22.5,10.0,7.0,4.0
