In [587]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (IPython's default for this setting is 'last_expr').
InteractiveShell.ast_node_interactivity = 'all'

In [588]:
import pandas as pd
import numpy as np

In [589]:
# 1. Create a DataFrame from a dict of equal-length lists or NumPy arrays.
# Every key becomes a column label and its list supplies that column's values.
data = {
    "state": ["Ohio"] * 3 + ["Nevada"] * 3,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

In [590]:
# Build the DataFrame from the dict; no index is given, so rows get the default 0..5 labels
frame_1 = pd.DataFrame(data=data)
frame_1

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [591]:
# 2. For large DataFrames, the head method selects only the leading rows
# (five by default; the default is spelled out here)
frame_1.head(n=5)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [592]:
# 3. Pass the columns parameter while creating a DataFrame
# 3.1 If a sequence of columns is specified, the DataFrame's columns are arranged in that order
frame_2 = pd.DataFrame(data, columns=["year", "state", "pop"])
frame_2

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [593]:
# 3.2 A requested column that is not a key of the dict still appears in the DataFrame,
# filled with missing values
frame_2 = pd.DataFrame(
    data,
    columns=["year", "state", "pop", "debt"],
)
frame_2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [594]:
# 4 Pass the index parameter while creating a DataFrame to supply the row labels
frame_2 = pd.DataFrame(
    data,
    columns=["year", "state", "pop"],
    index=["one", "two", "three", "four", "five", "six"],
)
frame_2

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


In [595]:
# 5 Retrieve column names in a DataFrame: .columns returns the column labels as an Index
# The remark below concerns selecting a single column (e.g. frame_2['year']), not .columns itself:
# The column returned from indexing a DataFrame is a view on the underlying data, not a copy.
# Thus, any in-place modifications to the Series will be reflected in the DataFrame
# NOTE(review): this describes legacy pandas behaviour; Copy-on-Write in newer pandas
# changes it - confirm against the pandas version in use
frame_2.columns

Index(['year', 'state', 'pop'], dtype='object')

In [596]:
# 6 Retrieve a column from a DataFrame
# 6.1 Retrieve a column by dict-like notation; the result is a Series that keeps the
# DataFrame's index and carries the column label as its name
frame_2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [597]:
# 6.2 Retrieve a column by attribute access
# This only works when the label is a valid Python identifier that does not clash with an
# existing DataFrame attribute or method: frame_2.pop, for instance, is the DataFrame.pop
# method rather than the 'pop' column, so that column needs frame_2['pop']
frame_2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [598]:
# 7. Retrieve rows in a DataFrame
# Rows can be retrieved by label with the special loc attribute
# (use iloc to retrieve them by integer position instead)
frame_2.loc['one']

year     2000
state    Ohio
pop       1.5
Name: one, dtype: object

In [599]:
# 8. Columns can be modified by assignment
# Show frame_2 as it stands before the assignments in the cells that follow
frame_2

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


In [600]:
# 8.1 Assign a scalar to a column: the same value is written to every row
# ('debt' is not a column of frame_2 at this point, so the assignment also creates it)
frame_2['debt'] = 16.5
frame_2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [601]:
# 8.1 Assign an array-like object to a column
# The length of the list or array must match the length of the DataFrame
frame_2['debt'] = np.arange(1, 7)  # 1..6, one value per row
frame_2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,1
two,2001,Ohio,1.7,2
three,2002,Ohio,3.6,3
four,2001,Nevada,2.4,4
five,2002,Nevada,2.9,5
six,2003,Nevada,3.2,6


In [602]:
# 8.2 If a Series is assigned, its labels are realigned exactly to the DataFrame's index,
# and missing values are inserted for every row the Series has no label for
frame_2['debt'] = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame_2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [603]:
# 9 Assigning to a column that doesn't exist creates a new column
# Here the new column holds the boolean result of comparing each state with 'Ohio'
frame_2['eastern'] = frame_2['state'] == 'Ohio'
frame_2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [604]:
# 10 The del statement can be used to remove a column (frame_2 itself is modified)
del frame_2['eastern']
frame_2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [605]:
# 11 Create a DataFrame from a nested dict
# When a dict of dicts is passed to DataFrame, pandas uses the outer keys as column names
# and the inner keys as the row index; a column with no entry for a row gets a missing value
pop = {
    "Nevada": {2001: 2.4, 2002: 2.9},
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame_3 = pd.DataFrame(data=pop)
frame_3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [606]:
# The row index of frame_3 is made up of the inner-dict keys of pop
frame_3.index

Int64Index([2000, 2001, 2002], dtype='int64')

In [607]:
# 12 Transpose a DataFrame: rows become columns and columns become rows
frame_3.transpose()

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [608]:
# 13 The keys in the inner dicts are combined to form the index of the result
# (and sorted, in the pandas version that produced the output shown here).
# That no longer applies once an explicit index is passed, as below: the rows follow the
# given labels, and a label found in no inner dict (2003) becomes a row of missing values.
# A plain list is enough for the index; it does not need to be wrapped in a Series.
pd.DataFrame(data=pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [609]:
# 14 Create a DataFrame from a dict of Series
# frame_3 has an integer-labelled index, so bare [] slicing is easy to misread as a
# label-based selection; .iloc states explicitly that these slices are positional
pdata = {
    'Ohio': frame_3['Ohio'].iloc[:-1],      # every row except the last
    'Nevada': frame_3['Nevada'].iloc[:2],   # the first two rows
}
pdata

{'Ohio': 2000    1.5
 2001    1.7
 Name: Ohio, dtype: float64, 'Nevada': 2000    NaN
 2001    2.4
 Name: Nevada, dtype: float64}

In [610]:
# Each Series in pdata becomes one column of frame_4, labelled with its dict key
frame_4 = pd.DataFrame(pdata)
frame_4

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [611]:
# 15 If a DataFrame's index and columns have their name attributes set, these will also be displayed
# (the names are assigned directly on frame_4's existing index and columns objects)
frame_4.index.name = 'year'
frame_4.columns.name = 'State'
frame_4

State,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4


In [612]:
# 16 The values attribute returns the data contained in the DataFrame as a two-dimensional ndarray
# NOTE(review): the pandas documentation recommends DataFrame.to_numpy() for this in newer
# releases - confirm it is available in the pandas version used here before switching
frame_4.values

array([[1.5, nan],
       [1.7, 2.4]])

In [613]:
# 17 If a DataFrame's columns have different dtypes, the dtype of the values array will be chosen to
# accommodate all of the columns
# Both columns of frame_4 are float64, so its values array is simply a float array
frame_4.dtypes

State
Ohio      float64
Nevada    float64
dtype: object