# 5. Getting Started with pandas

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt       # 그래프를 세부적으로 그리기 위한 옵션 설정
import matplotlib as mpl              # 한글폰트, 글씨체, 음수기호 등 전체적인 옵션 조정

import missingno as msno
%matplotlib inline      
# NoteBook 내에 그래프 그리기
# %matplotlib tk 는 별도의 창에 그래프를 그리고 직접 그래프의 속성을 변경할 수 있음

# 실수형 포맷
pd.options.display.float_format = '{:0,.5f}'.format 
np.random.seed(12345)

mpl.rc('font', family='NanumGothic')  # 한글 폰트 설정
                                        # 윈도우 폰트 위치 - C:\Windows\Fonts
mpl.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(10,5))              # 그래프 사이즈 설정
plt.rc('figure', figsize=(10, 6))

<Figure size 720x360 with 0 Axes>

In [16]:
class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a *string* holding a Python expression
    (typically just a variable name). The expression text is shown as a
    label and the value it evaluates to is rendered next to it.

    NOTE: the expressions are evaluated with ``eval`` in this module's
    global namespace, so only pass trusted, hard-coded strings.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Keep the expression strings; they are evaluated lazily on display.
        self.args = args

    @staticmethod
    def _to_html(value):
        """Return an HTML fragment for ``value``.

        Uses the object's own ``_repr_html_`` when it exists and returns
        something; otherwise falls back to the escaped plain ``repr`` in a
        ``<pre>`` block, so objects without an HTML repr do not raise
        AttributeError.
        """
        import html

        render = getattr(value, '_repr_html_', None)
        if callable(render):
            rendered = render()
            # An HTML hook may return None to decline rendering.
            if rendered is not None:
                return rendered
        return '<pre>{0}</pre>'.format(html.escape(repr(value)))

    def _repr_html_(self):
        """Rich (HTML) representation: one floating <div> per expression."""
        return '\n'.join(self.template.format(a, self._to_html(eval(a)))
                         for a in self.args)

    def __repr__(self):
        """Plain-text representation: expression text, then the value's repr."""
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

## 5.1 Introduction to pandas Data Structures

### Series

A Series is a one-dimensional array-like object containing a sequence of values (of
similar types to NumPy types) and an associated array of data labels, called its index.

In [7]:
# Create a Series from a plain list; no index is given, so labels default to 0..3
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [8]:
# Inspect the Series' values array and its index
obj.values, obj.index

(array([ 4,  7, -5,  3], dtype=int64), RangeIndex(start=0, stop=4, step=1))

In [9]:
# Create a Series with an explicit string label for each value
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
# The index labels of obj2
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [13]:
# Compared with NumPy arrays, you can use labels in the index when selecting single
# values or a set of values:

# A single label returns a scalar; a list of labels returns a Series in that order.
obj2['a'], obj2[['c', 'a', 'd']]

(-5,
 c    3
 a   -5
 d    4
 dtype: int64)

In [16]:
# Boolean filtering, scalar multiplication and a NumPy function: each result keeps
# the index labels attached to the values
obj2[obj2 > 0], obj2 * 2, np.exp(obj2)

(d    4
 b    7
 c    3
 dtype: int64,
 d     8
 b    14
 a   -10
 c     6
 dtype: int64,
 d      54.59815
 b   1,096.63316
 a       0.00674
 c      20.08554
 dtype: float64)

In [18]:
# `in` tests membership among the index labels (not the values)
'b' in obj2, 'e' in obj2

(True, False)

In [22]:
# Should you have data contained in a Python dict, you can create a Series from it by
# passing the dict:

# The dict keys become the index labels and the dict values become the data.
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [23]:
  # Build a Series from sdata using an explicit list of labels: 'California' has no
  # entry in sdata so its value is NaN, and 'Utah' is left out because it is not
  # listed in `states`.
  # NOTE(review): every line of this cell is indented; presumably it only runs because
  # IPython strips a cell's leading indentation - consider dedenting.
  states = ['California', 'Ohio', 'Oregon', 'Texas']

  obj4 = pd.Series(data=sdata, index=states)
  obj4

California            NaN
Ohio         35,000.00000
Oregon       16,000.00000
Texas        71,000.00000
dtype: float64

In [27]:
# I will use the terms “missing” or “NA” interchangeably to refer to missing data. The
# isnull and notnull functions in pandas should be used to detect missing data:

# The two boolean Series are element-wise complements of each other.
pd.isnull(obj4), pd.notnull(obj4)


(California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool,
 California    False
 Ohio           True
 Oregon         True
 Texas          True
 dtype: bool)

In [28]:
# Same check as pd.isnull(obj4), called as an instance method
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [33]:
# A useful Series feature for many applications is that it automatically aligns by index
# label in arithmetic operations:

# Labels present in only one operand ('California', 'Utah') come out as NaN in the sum.
print(obj3)
print(obj4)
print(obj3 + obj4)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California            NaN
Ohio         35,000.00000
Oregon       16,000.00000
Texas        71,000.00000
dtype: float64
California             NaN
Ohio          70,000.00000
Oregon        32,000.00000
Texas        142,000.00000
Utah                   NaN
dtype: float64


In [36]:
# Both the Series and its index can carry a name; both are shown in the repr
obj4.name = 'population'
obj4.index.name =  'state'
obj4

state
California            NaN
Ohio         35,000.00000
Oregon       16,000.00000
Texas        71,000.00000
Name: population, dtype: float64

In [37]:
# A Series’s index can be altered in-place by assignment:

# Show obj with its current integer index first
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [39]:
# Assign a new list of labels to replace the existing index
obj.index = ['Bob','Steve','Jeff','Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

A DataFrame represents a rectangular table of data and contains an ordered collection
of columns, each of which can be a different value type (numeric, string,
boolean, etc.).

In [40]:
# Build a DataFrame from a dict of equal-length lists; each key becomes a column
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
df = pd.DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [41]:
# First five and last five rows of df, shown together as a tuple
df.head(), df.tail()

(    state  year     pop
 0    Ohio  2000 1.50000
 1    Ohio  2001 1.70000
 2    Ohio  2002 3.60000
 3  Nevada  2001 2.40000
 4  Nevada  2002 2.90000,
     state  year     pop
 1    Ohio  2001 1.70000
 2    Ohio  2002 3.60000
 3  Nevada  2001 2.40000
 4  Nevada  2002 2.90000
 5  Nevada  2003 3.20000)

In [43]:
# Build a DataFrame from dict data and set the column order with `columns=`
# NOTE(review): `data` is re-declared here with the same content as in the earlier cell.
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [44]:
# If you pass a column that isn’t contained in the dict, it will appear with missing values
# in the result:

data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

# 'debt' is not a key of `data`, so that column is entirely missing;
# `index=` supplies the row labels 'one'..'six'.
df2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [45]:
# A column in a DataFrame can be retrieved as a Series either by dict-like notation or
# by attribute:

# Dict-like (bracket) notation
df2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [46]:
# Attribute notation: retrieves the 'year' column as a Series
df2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [48]:
 # Select one row by its index label with .loc; the row's values are labelled by column name
 # NOTE(review): this line carries a stray leading space - consider removing it.
 df2.loc['three']

year       2002
state      Ohio
pop     3.60000
debt        NaN
Name: three, dtype: object

In [49]:
# Columns can be modified by assignment. For example, the empty 'debt' column
# could be assigned a scalar value or an array of values:

# A scalar is applied to every row of the column.
df2['debt'] = 16.5
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [54]:
# Assign a float array of six values (0.0 .. 5.0), one per row
df2['debt'] = np.arange(6.)
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [55]:
# A Series that covers only three of df2's six row labels
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
val

two    -1.20000
four   -1.50000
five   -1.70000
dtype: float64

In [57]:
df2['debt'] = val # aligned on index: matching labels are set, labels absent from val become NaN
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [59]:
# Assigning to a column name that does not exist yet creates the column;
# here a boolean column that is True where state is 'Ohio'.
df2['eastern'] = df2.state == 'Ohio'
df2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [60]:
# The del keyword can then be used to remove this column:
del df2['eastern']

In [62]:
# Confirm that 'eastern' is no longer among the columns
df2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [64]:
# Another common form of data is a nested dict of dicts:
# If the nested dict is passed to the DataFrame, pandas will interpret the outer dict keys
# as the columns and the inner keys as the row indices:

# The inner keys of both dicts are combined into the row index; a year missing from
# one state (2000 for 'Nevada') becomes NaN.
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

df3 = pd.DataFrame(pop)
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [65]:
# You can transpose the DataFrame (swap rows and columns) with similar syntax to a
# NumPy array:

# .T yields the transposed frame; df3 itself is not modified.
df3.T


Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [66]:
# With an explicit index only the listed row labels are used: 2000 is dropped and
# 2003, which is absent from the data, is all-NaN.
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [67]:
# df3 is unchanged by the two previous expressions
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [68]:
# Dicts of Series are treated in much the same way:

# Drop the last row of 'Ohio' and keep the first two rows of 'Nevada';
# both slices cover the labels 2001 and 2002.
pdata = {'Ohio': df3['Ohio'][:-1],
         'Nevada': df3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [69]:
 # Name the row index and the column index; both names are shown when df3 is displayed.
 # NOTE(review): every line of this cell carries a stray leading space - consider dedenting.
 df3.index.name = 'year'
 df3.columns.name = 'state'
 df3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [72]:
# Sort the rows by their 'year' label. The sorted frame is rebound to df3 rather
# than using inplace=True, so the cell is an explicit assignment instead of a
# hidden mutation of a frame that earlier cells already displayed.
df3 = df3.sort_index()
df3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [73]:
# .values returns the frame's data as a two-dimensional NumPy array
df3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [74]:
# If the DataFrame’s columns are different dtypes, the dtype of the values array will be
# chosen to accommodate all of the columns:

# df2 mixes integer, string and float columns, so the array comes back as dtype=object.
df2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

In [76]:
# Print a summary of df2: index, columns, non-null counts, dtypes and memory usage
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, one to six
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    6 non-null      int64  
 1   state   6 non-null      object 
 2   pop     6 non-null      float64
 3   debt    3 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 412.0+ bytes


### Index Objects

In [5]:
# A Series' labels are held in an Index object; slicing an Index returns another Index
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index, index[1:]

(Index(['a', 'b', 'c'], dtype='object'), Index(['b', 'c'], dtype='object'))

In [6]:
# Index objects are immutable: item assignment raises TypeError. The error is caught
# and printed so this demonstration does not stop a "Restart & Run All".
try:
    index[1] = 'd'
except TypeError as err:
    print('TypeError:', err)

TypeError: Index does not support mutable operations

In [7]:
# Build an Index directly from an integer array
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [8]:
# Use the existing Index object as the index of a new Series
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2

0    1.50000
1   -2.50000
2    0.00000
dtype: float64

In [None]:
# Display df3 (this cell has no execution count, i.e. it was not run in the saved notebook)
df3

## 5.2 Essential Functionality

### Reindexing

In [1]:
# A float Series whose labels are deliberately not in alphabetical order
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [2]:
# reindex arranges the data according to the new labels; 'e' has no value in obj,
# so it is introduced as NaN
obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [13]:
# A Series with gaps in its integer index (labels 0, 2, 4)
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [14]:
# method='ffill' forward-fills: each new label (1, 3, 5) takes the value of the
# nearest preceding existing label
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [11]:
# NOTE(review): this cell repeats the previous cell's expression verbatim and
# produces the same output - consider deleting one of the two.
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [None]:
# With DataFrame, reindex can alter either the (row) index, columns, or both. When
# passed only a sequence, it reindexes the rows in the result:

# Example frame to reindex: the integers 0..8 laid out as 3 rows x 3 columns, with
# row labels that leave a gap ('b' is missing). The DataFrame constructor needs
# array-like data; passing the numpy module itself, as before, raises an error.
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])

pd.Data