# Chapter 3. Pandas - 데이터 구조

In [1]:
import pandas as pd

# 1. Series

In [2]:
# List
sr = pd.Series([1,2,3,4,5])
sr

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
# Tuple
sr = pd.Series((1,2,3,4,5))
sr

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
# Dict
sr = pd.Series({'a' : 1, 'b' : 2, 'c' : 3, 'd' : 4, 'e' : 5})
sr

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [5]:
# Numpy array
import numpy as np
sr = pd.Series(np.array([1,2,3,4,5]))
sr

0    1
1    2
2    3
3    4
4    5
dtype: int32

In [6]:
# Series
sr1 = pd.Series([1,2,3,4,5])
sr2 = pd.Series(sr1)
sr2

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [7]:
# Index
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [8]:
# Index
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [9]:
# Values
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr.values

array([1, 2, 3, 4, 5], dtype=int64)

In [10]:
# Shape
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr.shape

(5,)

In [11]:
# Feature Name
sr1 = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'], name = 'Name2')
sr2 = pd.Series(data = [6,7,8,9,10], index = ['a','b','c','d','e'], name = 'Name2')

df = pd.concat((sr1, sr2), axis = 1)
df

Unnamed: 0,Name2,Name2.1
a,1,6
b,2,7
c,3,8
d,4,9
e,5,10


In [12]:
# Copy
arr = np.array([1,2,3,4,5])
sr1 = pd.Series(arr, copy = False) # 얕은 복사
sr1[0] = 999
print(arr[0])
sr2 = pd.Series(arr, copy = True) # 깊은 복사
sr2[1] = 999
print(arr[1])

999
2


In [13]:
# loc
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr.loc['a']

1

In [14]:
# loc(index1)
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr['a']

1

In [15]:
# loc(index2)
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr[['a','b']]

a    1
b    2
dtype: int64

In [16]:
# iloc
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr.iloc[1:]

b    2
c    3
d    4
e    5
dtype: int64

In [17]:
# Condition
sr[sr > 2]

c    3
d    4
e    5
dtype: int64

In [18]:
# Add data
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr.loc['f'] = 6
sr

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [19]:
# Replace data
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr.loc['b'] = 1
sr

a    1
b    1
c    3
d    4
e    5
dtype: int64

In [20]:
# Drop data
sr = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
sr.drop(['a','b'], inplace = True)
sr

c    3
d    4
e    5
dtype: int64

# 2. DataFrame

In [21]:
# List
data = [['John',25,'New york'],
        ['Emma',28,'Paris'],
        ['Peter',30,'London'],
        ['Lisa',27,'Sydey']]

df = pd.DataFrame(data, columns = ['Name','Age','City'])
df

Unnamed: 0,Name,Age,City
0,John,25,New york
1,Emma,28,Paris
2,Peter,30,London
3,Lisa,27,Sydey


In [22]:
# np.array
arr = np.array([['John',25,'New york'],
                ['Emma',28,'Paris'],
                ['Peter',30,'London'],
                ['Lisa',27,'Sydey']])

print(type(arr))
df = pd.DataFrame(arr, columns = ['Name','Age','City'])
df

<class 'numpy.ndarray'>


Unnamed: 0,Name,Age,City
0,John,25,New york
1,Emma,28,Paris
2,Peter,30,London
3,Lisa,27,Sydey


In [23]:
# Dict
d = {'Name' : ['John','Emma','Peter','Lisa'],
     'Age' : [25, 28, 30, 27],
     'City' : ['Ner york','Paris','London','Sydney']}

df = pd.DataFrame(d)
df

Unnamed: 0,Name,Age,City
0,John,25,Ner york
1,Emma,28,Paris
2,Peter,30,London
3,Lisa,27,Sydney


In [24]:
# Double Dict
double_d = {'Name' : {'a' : 'John','b' : 'Emma','c' : 'Peter', 'd' : 'Lisa'},
            'Age' : {'a' : 25, 'b' : 28, 'c' : 30, 'd' : 27},
            'City' : {'a' : 'Ner york', 'b' : 'Paris', 'c' : 'London', 'd' :'Sydney'}}

df = pd.DataFrame(double_d)
df

Unnamed: 0,Name,Age,City
a,John,25,Ner york
b,Emma,28,Paris
c,Peter,30,London
d,Lisa,27,Sydney


In [25]:
# Series
name = pd.Series(['John','Emma','Peter','Lisa'])
age = pd.Series([25, 28, 30, 27])
city = pd.Series(['Ner york','Paris','London','Sydney'])

df = pd.DataFrame({'Name' : name,
                   'Age' : age,
                   'City' : city})
df

Unnamed: 0,Name,Age,City
0,John,25,Ner york
1,Emma,28,Paris
2,Peter,30,London
3,Lisa,27,Sydney


In [26]:
# Series Transpose
sr1 = pd.Series([1,2,3,4,5], name = 'sr1')
sr2 = pd.Series(['a','b','c','d','e'], name = 'sr2')

df = pd.DataFrame([sr1, sr2])
df.T

Unnamed: 0,sr1,sr2
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e


In [27]:
# Series concatenate
sr1 = pd.Series([1,2,3], name = 'sr1')
sr2 = pd.Series([4,5,6], name = 'sr2')
sr3 = pd.Series([7,8,9], name = 'sr3')

df = pd.concat([sr1, sr2, sr3], axis = 1)
df

Unnamed: 0,sr1,sr2,sr3
0,1,4,7
1,2,5,8
2,3,6,9


In [28]:
# Index
d = {'Name' : ['John','Emma','Peter','Lisa'],
     'Age' : [25, 28, 30, 27],
     'City' : ['Ner york','Paris','London','Sydney']}

df = pd.DataFrame(d, index = ['A','B','C','D'])
df

Unnamed: 0,Name,Age,City
A,John,25,Ner york
B,Emma,28,Paris
C,Peter,30,London
D,Lisa,27,Sydney


In [29]:
# Replace column name
d = {'Name' : ['John','Emma','Peter','Lisa'],
     'Age' : [25, 28, 30, 27],
     'City' : ['Ner york','Paris','London','Sydney']}

df = pd.DataFrame(d, index = ['A','B','C','D'])
df.columns = ['이름','나이','도시']
df

Unnamed: 0,이름,나이,도시
A,John,25,Ner york
B,Emma,28,Paris
C,Peter,30,London
D,Lisa,27,Sydney


## Pandas function

In [30]:
df = pd.DataFrame({'A' : [1,2,3],
                   'B' : [4,5,6]})
df.shape

(3, 2)

In [31]:
df.ndim

2

In [32]:
df.values

array([[1, 4],
       [2, 5],
       [3, 6]], dtype=int64)

In [33]:
df.columns

Index(['A', 'B'], dtype='object')

In [34]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [35]:
df.size

6

In [36]:
df.dtypes

A    int64
B    int64
dtype: object

# 예제

## Load data

In [37]:
# csv
raw_data = pd.read_csv('./Data/for_saving_test.csv', encoding = 'cp949')
raw_data.head(3)

Unnamed: 0,연도,동해,남해,서해,전체
0,1996,17.4629,17.2288,14.436,15.9067
1,1997,17.4116,17.4092,14.8248,16.1526
2,1998,17.5944,18.011,15.2512,16.6044


In [38]:
raw_data.shape

(5, 5)

In [39]:
# csv
raw_data = pd.read_csv('./Data/for_test.csv', encoding = 'cp949', index_col = '연도')
raw_data.head(3)

Unnamed: 0_level_0,동해,남해,서해,전체
연도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996,17.4629,17.2288,14.436,15.9067
1997,17.4116,17.4092,14.8248,16.1526
1998,17.5944,18.011,15.2512,16.6044


In [40]:
# txt
raw_data = pd.read_csv('./Data/KTX.txt', sep = '\t', engine = 'python')
raw_data.head(3)

Unnamed: 0,"KTX_data = {'경부선 KTX': [39060, 39896, 42005, 43621, 41702, 41266, 32427],"
0,"'호남선 KTX': [7313, 6967, 6873, 6626..."
1,"'경전선 KTX': [3627, 4168, 4088, 4424..."
2,"'전라선 KTX': [309, 1771, 1954, 2244,..."


## Export data

In [41]:
# raw_data.to_csv('./Data/export_data.csv')

# 실습

In [42]:
raw_ktx = {'경부선 KTX': [39060, 39896, 42005, 43621, 41702, 41266, 32427],
           '호남선 KTX': [7313, 6967, 6873, 6626, 8675, 10622, 9228],
           '경전선 KTX': [3627, 4168, 4088, 4424, 4606, 4984, 5570],
           '전라선 KTX': [309, 1771, 1954, 2244, 3146, 3945, 5766],
           '동해선 KTX': [np.nan,np.nan, np.nan, np.nan, 2395, 3786, 6667]}
index_list = ['2011', '2012', '2013', '2014', '2015', '2016', '2017']

ktx = pd.DataFrame(raw_ktx, index = index_list)
ktx.head(3)

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,


In [43]:
ktx.tail(3)

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [44]:
ktx.shape

(7, 5)

In [45]:
ktx.ndim

2

In [46]:
ktx[:3]

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,


In [47]:
ktx.iloc[3]

경부선 KTX    43621.0
호남선 KTX     6626.0
경전선 KTX     4424.0
전라선 KTX     2244.0
동해선 KTX        NaN
Name: 2014, dtype: float64

In [48]:
ktx.loc['2014']

경부선 KTX    43621.0
호남선 KTX     6626.0
경전선 KTX     4424.0
전라선 KTX     2244.0
동해선 KTX        NaN
Name: 2014, dtype: float64

In [49]:
ktx.loc['2014' : '2016']

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0


In [50]:
ktx['경부선 KTX']

2011    39060
2012    39896
2013    42005
2014    43621
2015    41702
2016    41266
2017    32427
Name: 경부선 KTX, dtype: int64

In [51]:
ktx['경부선 KTX'][2:5]

2013    42005
2014    43621
2015    41702
Name: 경부선 KTX, dtype: int64

In [52]:
print(ktx['호남선 KTX']['2016'])
print(ktx['호남선 KTX'][5])
print(ktx['호남선 KTX'][-2])
print(ktx['호남선 KTX'].loc['2016'])
print(ktx['호남선 KTX'].iloc[5])

10622
10622
10622
10622
10622


In [53]:
ktx.loc['2016']

경부선 KTX    41266.0
호남선 KTX    10622.0
경전선 KTX     4984.0
전라선 KTX     3945.0
동해선 KTX     3786.0
Name: 2016, dtype: float64

In [54]:
ktx.loc['2016']['호남선 KTX']

10622.0

In [55]:
ktx.T

Unnamed: 0,2011,2012,2013,2014,2015,2016,2017
경부선 KTX,39060.0,39896.0,42005.0,43621.0,41702.0,41266.0,32427.0
호남선 KTX,7313.0,6967.0,6873.0,6626.0,8675.0,10622.0,9228.0
경전선 KTX,3627.0,4168.0,4088.0,4424.0,4606.0,4984.0,5570.0
전라선 KTX,309.0,1771.0,1954.0,2244.0,3146.0,3945.0,5766.0
동해선 KTX,,,,,2395.0,3786.0,6667.0


In [56]:
ktx[['경부선 KTX','경전선 KTX','동해선 KTX','전라선 KTX','호남선 KTX']]

Unnamed: 0,경부선 KTX,경전선 KTX,동해선 KTX,전라선 KTX,호남선 KTX
2011,39060,3627,,309,7313
2012,39896,4168,,1771,6967
2013,42005,4088,,1954,6873
2014,43621,4424,,2244,6626
2015,41702,4606,2395.0,3146,8675
2016,41266,4984,3786.0,3945,10622
2017,32427,5570,6667.0,5766,9228
