In [3]:
import pandas as pd
import numpy as np

## Series
  * 1차원 구조로 되어 있는 데이터
  > pd.Series(data=None, index=None, dtype=None, name=None, copy=False)

### Series 생성

In [46]:
data = {'a':1, 'b':2, 'c':3} # 인덱스가 있으므로 바로 Series 생성 가능
pd.Series(data=data, dtype=np.int16, name='dict')

a    1
b    2
c    3
Name: dict, dtype: int16

In [12]:
# 스칼라 값인 경우 인덱스를 제공해줘야 함
pd.Series(5.0, index=['a', 'b', 'c', 'd', 'e', 'f']) # Broadcasting 가능

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
f    5.0
dtype: float64

In [14]:
s = pd.Series(np.random.randint(0, 100, 6), ['a', 'b', 'c', 'd', 'e', 'f'])
s

a    23
b    27
c    60
d    94
e    42
f    60
dtype: int64

### 슬라이싱/인덱스
> Numpy와 같이 슬라이싱/인덱스와 같은 작업이 가능함

In [20]:
s[2:5]

c    60
d    94
e    42
dtype: int64

In [99]:
s[s>s.median()], s.median()

(b    27
 c    60
 f    30
 dtype: int64, 23.5)

In [17]:
'a' in s, 'x' in s

(True, False)

In [100]:
s.get('a')

15

In [19]:
s.index, s.values

(Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object'),
 array([23, 27, 60, 94, 42, 60]))

In [21]:
s[[4, 3, 1]]

e    42
d    94
b    27
dtype: int64

In [32]:
filter = range(4, 2, -1)
s[filter]

e    42
d    94
dtype: int64

In [33]:
s[0] = 15 # Series 내부의 값은 변경 가능
s

a    15
b    27
c    60
d    94
e    42
f    60
dtype: int64

In [35]:
s[[3, 4, 5]] = [10, 20, 30]
s

a    15
b    27
c    60
d    10
e    20
f    30
dtype: int64

In [38]:
np.exp(s.divide(10))

a      4.481689
b     14.879732
c    403.428793
d      2.718282
e      7.389056
f     20.085537
dtype: float64

In [101]:
s.to_numpy()

array([15, 27, 60, 10, 20, 30])

## DataFrame

In [45]:
data_ = {
    "one": pd.Series([5, 6, 7]),
    "two": pd.Series([4, 3, 2])
}
pd.DataFrame(data=data_)

Unnamed: 0,one,two
0,5,4
1,6,3
2,7,2


In [43]:
data2 = [{"a":1, "b":2}, {"a":7, "b":8, "c":9}]
pd.DataFrame(data=data2, index=["first", "second"])

Unnamed: 0,a,b,c
first,1,2,
second,7,8,9.0


In [44]:
ser = pd.Series(range(3), index=list("abc"), name="ser")
pd.DataFrame(ser)

Unnamed: 0,ser
a,0
b,1
c,2


In [49]:
pd.DataFrame(pd.Series(data={"a":"A", "b":"B", "c":"C"}, dtype='object', name='dict'))

Unnamed: 0,dict
a,A
b,B
c,C


### DataFrame.from_dict

In [102]:
data = {"row_1": [3, 2, 1, 0], "row_2": ['a', 'b', 'c', 'd']}
df = pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
row_1,3,2,1,0
row_2,a,b,c,d


## 데이터 변환
  > 외부 파일로 추출
  * pd.DataFrame.to_parquet()
  * pd.DataFrame.to_csv()
  * pd.DataFrame.to_excel()   
  
  > 파이썬 내부 데이터로 추출
  * pd.DataFrame.to_dict()
  * pd.DataFrame.to_json()

In [103]:
df.to_dict()

{'A': {'row_1': 3, 'row_2': 'a'},
 'B': {'row_1': 2, 'row_2': 'b'},
 'C': {'row_1': 1, 'row_2': 'c'},
 'D': {'row_1': 0, 'row_2': 'd'}}

In [104]:
df.to_dict('series')

{'A': row_1    3
 row_2    a
 Name: A, dtype: object, 'B': row_1    2
 row_2    b
 Name: B, dtype: object, 'C': row_1    1
 row_2    c
 Name: C, dtype: object, 'D': row_1    0
 row_2    d
 Name: D, dtype: object}

In [106]:
df.to_dict('records')

[{'A': 3, 'B': 2, 'C': 1, 'D': 0}, {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'}]

In [107]:
df.to_json()

'{"A":{"row_1":3,"row_2":"a"},"B":{"row_1":2,"row_2":"b"},"C":{"row_1":1,"row_2":"c"},"D":{"row_1":0,"row_2":"d"}}'

In [108]:
df.to_json(orient="records")

'[{"A":3,"B":2,"C":1,"D":0},{"A":"a","B":"b","C":"c","D":"d"}]'

## DataFrame 연산

In [109]:
data = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"])
}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [77]:
df["three"] = df["one"] + df["two"]
df

Unnamed: 0,one,two,three
a,1.0,1.0,2.0
b,2.0,2.0,4.0
c,3.0,3.0,6.0
d,,4.0,


In [78]:
list(df['one'] > 2)

[False, False, True, False]

In [79]:
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,2.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,6.0,True
d,,4.0,,False


### del/pop

In [80]:
del df["two"] # 열 삭제
df

Unnamed: 0,one,three,flag
a,1.0,2.0,False
b,2.0,4.0,False
c,3.0,6.0,True
d,,,False


In [81]:
three = df.pop("three") # 열 pop
three

a    2.0
b    4.0
c    6.0
d    NaN
Name: three, dtype: float64

In [82]:
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [83]:
df["foo"] = "bar"
df

Unnamed: 0,one,flag,foo
a,1.0,False,bar
b,2.0,False,bar
c,3.0,True,bar
d,,False,bar


In [84]:
df['one_trunc'] = df['one'][:2]
df

Unnamed: 0,one,flag,foo,one_trunc
a,1.0,False,bar,1.0
b,2.0,False,bar,2.0
c,3.0,True,bar,
d,,False,bar,


In [88]:
df.insert(loc=1, column='bar', value=df['one']) # Column 위치, Column 위치, 값
df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


In [95]:
df['insert'] = [1, 2, 4, 3]
df.columns = ['one', 'insert', 'bar', 'flag', 'foo', 'one_trunc']
df

Unnamed: 0,one,insert,bar,flag,foo,one_trunc
a,1.0,1,False,bar,1.0,1
b,2.0,2,False,bar,2.0,2
c,3.0,4,True,bar,,3
d,,3,False,bar,,4


In [97]:
df.index = ["d", "b", "a", "c"] # index는 실제 데이터의 순서를 담지 않는다
df

Unnamed: 0,one,insert,bar,flag,foo,one_trunc
d,1.0,1,False,bar,1.0,1
b,2.0,2,False,bar,2.0,2
a,3.0,4,True,bar,,3
c,,3,False,bar,,4


### 데이터 선택

In [110]:
import seaborn as sns

iris = sns.load_dataset("iris")
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [265]:
help(sns.load_dataset)

Help on function load_dataset in module seaborn.utils:

load_dataset(name, cache=True, data_home=None, **kws)
    Load an example dataset from the online repository (requires internet).
    
    This function provides quick access to a small number of example datasets
    that are useful for documenting seaborn or generating reproducible examples
    for bug reports. It is not necessary for normal usage.
    
    Note that some of the datasets have a small amount of preprocessing applied
    to define a proper ordering for categorical variables.
    
    Use :func:`get_dataset_names` to see a list of available datasets.
    
    Parameters
    ----------
    name : str
        Name of the dataset (``{name}.csv`` on
        https://github.com/mwaskom/seaborn-data).
    cache : boolean, optional
        If True, try to load from the local cache first, and save to the cache
        if a download is required.
    data_home : string, optional
        The directory in which to cache data; se

* isin(values)
> 해당 데이터가 values 내에 있는지를 구분






In [164]:
iris.iloc[0]

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

In [113]:
cond = iris['petal_length'].isin([1.3, 1.4])
iris[cond][:5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
6,4.6,3.4,1.4,0.3,setosa


In [154]:
cond2 = iris['petal_length'] > 3.0
iris[cond2][:5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor


In [143]:
iris[['petal_length', 'sepal_length']]

Unnamed: 0,petal_length,sepal_length
0,1.4,5.1
1,1.4,4.9
2,1.3,4.7
3,1.5,4.6
4,1.4,5.0
...,...,...
145,5.2,6.7
146,5.0,6.3
147,5.2,6.5
148,5.4,6.2


In [155]:
iris[~cond][:5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
3,4.6,3.1,1.5,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
7,5.0,3.4,1.5,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa
10,5.4,3.7,1.5,0.2,setosa


* iloc
  > 행, 열 번호(숫자)를 통해 데이터를 가져온다

In [157]:
iris.iloc[0]

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

In [158]:
iris.iloc[[1, 2, -1]]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
149,5.9,3.0,5.1,1.8,virginica


In [160]:
iris.iloc[:5, :3] # 행 갯수, 열 갯수

Unnamed: 0,sepal_length,sepal_width,petal_length
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4


In [162]:
iris.iloc[[2, 5], [1, 2]]

Unnamed: 0,sepal_width,petal_length
2,3.2,1.3
5,3.9,1.7


* loc
  > 인덱스 값을 통해 데이터를 가져온다   
  > Column의 경우 숫자로 접근 불가능, 인덱스로만 접근 가능

In [183]:
iris.loc[4]

sepal_length       5.0
sepal_width        3.6
petal_length       1.4
petal_width        0.2
species         setosa
Name: 4, dtype: object

In [167]:
# loc에서는 마지막 데이터를 가져오기 위해 음수 값을 쓸 수 없음
iris.loc[[1, 2, 3]]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa


In [187]:
iris.loc[:,['sepal_length', 'sepal_width']][:5]

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6


In [185]:
iris.loc[[140, 49],['sepal_width', 'petal_length']]

Unnamed: 0,sepal_width,petal_length
140,3.1,5.6
49,3.3,1.4


### 마스킹을 이용한 다중 조건

In [196]:
mask1 = iris['petal_length'] > 1.7
mask2 = iris['sepal_width'] < 3.0
mask = mask1 & mask2
iris.loc[mask][-5:]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
132,6.4,2.8,5.6,2.2,virginica
133,6.3,2.8,5.1,1.5,virginica
134,6.1,2.6,5.6,1.4,virginica
142,5.8,2.7,5.1,1.9,virginica
146,6.3,2.5,5.0,1.9,virginica


In [197]:
mask = mask1 | mask2
iris.loc[mask][-5:]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [198]:
iris.loc[~mask][-5:]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
45,4.8,3.0,1.4,0.3,setosa
46,5.1,3.8,1.6,0.2,setosa
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa


In [365]:
# mask1 = iris['petal-length'] < 2.5 & iris['sepal_width'] < 2.5
mask1 = (iris[['petal_length', 'sepal_width']] < 2.5).all(axis=1)
mask2 = iris['sepal_length'] > 4.0

mask = mask1 & mask2
iris.loc[mask]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
41,4.5,2.3,1.3,0.3,setosa


### 데이터 형식에 기반한 열 선택

In [220]:
iris.select_dtypes(include=np.object_)

Unnamed: 0,species
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa
...,...
145,virginica
146,virginica
147,virginica
148,virginica


In [221]:
iris.select_dtypes(include=np.float64)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [223]:
iris.select_dtypes(exclude="number")

Unnamed: 0,species
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa
...,...
145,virginica
146,virginica
147,virginica
148,virginica
