# Getting Started with pandas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))

In [9]:
class display(object):
    """Render several objects next to each other in a notebook.

    Each positional argument is a *string* holding a Python expression
    (usually just a variable name such as ``'df'``).  The expression is
    evaluated against this module's global namespace and its result is
    shown underneath the expression text, which serves as a label.

    Objects that provide ``_repr_html_`` are rendered with it; any other
    object falls back to its HTML-escaped ``repr`` inside ``<pre>``.
    """

    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Expression strings are stored verbatim: they are both the code
        # to evaluate and the label printed above each result.
        self.args = args

    @staticmethod
    def _evaluate(expr):
        """Return the value of ``expr`` looked up in the module globals."""
        # NOTE(security): eval() executes arbitrary code.  Only pass
        # trusted, hard-coded expression strings to this class.
        # Passing globals() explicitly keeps helper-local names (such as a
        # loop variable) from shadowing a notebook variable of the same name.
        return eval(expr, globals())

    @classmethod
    def _to_html(cls, expr):
        """Return an HTML fragment for the object ``expr`` evaluates to."""
        from html import escape

        obj = cls._evaluate(expr)
        renderer = getattr(obj, '_repr_html_', None)
        markup = renderer() if callable(renderer) else None
        if markup is None:
            # No rich representation available: show the plain repr safely.
            markup = '<pre>{0}</pre>'.format(escape(repr(obj)))
        return markup

    def _repr_html_(self):
        """Return one labelled HTML ``<div>`` per expression."""
        return '\n'.join(self.template.format(a, self._to_html(a))
                         for a in self.args)

    def __repr__(self):
        """Return the plain-text form: each label followed by its repr."""
        return '\n\n'.join(a + '\n' + repr(self._evaluate(a))
                           for a in self.args)

## Introduction to pandas Data Structures

### Series

In [None]:
# Series 생성
obj = pd.Series([4, 7, -5, 3], index=['a','b','c','d'])
obj

In [None]:
# Series values 와 index 조회
obj.values, obj.index

In [None]:
# Null 조회
obj.isnull(), obj.notnull()

In [None]:
# Series의 name 과 index name 설정
obj.name = 'test name'
obj.index.name = 'ind'
obj

### DataFrame

In [None]:
# Dict를이용한 DataFrame 생성
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
df = pd.DataFrame(data)
df

In [None]:
df.head(), df.tail()

In [None]:
# Dict data를 이용하여 DataFrame을 만들고 컬럼의 순서를 재조정
pd.DataFrame(data, columns=['year', 'state', 'pop'])

In [None]:
# Dict 및 List를 이용한 DataFrame 생성
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

df2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
df2

In [None]:
# Deleting a DataFrame column
# Be sure you understand what df.copy() does before relying on it
df2['eastern'] = df2.state == 'Ohio'
df2_1 = df2.copy()                   # the result differs depending on whether copy() is used:
                                     # without it, df2_1 would be the same object as df2
del df2_1['eastern']
display('df2', 'df2_1')


In [None]:
df2

In [None]:
# Dict를 이용해 컬럼 및 인덱스를 포함하여 DataFrame 생성
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
df3 = pd.DataFrame(pop)
df3

In [None]:
# DataFrame의 index와 columns 조회
df2.index, df2.columns

In [None]:
# DataFrame의 index 이름과 columns 이름 설정
df3.index.name = 'year'
df3.columns.name = 'state'
df3

In [None]:
# DataFrame 행과 열 전환
display("df2", "df2.T")

In [None]:
# Selecting DataFrame data - four ways:
# column by key, column by attribute, row by label (.loc), row by position (.iloc)
df2['state'], df2.year, df2.loc['three'], df2.iloc[0]

### Index Object

In [None]:
# Series index 처리
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index
index[1:]

In [None]:
# pd.Index()를 이용한 인덱스 Object 생성
labels = pd.Index(np.arange(3))
print(labels)
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
print(obj2)
obj2.index is labels

In [None]:
# index, columns 활용
print('Ohio' in df3.columns)
print(2003 in df3.index)
df3

In [None]:
# index의 중복 허용
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

### ReIndexing

In [None]:
# Series ReIndexing - index 순서 재조정, 없는경우 NaN
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

In [None]:
# pd.reindex() - method='ffill'
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
obj3.reindex(range(6), method='ffill')

In [None]:
# DataFrame reindex() - index 재조정
df = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
df2 = df.reindex(['a', 'b', 'c', 'd'])
display('df', 'df2')

In [None]:
# DataFrame reindex() - columns 재조정
states = ['Texas', 'Utah', 'California']
df.reindex(columns=states)

### Dropping Entries from an Axis

In [None]:
# Series Data Drop
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
new_obj.drop(['e', 'd'])

In [None]:
# DataFrame 행(row, index) 삭제
df = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

df1 = df.drop(['Colorado', 'Ohio'])
display('df', 'df1')

In [None]:
# DataFrame 열(columns) 삭제
df1 = df.drop('two', axis=1)
display('df', 'df1')

In [None]:
df1 = df.drop(['two', 'four'], axis='columns')
display('df', 'df1')

In [None]:
# Modify the original DataFrame directly with drop(..., inplace=True)
df1 = df.copy()
# The result of this call is discarded, so df itself is left as it was
df.drop('one', axis=1)
# inplace=True removes the column from df1 itself
df1.drop('one', axis=1, inplace=True)
display('df', 'df1')

### Indexing, Selection, and Filtering

In [None]:
# Series selection
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj['b'])                 # by label
# Integer keys on a non-integer index used to fall back to positions;
# that fallback is deprecated, so positional access goes through .iloc.
print(obj.iloc[1])              # by position
print(obj.iloc[2:4])            # positional slice
print(obj[['b', 'a', 'd']])     # list of labels
print(obj.iloc[[1, 3]])         # list of positions
print(obj[obj < 2])             # boolean mask

In [None]:
# DataFrame 조회
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
print(data)
print('~'*40)
print(data['two'])
print('~'*40)
print(data[['three', 'one']])
print('~'*40)
print(data[0:2])
print('~'*40)
print(data[data['three'] > 5])

### Selection with loc and iloc

In [None]:
print(data)
print('~'*40)
print(data.loc['Colorado', ['two', 'three']])
print('~'*40)
print(data.iloc[2, [3, 0, 1]])
print('~'*40)
print(data.iloc[2])
print('~'*40)
print(data.iloc[[1, 2], [3, 0, 1]])


In [None]:
print(data.loc[:'Utah', 'two'])
print('~'*40)
print(data.iloc[:, :3][data.three > 5])

### Integer Indexes

In [None]:
# When labels and integer positions could both match, pandas is designed
# to look for the label first
ser = pd.Series(np.arange(3.))
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

print(ser[:1])
print('~'*40)
print(ser.loc[:1])   # Label indexing
print('~'*40)
print(ser.iloc[:1])  # Integer (positional) indexing

### Arithmetic and Data Alignment(산술연산과 데이터 정렬)

In [None]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
print(s1)
print('~'*40)
print(s2)
print('~'*40)
print(s1+s2)


In [None]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df3 = df1 + df2
display('df1', 'df2', 'df3')

In [None]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df3 = df1 - df2
display('df1','df2','df3')

### Arithmetic methods with fill values : 산술 연산 메서드에 채워 넣을 값 지정하기

In [None]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
display('df1','df2')

In [None]:
df1 + df2

In [None]:
df1.add(df2, fill_value=0)

In [None]:
1/df1

In [None]:
print(df1)
df1.rdiv(1)

In [None]:
print(df1)
print('~'*40)
print(df1.sub(1))
print('~'*40)
print(df1.rsub(1))

In [None]:
df1.reindex(columns=df2.columns, fill_value=0)

### Operations between DataFrame and Series

In [None]:
# Index가 동일한 경우 산술 연산
df = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = df.iloc[0]
df - series

In [None]:
df

In [None]:
series

In [None]:
# Index가 다른 경우 산술 연산
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
df + series2

In [None]:
series3 = df['d']
series3
df.sub(series3, axis='index')

In [None]:
df

In [None]:
series3

In [None]:

df.sub(series3)

In [None]:
# 무지 헤깔리는 axis
df.sub(series3, axis='index')

### Function Application and Mapping

In [None]:
df = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df

In [None]:
# np.abs : 절대값
np.abs(df)

In [None]:
f = lambda x: x.max() - x.min()
df.apply(f)

In [None]:
df.apply(f, axis='columns')

In [None]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

df_1 = df.apply(f)
df_2 = df.apply(f, axis=1)
display('df','df_1','df_2')

In [None]:
df

In [None]:
# Applying a function element by element
# applymap() - used on a DataFrame, map() - used on a Series
# NOTE(review): the name `format` shadows Python's built-in format() for
# the rest of the session.
format = lambda x: '%.2f' % x
df_1 = df.applymap(format)
display('df','df_1')

In [None]:
df['e'].map(format)

### Sorting and Ranking

In [None]:
# sort_index() - index를 기준으로 sort
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

In [None]:
df = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
df_1 = df.sort_index()
df_2 = df.sort_index(axis=1)
df_3 = df.sort_index(axis=1, ascending=False)
display('df','df_1','df_2','df_3')

In [None]:
# Series - sort_values()
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

In [None]:
df = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
df_1 = df.sort_values(by='b')
df_2 = df.sort_values(by=['a', 'b'])
display('df','df_1','df_2')



### Ranking
**rank()** - ***특정변수를 기준으로 순위를 구함***

#### 동점 처리 방법(tie-breaking methods)
1. 평균(method='average'), default
2. 최소값(method='min')
3. 최대값(method='max')
4. 첫번째값(method='first')
5. 조밀하게(method='dense')

In [None]:
# rank() - 특정변수를 기준으로 순위를 구함
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

In [None]:
obj.rank(method='min')

In [None]:
# 높은 값일 수록 1순위 - rank()-ascending
obj.rank(ascending=False, method='min')

In [None]:
df = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
df_1 = df.rank(method='first')
df_2 = df.rank(method='first', axis='columns')
display('df','df_1','df_2')

### Axis Indexes with Duplicate Labels

In [None]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj.index.is_unique)
print('~'*40)
print(obj['a'])
print('~'*40)
print(obj['c'])

In [None]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df.loc['b']

## Summarizing and Computing Descriptive Statistics

In [None]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
print(df)
print('~'*40)
print(df.sum())
print('~'*40)
print(df.sum(axis='columns'))
print('~'*40)

# 최대값을 가지는 index 출력
# Series는 index, DataFrame은 index, columns
print(df.idxmax())
print('~'*40)
print(df.cumsum())

In [None]:
## Correlation and Covariance

In [None]:
price = pd.read_pickle('examples/yahoo_price.pkl')
volume = pd.read_pickle('examples/yahoo_volume.pkl')

In [None]:
# Compute returns
# return = (current price - previous price) / previous price
# The result of pct_change() is not multiplied by 100 here.
returns = price.pct_change()

In [None]:
# pd.corr() - 데이터간 상관관계
# method = 'pearson'
#        = 'kendall'
#        = 'spearman'
returns['MSFT'].corr(returns['IBM'])

In [None]:
returns['MSFT'].cov(returns['IBM'])

In [None]:
import seaborn as sns 
returns.corr()

In [None]:
# 공분산(Covariance)
# 2개의 확률변수의 상관정도를 나타내는 값
returns['MSFT'].cov(returns['IBM'])


In [None]:
# 지정한 변수와 모든 변수간 상관계수
returns.corrwith(returns.IBM)

In [None]:
returns.corrwith(volume)

In [None]:
returns

In [None]:
volume

### Unique Values, Value Counts, and Membership

In [None]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj.unique())
print(obj.value_counts())

In [None]:
obj
mask = obj.isin(['b', 'c'])
mask
obj[mask]

In [None]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

In [None]:
# value_counts() on a DataFrame: count each value per column
df = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                   'Qu2': [2, 3, 1, 2, 3],
                   'Qu3': [1, 5, 2, 4, 4]})
# The top-level pd.value_counts function is deprecated; the Series method
# gives the same counts.  Values absent from a column come back as NaN,
# hence fillna(0).
df_1 = df.apply(pd.Series.value_counts).fillna(0)
display('df', 'df_1')

# Data Loading, Storage and File Formats

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

## Reading and Writing Data in Text Format

In [10]:
# Read CSV file to Dataframe
df = pd.read_csv('examples/ex1.csv')
df_1 = pd.read_table('examples/ex1.csv', sep=',')
display('df','df_1')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [12]:
df_1 = pd.read_csv('examples/ex2.csv', header=None)
df_2 = pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd','message'])
display('df_1','df_2')

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [13]:
# 컬럼명을 지정하고 한개 컬럼을 index로 지정
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('examples/ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [15]:
# 2의 index 설정
df = pd.read_csv('examples/csv_mindex.csv',
                index_col=['key1', 'key2'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [20]:
# Read the raw lines; the with-block closes the file handle afterwards
with open('examples/ex3.txt') as f:
    lines = list(f)
lines

['            A         B         C\n',
 '\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 '\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 '\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 '\n',
 'ddd -0.871858 -0.348382  1.100491\n']

In [21]:
# For files that are not CSV, i.e. whose field delimiter is not a comma, pass the delimiter via the sep argument
# When the delimiter is whitespace of unspecified length, use the regular expression \s+
# (written as a raw string so that the backslash is not treated as a string escape)

pd.read_table('examples/ex3.txt', sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [25]:
# 자료 파일중에 건너뛰어야 할 행이 있다면 skiprows 인수 사용
pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [54]:
result = pd.read_csv('examples/ex5.csv')
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [55]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [56]:
# NaN 값의 지정
# Pandas가 기본으로 인정하는 NaN : NaN, NULL, NA
result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [57]:
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


### Reading Text in Pieces

In [58]:
# pandas options - 출력 max 갯수
pd.options.display.max_rows = 10

In [None]:
df = pd.read_csv('examples/ex6.csv')
df_1 = pd.read_csv('examples/ex6.csv', nrows=5)  # limit the number of rows loaded
# With chunksize the call returns a reader object rather than a DataFrame
df_2 = pd.read_csv('examples/ex6.csv', chunksize=1000)
# Only the DataFrames are displayed: the chunked reader has no HTML
# representation and is shown on its own in the next cell.
display('df', 'df_1')

In [65]:
df_2 = pd.read_csv('examples/ex6.csv', chunksize=1000)
df_2

<pandas.io.parsers.TextFileReader at 0x1f96b6b9100>