In [1]:
import pandas as pd
import numpy as np

In [5]:
frame = pd.DataFrame(np.arange(12).reshape(4,3),
                    columns=list('bde'),
                    index =['Utah','Ohio','Texas','Oregon']
                    )
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [6]:
series = pd.Series(range(3), index=['b','e','f'])
series

b    0
e    1
f    2
dtype: int64

In [7]:
series + frame

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [8]:
series3 = frame['d']
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [12]:
frame[['d','e']]
series3

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: int32

In [11]:
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregon,-1,0,1


In [13]:
frame = pd.DataFrame(np.random.randn(4,3),
                    columns=list('bde'),
                    index =['Utah','Ohio','Texas','Oregon']
                    )
frame

Unnamed: 0,b,d,e
Utah,-2.764401,0.369255,0.050575
Ohio,-1.171207,-0.107625,-0.482804
Texas,-0.775697,2.817052,-1.191876
Oregon,0.815154,0.019845,-0.41492


In [16]:
f = lambda x: x.max() - x.min()
frame.apply(f,axis = 1)

Utah      3.133655
Ohio      1.063581
Texas     4.008928
Oregon    1.230074
dtype: float64

In [22]:
def f(x):
    return pd.Series([x.min(), x.max(),x.mean(), x.median()], index=['min','max','mean','median'])
frame.apply(f,axis = 0)

Unnamed: 0,b,d,e
min,-2.764401,-0.107625,-1.191876
max,0.815154,2.817052,0.050575
mean,-0.974037,0.774632,-0.509756
median,-0.973452,0.19455,-0.448862


In [23]:
format = lambda x: '%.2f' % x
frame.applymap(format) #원소 하나하나가 아닌 모두에 적용할때 사용함.

Unnamed: 0,b,d,e
Utah,-2.76,0.37,0.05
Ohio,-1.17,-0.11,-0.48
Texas,-0.78,2.82,-1.19
Oregon,0.82,0.02,-0.41


In [24]:
obj = pd.Series(range(4), index=['d','a','b','c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [25]:
frame = pd.DataFrame(np.arange(8).reshape(2,4),
                    columns=list('dabc'),
                    index =['one','three']
                    )
frame.sort_index()

Unnamed: 0,d,a,b,c
one,0,1,2,3
three,4,5,6,7


In [28]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
one,0,3,2,1
three,4,7,6,5


In [36]:
Tframe = frame.T
Tframe.sort_values(by=['one','three'])#one을 기준으로 sort후 three sort함

Unnamed: 0,one,three
d,0,4
a,1,5
b,2,6
c,3,7


In [39]:
obj = pd.Series([7,-5,7,4,2,0,4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [40]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [41]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [45]:
obj.index.is_unique

True

In [52]:
df = pd.DataFrame([[1.4,np.nan], [7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],
                 index = ['a','b','c','d'], columns=['one', 'two'])
df.sum() #Nan 값은 제거후 더해서 반환함
df.sum(axis='columns')# column을 축으로 (가로로 더해서) 반환함
df.mean(axis='columns', skipna=False) #skipna는 Nan을 skip하지 않고 그냥 Nan반환케함


a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [55]:
df.idxmax() # 최댓값을 가지고 있는 idx반환함
df.cumsum() # 누산

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [56]:
df.describe() #수치형 데이터가 아니면 describe는 count,unique,top등을 반환함

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [57]:
df.info() #성질이 나옴

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     3 non-null      float64
 1   two     2 non-null      float64
dtypes: float64(2)
memory usage: 268.0+ bytes


In [63]:
import pandas_datareader.data as web

In [65]:
all_data={ticker:web.get_data_yahoo(ticker)
         for ticker in ['AAPL','IBM','MSFT','GOOG']}

In [82]:
price = pd.DataFrame({ticker:data['Adj Close']
                     for ticker, data in all_data.items()})
returns = price.pct_change()
returns.tail()


Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-14,0.0241,-0.003279,0.005445,0.008306
2021-07-15,-0.004492,0.004506,-0.005239,-0.006178
2021-07-16,-0.014076,-0.011036,-0.000996,0.004411
2021-07-19,-0.026914,-0.007055,-0.013321,-0.019656
2021-07-20,0.025974,0.014864,0.008339,0.014294


In [78]:
price.describe()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
count,1258.0,1258.0,1258.0,1258.0
mean,62.122833,125.84515,128.748895,1254.741012
std,34.317092,10.10824,62.673004,421.490083
min,22.576918,88.795891,51.4062,736.080017
25%,37.918314,120.54784,73.777445,983.872498
50%,47.612127,125.636597,107.961296,1146.099976
75%,78.222795,130.729427,177.502026,1416.349976
max,149.149994,151.279999,282.51001,2641.649902


In [74]:
price.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1258 entries, 2016-07-21 to 2021-07-20
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    1258 non-null   float64
 1   IBM     1258 non-null   float64
 2   MSFT    1258 non-null   float64
 3   GOOG    1258 non-null   float64
dtypes: float64(4)
memory usage: 49.1 KB


In [80]:
def f(x):
    return pd.Series([x.min(), x.max(),x.mean(), x.median()], index=['min','max','mean','median'])
price.apply(f,axis = 0)

Unnamed: 0,AAPL,IBM,MSFT,GOOG
min,22.576918,88.795891,51.4062,736.080017
max,149.149994,151.279999,282.51001,2641.649902
mean,62.122833,125.84515,128.748895,1254.741012
median,47.612127,125.636597,107.961296,1146.099976


In [84]:
returns['MSFT'].corr(returns['IBM'])

0.5182176565553225

In [89]:
#DataFrame.corr() : 데이터프레임의 모든 영사이 상관관계
#DataFrame.corrwith(열B): 데이터 프레임의 모든 열과 열B의 상관관계
returns.corr() 



1258

In [90]:
obj = pd.Series(['c','a','d','a','a','b','b','c','c'])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [93]:
mask = obj.isin(['b','c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [96]:
to_match = pd.Series(['c','a','b','b','c','a'])
unique_vals = pd.Series(['c','b','a'])
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [100]:
data = pd.DataFrame({'Qu1':[1,3,4,3,4],
                    'Qu2':[2,3,1,2,3],
                    'Qu3':[1,5,2,4,4]})
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [5]:
data = pd.DataFrame([[1., 6.5, 3.], [1.,pd.NA,pd.NA],[pd.NA,pd.NA,pd.NA],[pd.NA,6.5,3.0]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [6]:
data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [8]:
df = pd.DataFrame(np.random.randn(6,3))
df

Unnamed: 0,0,1,2
0,-1.430106,-1.497121,0.127192
1,0.019965,0.206787,2.185118
2,-0.052829,0.895593,-0.496855
3,0.126822,0.993147,0.586104
4,-0.587246,-1.133703,0.107127
5,1.559398,0.828572,-0.920504


In [12]:
df.iloc[2:, 1] = pd.NA
df.iloc[4:, 2] = pd.NA
df

Unnamed: 0,0,1,2
0,-1.430106,-1.497121,0.127192
1,0.019965,0.206787,2.185118
2,-0.052829,,-0.496855
3,0.126822,,0.586104
4,-0.587246,,
5,1.559398,,


In [13]:
df.fillna(method='ffill') # 앞에걸로 na값 채우기

Unnamed: 0,0,1,2
0,-1.430106,-1.497121,0.127192
1,0.019965,0.206787,2.185118
2,-0.052829,0.206787,-0.496855
3,0.126822,0.206787,0.586104
4,-0.587246,0.206787,0.586104
5,1.559398,0.206787,0.586104


In [None]:
df.fillna(method='ffill', limit = 2) # 앞에걸로 na값 채우기, 2개까지만 채우고 나머진 놔두기

In [15]:
data = pd.Series([1.,pd.NA,3.5,pd.NA,7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [25]:
data = pd.DataFrame({'k1':['one','two'] * 3 + ['two'],
                    'k2':[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [19]:
from numpy import nan as NA

In [28]:
data.duplicated() # 각 로우가 중복인지 아닌지 알려줌  - 만약 괄호안에 값이 있으면 그 열이 중복인지

0    False
1    False
2     True
3     True
4     True
5     True
6     True
dtype: bool

In [21]:
data.drop_duplicates() # duplicated배열이 FALSE인 DataFrame 반환

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [27]:
data['v1'] = range(7)
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [29]:
data.drop_duplicates(['k1','k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6
