# 판다스 (pandas)

In [3]:
# !pip install pandas

In [4]:
import numpy as np
import pandas as pd

In [5]:
# 판다스 Series
data1 = np.arange(1,6)
print(data1)
data2 = pd.Series(data1)
print(data2)
data2.index , data2.values

[1 2 3 4 5]
0    1
1    2
2    3
3    4
4    5
dtype: int64


(RangeIndex(start=0, stop=5, step=1), array([1, 2, 3, 4, 5]))

In [6]:
data3 = pd.Series([11,22,33,44,55])
data3

0    11
1    22
2    33
3    44
4    55
dtype: int64

In [7]:
dd = {"apple":4400,"banana":3500,"kiwi":2200,"orange":1700,"mango":8800}
data4 = pd.Series(dd)
data4

apple     4400
banana    3500
kiwi      2200
orange    1700
mango     8800
dtype: int64

In [8]:
print(type(data4))
data4.name = "과일가격표"
data4.index.name = "과일이름"
data4 , data4.index , data4.values

<class 'pandas.core.series.Series'>


(과일이름
 apple     4400
 banana    3500
 kiwi      2200
 orange    1700
 mango     8800
 Name: 과일가격표, dtype: int64,
 Index(['apple', 'banana', 'kiwi', 'orange', 'mango'], dtype='object', name='과일이름'),
 array([4400, 3500, 2200, 1700, 8800]))

In [9]:
data2.index = ["aa","bb","cc","dd","ee"] # 개수가 같아야 함
data2 , data2.index, data2.values

(aa    1
 bb    2
 cc    3
 dd    4
 ee    5
 dtype: int64,
 Index(['aa', 'bb', 'cc', 'dd', 'ee'], dtype='object'),
 array([1, 2, 3, 4, 5]))

In [10]:
# 판다스 시리즈 데이터 타입 설정, 인덱스 수정
data5 = pd.Series(np.arange(1,7), dtype="float", index=["aa","bb","cc","dd","ee","ff"] )
data5 , data5.shape, data5.size, data5.ndim

(aa    1.0
 bb    2.0
 cc    3.0
 dd    4.0
 ee    5.0
 ff    6.0
 dtype: float64,
 (6,),
 6,
 1)

In [11]:
# 시리즈 원소에 접근방법
print(data5)
print("data5['dd']:",data5['dd']) 
print("data5.loc['dd']:",data5.loc['dd'])
print("data5[3]:",data5[3]) # deprecated
print("data5.iloc[3]:",data5.iloc[3])

aa    1.0
bb    2.0
cc    3.0
dd    4.0
ee    5.0
ff    6.0
dtype: float64
data5['dd']: 4.0
data5.loc['dd']: 4.0
data5[3]: 4.0
data5.iloc[3]: 4.0


  print("data5[3]:",data5[3]) # deprecated


## 데이터프레임 Dataframe

In [12]:
import pandas as pd
df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]],index=["a","b","c"],columns=["A","B","C"])
df

Unnamed: 0,A,B,C
a,1,2,3
b,4,5,6
c,7,8,9


In [13]:
num = np.arange(1,10).reshape(3,3)
df = pd.DataFrame(num,index=["a","b","c"],columns=["A","B","C"])
df

Unnamed: 0,A,B,C
a,1,2,3
b,4,5,6
c,7,8,9


In [14]:
dd = {"과일이름":["apple","banana","kiwi","melon","orange","mango"],
      "가격":[4400,5000,2500,7000,2000,8800],
      "개수":[3,8,11,22,6,5]}
df2 = pd.DataFrame(dd)
df2.index = range(1,7)
df2.index.name = "No."
df2 , df2.columns, df2.values

(       과일이름    가격  개수
 No.                  
 1     apple  4400   3
 2    banana  5000   8
 3      kiwi  2500  11
 4     melon  7000  22
 5    orange  2000   6
 6     mango  8800   5,
 Index(['과일이름', '가격', '개수'], dtype='object'),
 array([['apple', 4400, 3],
        ['banana', 5000, 8],
        ['kiwi', 2500, 11],
        ['melon', 7000, 22],
        ['orange', 2000, 6],
        ['mango', 8800, 5]], dtype=object))

In [15]:
df2

Unnamed: 0_level_0,과일이름,가격,개수
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,apple,4400,3
2,banana,5000,8
3,kiwi,2500,11
4,melon,7000,22
5,orange,2000,6
6,mango,8800,5


In [16]:
df2.columns.name = "info"
df2.index.name =''
df2.columns = ['과일이름','원가','수량',]
df2

Unnamed: 0,과일이름,원가,수량
,,,
1.0,apple,4400.0,3.0
2.0,banana,5000.0,8.0
3.0,kiwi,2500.0,11.0
4.0,melon,7000.0,22.0
5.0,orange,2000.0,6.0
6.0,mango,8800.0,5.0


In [17]:
df2.describe()

Unnamed: 0,원가,수량
count,6.0,6.0
mean,4950.0,9.166667
std,2609.022806,6.853223
min,2000.0,3.0
25%,2975.0,5.25
50%,4700.0,7.0
75%,6500.0,10.25
max,8800.0,22.0


In [18]:
df3 = pd.DataFrame(df2, columns = ['과일이름','품종','원가','할인','수량','합계'])
df3 , df3.dtypes

(     과일이름  품종    원가  할인  수량  합계
                                
 1   apple NaN  4400 NaN   3 NaN
 2  banana NaN  5000 NaN   8 NaN
 3    kiwi NaN  2500 NaN  11 NaN
 4   melon NaN  7000 NaN  22 NaN
 5  orange NaN  2000 NaN   6 NaN
 6   mango NaN  8800 NaN   5 NaN,
 과일이름     object
 품종      float64
 원가        int64
 할인      float64
 수량        int64
 합계      float64
 dtype: object)

In [19]:
df3["품종"] = df3["품종"].astype('object')
# df3.loc[:,'품종'] = df3.loc[:,'품종'].astype('object')
print(df3.dtypes)
df3.loc[:,'품종'] = ['신선한','궁금한','맛있는','최적의','웰빙','유기농']
df3.loc[:,'할인'] = [0.2,0.3,0.2,0.1,0.2,0.3]
df3

과일이름     object
품종       object
원가        int64
할인      float64
수량        int64
합계      float64
dtype: object


Unnamed: 0,과일이름,품종,원가,할인,수량,합계
,,,,,,
1.0,apple,신선한,4400.0,0.2,3.0,
2.0,banana,궁금한,5000.0,0.3,8.0,
3.0,kiwi,맛있는,2500.0,0.2,11.0,
4.0,melon,최적의,7000.0,0.1,22.0,
5.0,orange,웰빙,2000.0,0.2,6.0,
6.0,mango,유기농,8800.0,0.3,5.0,


In [20]:
df3['합계'] = (df3["원가"] * df3["수량"] * (1 - df3["할인"])).astype('int')
df3

Unnamed: 0,과일이름,품종,원가,할인,수량,합계
,,,,,,
1.0,apple,신선한,4400.0,0.2,3.0,10560.0
2.0,banana,궁금한,5000.0,0.3,8.0,28000.0
3.0,kiwi,맛있는,2500.0,0.2,11.0,22000.0
4.0,melon,최적의,7000.0,0.1,22.0,138600.0
5.0,orange,웰빙,2000.0,0.2,6.0,9600.0
6.0,mango,유기농,8800.0,0.3,5.0,30799.0


In [21]:
df3['ETC'] = "-"
df3['Test'] = "test"
print(df3.dtypes)
df3 

과일이름     object
품종       object
원가        int64
할인      float64
수량        int64
합계        int64
ETC      object
Test     object
dtype: object


Unnamed: 0,과일이름,품종,원가,할인,수량,합계,ETC,Test
,,,,,,,,
1.0,apple,신선한,4400.0,0.2,3.0,10560.0,-,test
2.0,banana,궁금한,5000.0,0.3,8.0,28000.0,-,test
3.0,kiwi,맛있는,2500.0,0.2,11.0,22000.0,-,test
4.0,melon,최적의,7000.0,0.1,22.0,138600.0,-,test
5.0,orange,웰빙,2000.0,0.2,6.0,9600.0,-,test
6.0,mango,유기농,8800.0,0.3,5.0,30799.0,-,test


In [22]:
del df3['Test']
df3

Unnamed: 0,과일이름,품종,원가,할인,수량,합계,ETC
,,,,,,,
1.0,apple,신선한,4400.0,0.2,3.0,10560.0,-
2.0,banana,궁금한,5000.0,0.3,8.0,28000.0,-
3.0,kiwi,맛있는,2500.0,0.2,11.0,22000.0,-
4.0,melon,최적의,7000.0,0.1,22.0,138600.0,-
5.0,orange,웰빙,2000.0,0.2,6.0,9600.0,-
6.0,mango,유기농,8800.0,0.3,5.0,30799.0,-


In [23]:
ee = pd.Series([1.3,2.2], index=[2,4])
df3['ETC'] = ee
df3 = df3.fillna('')
df3

Unnamed: 0,과일이름,품종,원가,할인,수량,합계,ETC
,,,,,,,
1.0,apple,신선한,4400.0,0.2,3.0,10560.0,
2.0,banana,궁금한,5000.0,0.3,8.0,28000.0,1.3
3.0,kiwi,맛있는,2500.0,0.2,11.0,22000.0,
4.0,melon,최적의,7000.0,0.1,22.0,138600.0,2.2
5.0,orange,웰빙,2000.0,0.2,6.0,9600.0,
6.0,mango,유기농,8800.0,0.3,5.0,30799.0,


In [24]:
df3['PASS'] = ''
df3

Unnamed: 0,과일이름,품종,원가,할인,수량,합계,ETC,PASS
,,,,,,,,
1.0,apple,신선한,4400.0,0.2,3.0,10560.0,,
2.0,banana,궁금한,5000.0,0.3,8.0,28000.0,1.3,
3.0,kiwi,맛있는,2500.0,0.2,11.0,22000.0,,
4.0,melon,최적의,7000.0,0.1,22.0,138600.0,2.2,
5.0,orange,웰빙,2000.0,0.2,6.0,9600.0,,
6.0,mango,유기농,8800.0,0.3,5.0,30799.0,,


In [25]:
pp = np.where(df3['합계']<=30000,'구매각','-')
df3['PASS'] = pp
df3

Unnamed: 0,과일이름,품종,원가,할인,수량,합계,ETC,PASS
,,,,,,,,
1.0,apple,신선한,4400.0,0.2,3.0,10560.0,,구매각
2.0,banana,궁금한,5000.0,0.3,8.0,28000.0,1.3,구매각
3.0,kiwi,맛있는,2500.0,0.2,11.0,22000.0,,구매각
4.0,melon,최적의,7000.0,0.1,22.0,138600.0,2.2,-
5.0,orange,웰빙,2000.0,0.2,6.0,9600.0,,구매각
6.0,mango,유기농,8800.0,0.3,5.0,30799.0,,-


In [26]:
df3['DV']=np.where(df3['수량'] >7,'포장','번들')
df3

Unnamed: 0,과일이름,품종,원가,할인,수량,합계,ETC,PASS,DV
,,,,,,,,,
1.0,apple,신선한,4400.0,0.2,3.0,10560.0,,구매각,번들
2.0,banana,궁금한,5000.0,0.3,8.0,28000.0,1.3,구매각,포장
3.0,kiwi,맛있는,2500.0,0.2,11.0,22000.0,,구매각,포장
4.0,melon,최적의,7000.0,0.1,22.0,138600.0,2.2,-,포장
5.0,orange,웰빙,2000.0,0.2,6.0,9600.0,,구매각,번들
6.0,mango,유기농,8800.0,0.3,5.0,30799.0,,-,번들


In [39]:
# df3.iloc[1,1:4]
# df3.loc[2,:]
df3.loc[df3.loc[:,'과일이름']=='banana','품종':'수량']

Unnamed: 0,품종,원가,할인,수량
,,,,
2.0,궁금한,5000.0,0.3,8.0


In [44]:
df3.loc[df3.loc[:,'PASS']=='구매각',['과일이름','품종','원가','PASS']]

Unnamed: 0,과일이름,품종,원가,PASS
,,,,
1.0,apple,신선한,4400.0,구매각
2.0,banana,궁금한,5000.0,구매각
3.0,kiwi,맛있는,2500.0,구매각
5.0,orange,웰빙,2000.0,구매각


In [68]:
# df3.iloc[1:4,0:4]
# df3.loc['1':'5','과일이름':'할인']
# df3.iloc[[0,2,4],[0,2,5,7]]
df3.loc[[1,3,5],['과일이름','원가','합계','PASS']]

Unnamed: 0,과일이름,원가,합계,PASS
,,,,
1.0,apple,4400.0,10560.0,구매각
3.0,kiwi,2500.0,22000.0,구매각
5.0,orange,2000.0,9600.0,구매각


In [87]:
import numpy as np
import pandas as pd
print("인구수 시리즈데이터")
population_dic={
    'korea':5182,
    'japan':12622,
    'china':141178,
    'usa':32976
}
print("GDP 시리즈데이터")
GDP_dic = {
    'korea':169320000,
    'japan':516700000,
    'china':140925000,
    'usa':2041280000
}
population = pd.Series(population_dic)
GDP = pd.Series(GDP_dic)
print(population)
print(GDP)
print("="*30)
print("데이터 프레임 생성")
country = pd.DataFrame({"population":population,"GDP":GDP})
country


인구수 시리즈데이터
GDP 시리즈데이터
korea      5182
japan     12622
china    141178
usa       32976
dtype: int64
korea     169320000
japan     516700000
china     140925000
usa      2041280000
dtype: int64
데이터 프레임 생성


Unnamed: 0,population,GDP
korea,5182,169320000
japan,12622,516700000
china,141178,140925000
usa,32976,2041280000


In [88]:
country.loc[['korea'],:]

Unnamed: 0,population,GDP
korea,5182,169320000


In [89]:
country.loc['india'] = [142222, 150505000]
country

Unnamed: 0,population,GDP
korea,5182,169320000
japan,12622,516700000
china,141178,140925000
usa,32976,2041280000
india,142222,150505000


In [90]:
country['GDP_per'] = country['GDP'] / country["population"]
country

Unnamed: 0,population,GDP,GDP_per
korea,5182,169320000,32674.642995
japan,12622,516700000,40936.460149
china,141178,140925000,998.207936
usa,32976,2041280000,61901.989326
india,142222,150505000,1058.239935


In [107]:
country.loc['africa'] = pd.Series([123456,7777777], index=["population","GDP"])
country

Unnamed: 0,population,GDP,GDP_per
korea,5182.0,169320000.0,32674.642995
japan,12622.0,516700000.0,40936.460149
china,141178.0,140925000.0,998.207936
usa,32976.0,2041280000.0,61901.989326
india,142222.0,150505000.0,1058.239935
africa,123456.0,7777777.0,


In [124]:
country = country.drop('africa', axis=0)
# country.drop(country[country.isnull().any(axis=1)].index, axis=0)
country

Unnamed: 0,population,GDP,GDP_per
korea,5182.0,169320000.0,32674.642995
japan,12622.0,516700000.0,40936.460149
china,141178.0,140925000.0,998.207936
usa,32976.0,2041280000.0,61901.989326
india,142222.0,150505000.0,1058.239935
