### 자료구조: Series와 DataFrame
Pandas가 제공하는 주요 자료구조는 Series와 DataFrame 두 가지이다. Series는 시계열과 유사한 1차원 데이터로, index와 value로 이루어져 있다. DataFrame은 딕셔너리 데이터를 행렬(표) 형태로 만든 것과 같은 2차원 구조이다. 이러한 자료구조를 통해 시계열 데이터와 비시계열 데이터를 통합하여 다룰 수 있다.

### Series

In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [9]:
# Fruit prices as a Series; the fruit names serve as the index labels.
fruit = Series(
    data=[2500, 3800, 1200, 6000],
    index=['apple', 'banana', 'pear', 'cherry'],
)

In [10]:
fruit

apple     2500
banana    3800
pear      1200
cherry    6000
dtype: int64

In [11]:
# .values exposes just the data of the Series, without the index labels.
print(fruit.values)

[2500 3800 1200 6000]


In [12]:
# .index exposes the labels of the Series as a pandas Index object.
print(fruit.index)

Index(['apple', 'banana', 'pear', 'cherry'], dtype='object')


In [13]:
# A plain dict converts directly to a Series: keys become the index, values the data.
fruitData = {
    'apple': 2500,
    'banana': 3800,
    'pear': 1200,
    'cherry': 6000,
}
fruit = Series(data=fruitData)

In [15]:
# Wrap each expression in print(): without it the notebook would not show both
# results for this cell.
print(type(fruitData))
print(type(fruit))

<class 'dict'>
<class 'pandas.core.series.Series'>


In [16]:
fruit

apple     2500
banana    3800
pear      1200
cherry    6000
dtype: int64

In [19]:
# Name the values and the index; both names appear in the Series repr afterwards.
fruit.name='fruitPrice'
fruit.index.name ='fruitName'

In [20]:
fruit

fruitName
apple     2500
banana    3800
pear      1200
cherry    6000
Name: fruitPrice, dtype: int64

In [21]:
# Column-oriented data: each key (e.g. 'fruitName') maps to one list, and that
# whole list is the single value stored under the key.
fruitData = {
    'fruitName': ['apple', 'banana', 'cherry', 'pear'],
    'fruitPrice': [2500, 3800, 6000, 1200],
    'num': [10, 5, 3, 8],
}

In [22]:
# Each dict key becomes a column; rows receive the integer labels 0..3.
fruitFrame = DataFrame(fruitData)

In [24]:
# Leaving out print() lets the notebook render the frame as a nicer table.
fruitFrame

Unnamed: 0,fruitName,fruitPrice,num
0,apple,2500,10
1,banana,3800,5
2,cherry,6000,3
3,pear,1200,8


In [25]:
# columns= fixes the column order, so the columns can be rearranged at construction.
fruitFrame=DataFrame(fruitData, columns = ['fruitPrice','num','fruitName'])

In [26]:
fruitFrame

Unnamed: 0,fruitPrice,num,fruitName
0,2500,10,apple
1,3800,5,banana
2,6000,3,cherry
3,1200,8,pear


In [27]:
# Bracket access with a column name returns that column as a Series.
fruitFrame['fruitName']

0     apple
1    banana
2    cherry
3      pear
Name: fruitName, dtype: object

In [28]:
# Attribute access returns the same column as fruitFrame['fruitName'].
fruitFrame.fruitName

0     apple
1    banana
2    cherry
3      pear
Name: fruitName, dtype: object

In [33]:
# Add a new column; the scalar '2022' is repeated in every row.
fruitFrame['Year'] = '2022'

In [34]:
fruitFrame

Unnamed: 0,fruitPrice,num,fruitName,Year
0,2500,10,apple,2022
1,3800,5,banana,2022
2,6000,3,cherry,2022
3,1200,8,pear,2022


In [36]:
# Series with explicitly chosen integer labels 0, 2 and 3 (label 1 is absent).
variable = Series(data=[4, 2, 1], index=[0, 2, 3])
print(variable)

0    4
2    2
3    1
dtype: int64


In [37]:
# Assigning a Series as a column matches its index labels against the frame's rows.
fruitFrame['stock']=variable

In [38]:
# Only the rows labelled 0, 2 and 3 received a value; row 1 stays empty.
fruitFrame

Unnamed: 0,fruitPrice,num,fruitName,Year,stock
0,2500,10,apple,2022,4.0
1,3800,5,banana,2022,
2,6000,3,cherry,2022,2.0
3,1200,8,pear,2022,1.0


In [40]:
# Rebuild the price Series, labelled by fruit name, and display it.
fruit = Series(
    data=[2500, 3800, 1200, 6000],
    index=['apple', 'banana', 'pear', 'cherry'],
)
fruit

apple     2500
banana    3800
pear      1200
cherry    6000
dtype: int64

In [41]:
# drop() removes a label but does not touch the original Series; it returns a
# copy with 'banana' removed.
fruit.drop('banana')

apple     2500
pear      1200
cherry    6000
dtype: int64

In [42]:
fruit

apple     2500
banana    3800
pear      1200
cherry    6000
dtype: int64

In [43]:
# Bind the copy returned by drop() to a new name to keep the result.
new_fruit = fruit.drop('banana')
new_fruit

apple     2500
pear      1200
cherry    6000
dtype: int64

In [44]:
fruitData

{'fruitName': ['apple', 'banana', 'cherry', 'pear'],
 'fruitPrice': [2500, 3800, 6000, 1200],
 'num': [10, 5, 3, 8]}

In [45]:
# Pull the list of fruit names out of the dict and display it.
fruitName = fruitData['fruitName']
fruitName

['apple', 'banana', 'cherry', 'pear']

In [46]:
# Use the fruit names as the row index and keep only the price and count columns.
fruitFrame = DataFrame(data=fruitData, columns=['fruitPrice', 'num'], index=fruitName)
fruitFrame

Unnamed: 0,fruitPrice,num
apple,2500,10
banana,3800,5
cherry,6000,3
pear,1200,8


In [48]:
# Drop rows by their index labels; the result is stored under a new name.
fruitFrame2 = fruitFrame.drop(['apple','cherry'])
fruitFrame2

Unnamed: 0,fruitPrice,num
banana,3800,5
pear,1200,8


In [49]:
# axis=1 targets columns, so the 'num' column is removed instead of a row.
fruitFrame3 = fruitFrame.drop('num', axis=1)
fruitFrame3

Unnamed: 0,fruitPrice
apple,2500
banana,3800
cherry,6000
pear,1200


In [50]:
fruit

apple     2500
banana    3800
pear      1200
cherry    6000
dtype: int64

In [51]:
# Slice by index labels; the end label 'pear' is included in the result.
fruit['apple':'pear']

apple     2500
banana    3800
pear      1200
dtype: int64

In [53]:
# Integer slice through []: selects by position (only the first row here), not by label.
# The original note marks this form as "do not use".
fruit[0:1]

apple    2500
dtype: int64

In [55]:
# Row slice by index label, written start:stop[:step]; the stop label 'banana'
# is included in the result.
fruitFrame['apple':'banana']

Unnamed: 0,fruitPrice,num
apple,2500,10
banana,3800,5


In [56]:
# Two Series whose labels only partly overlap (apple, banana and cherry are shared).
fruit1 = Series(data=[5, 9, 10, 3], index=['apple', 'banana', 'cherry', 'pear'])
fruit2 = Series(
    data=[3, 2, 9, 5, 10],
    index=['apple', 'orange', 'banana', 'cherry', 'mango'],
)

In [57]:
# Addition matches index labels: labels present in both Series are summed, while
# labels found on only one side (mango, orange, pear) come out as NaN.
fruit1 + fruit2

apple      8.0
banana    18.0
cherry    15.0
mango      NaN
orange     NaN
pear       NaN
dtype: float64

In [60]:
# Raw column data for two frames that share only the 'Ohio' column.
fruitData1 = {'Ohio': [4, 8, 3, 5], 'Texas': [0, 1, 2, 3]}
fruitData2 = {'Ohio': [3, 0, 2, 1, 7], 'Colorado': [5, 4, 3, 6, 0]}

# Frames indexed by fruit name; their row labels overlap only partially.
fruitFrame1 = DataFrame(
    data=fruitData1,
    index=['apple', 'banana', 'cherry', 'pear'],
    columns=['Ohio', 'Texas'],
)
fruitFrame2 = DataFrame(
    data=fruitData2,
    index=['apple', 'orange', 'banana', 'cherry', 'mango'],
    columns=['Ohio', 'Colorado'],
)

In [61]:
fruitFrame1

Unnamed: 0,Ohio,Texas
apple,4,0
banana,8,1
cherry,3,2
pear,5,3


In [62]:
fruitFrame2

Unnamed: 0,Ohio,Colorado
apple,3,5
orange,0,4
banana,2,3
cherry,1,6
mango,7,0


In [63]:
# Frame addition matches both row labels and column names: only 'Ohio' for
# apple/banana/cherry exists on both sides, so every other cell is NaN.
fruitFrame1 + fruitFrame2

Unnamed: 0,Colorado,Ohio,Texas
apple,,7.0,
banana,,10.0,
cherry,,4.0,
mango,,,
orange,,,
pear,,,


In [64]:
fruit

apple     2500
banana    3800
pear      1200
cherry    6000
dtype: int64

In [66]:
# sort_values() sorts ascending by default; ascending=False gives descending order.
fruit.sort_values(ascending=False)

cherry    6000
banana    3800
apple     2500
pear      1200
dtype: int64

In [67]:
# Sort by index label instead of by value (alphabetical order of the fruit names).
fruit.sort_index()

apple     2500
banana    3800
cherry    6000
pear      1200
dtype: int64

In [69]:
# Sort the rows by index label; the labels are already alphabetical here, so the
# row order stays the same.
fruitFrame.sort_index()

Unnamed: 0,fruitPrice,num
apple,2500,10
banana,3800,5
cherry,6000,3
pear,1200,8


In [71]:
# by= names the sort keys: rows are ordered by 'fruitPrice', then by 'num'.
fruitFrame.sort_values(by=['fruitPrice','num'])

Unnamed: 0,fruitPrice,num
pear,1200,8
apple,2500,10
banana,3800,5
cherry,6000,3


In [111]:
# pd.read_csv can load a file from a local path or directly from a web URL.
# NOTE(review): this downloads over plain HTTP on every run; confirm that is acceptable.
german=pd.read_csv('http://freakonometrics.free.fr/german_credit.csv')
german

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
0,1,1,18,4,2,1049,1,2,4,2,...,4,2,21,3,1,1,3,1,1,1
1,1,1,9,4,0,2799,1,3,2,3,...,2,1,36,3,1,2,3,2,1,1
2,1,2,12,2,9,841,2,4,2,2,...,4,1,23,3,1,1,2,1,1,1
3,1,1,12,4,0,2122,1,3,3,3,...,2,1,39,3,1,2,2,2,1,2
4,1,1,12,4,0,2171,1,3,4,3,...,4,2,38,1,2,2,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,1,24,2,3,1987,1,3,2,3,...,4,1,21,3,1,1,2,2,1,1
996,0,1,24,2,0,2303,1,5,4,3,...,1,1,45,3,2,1,3,1,1,1
997,0,4,21,4,0,12680,5,5,4,3,...,4,4,30,3,3,1,4,1,2,1
998,0,2,12,2,3,6468,5,1,2,3,...,1,4,52,3,2,1,4,1,2,1


In [76]:
# Extract only the column names, as a plain Python list.
german.columns.tolist()

['Creditability',
 'Account Balance',
 'Duration of Credit (month)',
 'Payment Status of Previous Credit',
 'Purpose',
 'Credit Amount',
 'Value Savings/Stocks',
 'Length of current employment',
 'Instalment per cent',
 'Sex & Marital Status',
 'Guarantors',
 'Duration in Current address',
 'Most valuable available asset',
 'Age (years)',
 'Concurrent Credits',
 'Type of apartment',
 'No of Credits at this Bank',
 'Occupation',
 'No of dependents',
 'Telephone',
 'Foreign Worker']

In [81]:
# Keep only the columns of interest in a separate variable and display it.
german_sample = german[['Creditability','Duration of Credit (month)', 'Purpose', 'Credit Amount']]
german_sample

Unnamed: 0,Creditability,Duration of Credit (month),Purpose,Credit Amount
0,1,18,2,1049
1,1,9,0,2799
2,1,12,9,841
3,1,12,0,2122
4,1,12,0,2171
...,...,...,...,...
995,0,24,3,1987
996,0,24,0,2303
997,0,21,0,12680
998,0,12,3,6468


In [82]:
# Minimum of each column.
german_sample.min()

Creditability                   0
Duration of Credit (month)      4
Purpose                         0
Credit Amount                 250
dtype: int64

In [83]:
# Maximum of each column.
german_sample.max()

Creditability                     1
Duration of Credit (month)       72
Purpose                          10
Credit Amount                 18424
dtype: int64

In [84]:
# Mean of each column.
german_sample.mean()

Creditability                    0.700
Duration of Credit (month)      20.903
Purpose                          2.828
Credit Amount                 3271.248
dtype: float64

In [85]:
# NOTE: the call parentheses are missing, so this evaluates to the bound method
# itself (the notebook shows "<bound method NDFrame.describe of ...>") instead of
# a summary of the data. Call german_sample.describe() to get the summary.
german_sample.describe

<bound method NDFrame.describe of      Creditability  Duration of Credit (month)  Purpose  Credit Amount
0                1                          18        2           1049
1                1                           9        0           2799
2                1                          12        9            841
3                1                          12        0           2122
4                1                          12        0           2171
..             ...                         ...      ...            ...
995              0                          24        3           1987
996              0                          24        0           2303
997              0                          21        0          12680
998              0                          12        3           6468
999              0                          30        2           6350

[1000 rows x 4 columns]>

In [86]:
# Re-select three numeric columns for the correlation analysis that follows.
german_sample=german[['Duration of Credit (month)', 'Credit Amount', 'Age (years)' ]]

In [88]:
# Pairwise correlation coefficients between the columns (1.0 = perfectly correlated).
german_sample.corr()

Unnamed: 0,Duration of Credit (month),Credit Amount,Age (years)
Duration of Credit (month),1.0,0.624988,-0.03755
Credit Amount,0.624988,1.0,0.032273
Age (years),-0.03755,0.032273,1.0


In [121]:
# Keep the credit amount together with the housing-type column used for grouping.
german_sample = german[['Credit Amount','Type of apartment']]

In [122]:
german_sample

Unnamed: 0,Credit Amount,Type of apartment
0,1049,1
1,2799,1
2,841,1
3,2122,1
4,2171,2
...,...,...
995,1987,1
996,2303,2
997,12680,3
998,6468,2


In [124]:
# Group the credit amounts by housing type ('Type of apartment').
german_grouped = german_sample.groupby('Type of apartment')['Credit Amount']

In [125]:
# Grouping alone computes nothing visible; the aggregation (here the per-group
# mean) has to be requested explicitly.
german_grouped.mean()

Type of apartment
1    3122.553073
2    3067.257703
3    4881.205607
Name: Credit Amount, dtype: float64

In [98]:
# Add 'Purpose' so the data can be grouped by two keys.
german_sample = german[['Credit Amount','Type of apartment', 'Purpose']]

In [102]:
# Group the credit amounts by two keys: loan purpose first, then housing type.
group_keys = ['Purpose', 'Type of apartment']
german_grouped = german_sample.groupby(group_keys)['Credit Amount']
german_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000025635FF4250>

In [103]:
# Mean credit amount for every (Purpose, Type of apartment) combination; the
# result is indexed by both grouping keys.
german_grouped.mean()

Purpose  Type of apartment
0        1                    2597.225000
         2                    2811.024242
         3                    5138.689655
1        1                    5037.086957
         2                    4915.222222
         3                    6609.923077
2        1                    2727.354167
         2                    3107.450820
         3                    4100.181818
3        1                    2199.763158
         2                    2540.533040
         3                    2417.333333
4        1                    1255.500000
         2                    1546.500000
5        1                    1522.000000
         2                    2866.000000
         3                    2750.666667
6        1                    3156.444444
         2                    2492.423077
         3                    4387.266667
8        1                     902.000000
         2                    1243.875000
9        1                    5614.125000
       