## [ DataFrame 정렬 ]
- 종류 => 인덱스 기반 정렬, 데이터/값 기반 정렬

- (1) 모듈 로딩 

In [1]:
import pandas as pd

- (2) 데이터 준비

In [37]:
# Relative path to the movies CSV file
datafile = '../DATA/movies.csv'

- (3) 데이터 저장 => DataFrame
    * 파일 데이터 읽어서 저장 => pandas.read_파일확장자() 사용 (예: pandas.read_csv()) => 반환 : DataFrame

In [38]:
# Load the CSV file into a DataFrame
movieDF = pd.read_csv(datafile)

- (4) 데이터 확인

In [39]:
# Display the loaded DataFrame
movieDF

Unnamed: 0,Rank,Title,Studio,Gross,Year
0,1,Avengers: Endgame,Buena Vista,"$2,796.30",2019
1,2,Avatar,Fox,"$2,789.70",2009
2,3,Titanic,Paramount,"$2,187.50",1997
3,4,Star Wars: The Force Awakens,Buena Vista,"$2,068.20",2015
4,5,Avengers: Infinity War,Buena Vista,"$2,048.40",2018
...,...,...,...,...,...
777,778,Yogi Bear,Warner Brothers,$201.60,2010
778,779,Garfield: The Movie,Fox,$200.80,2004
779,780,Cats & Dogs,Warner Brothers,$200.70,2001
780,781,The Hunt for Red October,Paramount,$200.50,1990


-  데이터 확인 관련 메소드

In [40]:
# Summarize the overall structure of the data and each column's dtype
movieDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rank    782 non-null    int64 
 1   Title   782 non-null    object
 2   Studio  782 non-null    object
 3   Gross   782 non-null    object
 4   Year    782 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 30.7+ KB


In [41]:
# Per-column summary statistics; include='all' covers non-numeric columns as well
movieDF.describe(include='all')

Unnamed: 0,Rank,Title,Studio,Gross,Year
count,782.0,782,782,782,782.0
unique,,773,37,701,
top,,Beauty and the Beast,Warner Brothers,$225.90,
freq,,2,132,3,
mean,391.5,,,,2006.620205
std,225.888247,,,,10.026227
min,1.0,,,,1939.0
25%,196.25,,,,2001.0
50%,391.5,,,,2009.0
75%,586.75,,,,2014.0


In [42]:
# Keep only the min / max / mean rows of the numeric-column summary
df1 = movieDF.describe().loc[['min', 'max', 'mean'], :]
print(df1)

       Rank         Year
min     1.0  1939.000000
max   782.0  2019.000000
mean  391.5  2006.620205


- ## 행=>열, 열=>행 전치(transpose) 기능 속성 및 메소드

In [33]:
# Transpose df1 by hand: each original column becomes one row of the new frame.
# Iterating over df1.columns (instead of naming Rank / Year explicitly) keeps the
# number of rows in step with index=df1.columns for any set of columns.
pd.DataFrame(
    [df1[col].values for col in df1.columns],
    columns=df1.index,
    index=df1.columns,
)

Unnamed: 0,min,max,mean
Rank,1.0,782.0,391.5
Year,1939.0,2019.0,2006.620205


- 속성 : DF객체변수명.T => 원본 데이터 변경 안됨!

In [34]:
# Attribute form of the transpose (rows <-> columns)
df1.T

Unnamed: 0,min,max,mean
Rank,1.0,782.0,391.5
Year,1939.0,2019.0,2006.620205


- 메소드: DF객체변수명.transpose()

In [35]:
# Method form of the transpose (rows <-> columns)
df1.transpose()

Unnamed: 0,min,max,mean
Rank,1.0,782.0,391.5
Year,1939.0,2019.0,2006.620205


- (5) 데이터 정렬
    * 인덱스 기반 정렬 = 행인덱스 / 열인덱스

In [43]:
# Sort by row index => DF.sort_index()
movieDF.sort_index()

Unnamed: 0,Rank,Title,Studio,Gross,Year
0,1,Avengers: Endgame,Buena Vista,"$2,796.30",2019
1,2,Avatar,Fox,"$2,789.70",2009
2,3,Titanic,Paramount,"$2,187.50",1997
3,4,Star Wars: The Force Awakens,Buena Vista,"$2,068.20",2015
4,5,Avengers: Infinity War,Buena Vista,"$2,048.40",2018
...,...,...,...,...,...
777,778,Yogi Bear,Warner Brothers,$201.60,2010
778,779,Garfield: The Movie,Fox,$200.80,2004
779,780,Cats & Dogs,Warner Brothers,$200.70,2001
780,781,The Hunt for Red October,Paramount,$200.50,1990


In [44]:
# Sort by row index in descending order
movieDF.sort_index(ascending=False)

Unnamed: 0,Rank,Title,Studio,Gross,Year
781,782,Valkyrie,MGM,$200.30,2008
780,781,The Hunt for Red October,Paramount,$200.50,1990
779,780,Cats & Dogs,Warner Brothers,$200.70,2001
778,779,Garfield: The Movie,Fox,$200.80,2004
777,778,Yogi Bear,Warner Brothers,$201.60,2010
...,...,...,...,...,...
4,5,Avengers: Infinity War,Buena Vista,"$2,048.40",2018
3,4,Star Wars: The Force Awakens,Buena Vista,"$2,068.20",2015
2,3,Titanic,Paramount,"$2,187.50",1997
1,2,Avatar,Fox,"$2,789.70",2009


In [47]:
# Sort by column labels in descending order
movieDF.sort_index(axis=1, ascending=False)

Unnamed: 0,Year,Title,Studio,Rank,Gross
0,2019,Avengers: Endgame,Buena Vista,1,"$2,796.30"
1,2009,Avatar,Fox,2,"$2,789.70"
2,1997,Titanic,Paramount,3,"$2,187.50"
3,2015,Star Wars: The Force Awakens,Buena Vista,4,"$2,068.20"
4,2018,Avengers: Infinity War,Buena Vista,5,"$2,048.40"
...,...,...,...,...,...
777,2010,Yogi Bear,Warner Brothers,778,$201.60
778,2004,Garfield: The Movie,Fox,779,$200.80
779,2001,Cats & Dogs,Warner Brothers,780,$200.70
780,1990,The Hunt for Red October,Paramount,781,$200.50


In [48]:
# Sort by column labels in ascending order (the default direction)
movieDF.sort_index(axis='columns')

Unnamed: 0,Gross,Rank,Studio,Title,Year
0,"$2,796.30",1,Buena Vista,Avengers: Endgame,2019
1,"$2,789.70",2,Fox,Avatar,2009
2,"$2,187.50",3,Paramount,Titanic,1997
3,"$2,068.20",4,Buena Vista,Star Wars: The Force Awakens,2015
4,"$2,048.40",5,Buena Vista,Avengers: Infinity War,2018
...,...,...,...,...,...
777,$201.60,778,Warner Brothers,Yogi Bear,2010
778,$200.80,779,Fox,Garfield: The Movie,2004
779,$200.70,780,Warner Brothers,Cats & Dogs,2001
780,$200.50,781,Paramount,The Hunt for Red October,1990


- (5) 데이터 정렬
    * 데이터/값 기반 정렬

In [50]:
# Sort rows by the values of the Year column, ascending
movieDF.sort_values(by='Year')

Unnamed: 0,Rank,Title,Studio,Gross,Year
287,288,Gone with the Wind,MGM,$402.40,1939
539,540,Bambi,RKO,$267.40,1942
707,708,101 Dalmatians,Buena Vista,$215.90,1961
754,755,The Jungle Book,Buena Vista,$205.80,1967
603,604,The Godfather,Paramount,$245.10,1972
...,...,...,...,...,...
685,686,Men in Black International,Sony,$220.80,2019
457,458,John Wick: Chapter 3 - Parabellum,Lionsgate,$304.70,2019
262,263,Pokemon Detective Pikachu,Warner Brothers,$427.50,2019
602,603,Dark Phoenix,Fox,$245.10,2019


In [51]:
# Sort rows by Year first, then by Title among rows sharing the same Year
movieDF.sort_values(by=['Year', 'Title'])

Unnamed: 0,Rank,Title,Studio,Gross,Year
287,288,Gone with the Wind,MGM,$402.40,1939
539,540,Bambi,RKO,$267.40,1942
707,708,101 Dalmatians,Buena Vista,$215.90,1961
754,755,The Jungle Book,Buena Vista,$205.80,1967
603,604,The Godfather,Paramount,$245.10,1972
...,...,...,...,...,...
339,340,Shazam!,Warner Brothers,$364.10,2019
669,670,The Secret Life of Pets 2,Universal,$225.90,2019
113,114,The Wandering Earth,China Film Corporation,$699.80,2019
197,198,Toy Story 4,Buena Vista,$519.80,2019


## [실습] 다양한 정렬

In [57]:
import numpy as np
from math import nan               # nan (Not a Number) marks a missing / blank value

# Column-oriented sample data; col1 contains one missing value (nan)
data = dict(
    col1=['A', 'A', 'B', nan, 'D', 'C'],
    col2=[2, 1, 9, 8, 7, 4],
    col3=[0, 1, 9, 4, 2, 3],
    col4=['a', 'B', 'c', 'D', 'e', 'F'],
)


In [58]:
# Build a DataFrame from the dict and display it
dataDF = pd.DataFrame(data)
dataDF

Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
3,,8,4,D
4,D,7,2,e
5,C,4,3,F


- col1 컬럼을 기준으로 오름차순 정렬

In [59]:
# Position of NaN (blank / missing) values: last by default
dataDF.sort_values(by='col1')

Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
5,C,4,3,F
4,D,7,2,e
3,,8,4,D


In [60]:
# The position of NaN (blank / missing) values is set with the na_position parameter
dataDF.sort_values(by='col1', na_position='first')

Unnamed: 0,col1,col2,col3,col4
3,,8,4,D
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
5,C,4,3,F
4,D,7,2,e


In [61]:
# Sort ascending by col4; uppercase letters come before lowercase ones here
dataDF.sort_values(by='col4')

Unnamed: 0,col1,col2,col3,col4
1,A,1,1,B
3,,8,4,D
5,C,4,3,F
0,A,2,0,a
2,B,9,9,c
4,D,7,2,e


In [62]:
# Lowercase col4 through the key function so the sort ignores letter case
dataDF.sort_values(by='col4', key=lambda col:col.str.lower())

Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
3,,8,4,D
4,D,7,2,e
5,C,4,3,F
