In [None]:
!pip install pandas



### Series 데이터: 1차원 데이터
파이썬의 list와 유사함

In [None]:
import pandas as pd

data = ['apple', 'banana', 'computer', 'domino', 'elephant']
s = pd.Series(data)
print(s)

0       apple
1      banana
2    computer
3      domino
4    elephant
dtype: object


### DataFrame 데이터: 2차원 데이터
여러개의 Series 데이터를 묶은 형태

In [None]:
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]
df = pd.DataFrame(data)
print(df)

       0   1  2
0   John  21  M
1   Sara  30  F
2    Tom  25  M
3  Peter  34  M


DataFrame을 생성할 때 행과 열의 이름을 따로 지정하지 않으면, ‘행(=index)’과 ‘열(=column)’의 이름이 자동으로 0번부터 번호로 매겨집니다.


행, 열에 각각 이름을 부여할 수 있습니다.

In [None]:
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]
df = pd.DataFrame(data, columns=['name', 'age', 'gender'])
print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M


In [None]:
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]
df = pd.DataFrame(data)

# 컬럼은 추후에 만들어줘도 괜찮습니다
df.columns = ['name', 'age', 'gender']
print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M


In [None]:
# 인덱스 추가
df.index = ["1번", "2번", "3번", "4번"]
print(df)


     name  age gender
1번   John   21      M
2번   Sara   30      F
3번    Tom   25      M
4번  Peter   34      M


In [None]:
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]
df = pd.DataFrame(data, columns=['name', 'age', 'gender'], index = ["1번", "2번", "3번", "4번"])
print(df)

     name  age gender
1번   John   21      M
2번   Sara   30      F
3번    Tom   25      M
4번  Peter   34      M


index 이름과 column 이름을 지정할 때는, 이름의 개수가 반드시 df의 행(index) 개수 또는 열(column) 개수와 일치해야 합니다.

그렇지 않을시 에러가 발생합니다.

In [None]:
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]
df = pd.DataFrame(data, columns=['name', 'age', 'gender'], index = ["1번", "2번", "3번", "4번", "5번"])
print(df)

ValueError: Length of values (4) does not match length of index (5)

In [None]:
# index, column 개수를 알고싶다면 shape 속성을 사용하시면 됩니다.

df = pd.DataFrame(data)
# (행 개수, 열 개수)
print(df.shape)

(4, 3)


### DataFrame 열추가

In [None]:
# insert()
# insert(column_index, column_name, value)   # 3가지 파라미터 필요
# DataFrame에 컬럼을 한 개 추가합니다.

data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]

columns = ['name', 'age', 'gender']
df = pd.DataFrame(data, columns=columns)

print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M


In [None]:
# scores 컬럼 추가
scores = [9, 8, 10, 9]
df.insert(df.shape[1], 'score', scores)

print(df)

    name  age gender  score
0   John   21      M      9
1   Sara   30      F      8
2    Tom   25      M     10
3  Peter   34      M      9


In [None]:
# 모든 값이 동일하다면, 리스트로 전달하지 않아도 됩니다.
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]

columns = ['name', 'age', 'gender']
df = pd.DataFrame(data, columns=columns)

print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M


In [None]:
df.insert(df.shape[1], 'score', 0)
print(df)

    name  age gender  score
0   John   21      M      0
1   Sara   30      F      0
2    Tom   25      M      0
3  Peter   34      M      0


In [None]:
# 다른 추가 방법 1
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]

columns = ['name', 'age', 'gender']
df = pd.DataFrame(data, columns=columns)

print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M


In [None]:
# 리스트로 추가
scores = [9, 8, 10, 9]
df['score'] = scores
print(df)

    name  age gender  score
0   John   21      M      9
1   Sara   30      F      8
2    Tom   25      M     10
3  Peter   34      M      9


In [None]:
# 모두 동일한 값으로 초기화하여 추가
df['score'] = 0
print(df)

    name  age gender  score
0   John   21      M      0
1   Sara   30      F      0
2    Tom   25      M      0
3  Peter   34      M      0


### DataFrame 행 추가

In [None]:
# Adding a row (historically done with DataFrame.append()).
# Before looking at the correct way to add a row, observe the code below and its result.
# NOTE: DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the equivalent pd.concat() call is used here.

data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]

df = pd.DataFrame(data)
new_person = ['Max', 33, 'F']
print(df)
print()

# A flat list becomes a one-column DataFrame with three rows (the same conversion
# append() applied to a list), so it is stacked as three new rows under column 0
# instead of becoming a single new row.
new_df = pd.concat([df, pd.DataFrame(new_person)], ignore_index=True)
print(new_df)

       0   1  2
0   John  21  M
1   Sara  30  F
2    Tom  25  M
3  Peter  34  M

       0     1    2
0   John  21.0    M
1   Sara  30.0    F
2    Tom  25.0    M
3  Peter  34.0    M
4    Max   NaN  NaN
5     33   NaN  NaN
6      F   NaN  NaN


  new_df = df.append(new_person, ignore_index=True)


In [None]:
# 한 개의 행을 추가하길 바랐지만 기대한 모양과 완전히 다른 것을 확인할 수 있습니다.
    
# DataFrame = Series의 집합체
    
# 한 개의 column = 한 개의 Series / 즉, 행단위가 아닌 열단위로 Series가 모여있는 형태

case 1)
       0 |    1  |  2
0   John | 21.0  |  M
1   Sara | 30.0  |  F
2    Tom | 25.0  |  M
3  Peter | 34.0  |  M

df는 위와 같이 3개의 Series가 모여있는 형태입니다

case 2)
      0   1  2
0   John  21  M
----------------
1   Sara  30  F
----------------
2    Tom  25  M
----------------
3  Peter  34  M

이렇게 4개의 Series가 모여있는 것이 아닙니다


즉, list를 DataFrame에 추가하는 것은 Series를 하나 추가한다고 봐도 무방합니다.

결론적으로 case 2)와 같은 형태라면, 아래쪽에 데이터를 추가하는 함수인 append()를 사용했을 때 깔끔하게 한 줄이 추가되겠지만, 실제로는 case 1)과 같은 형태이므로 위 코드와 같이 괴상한 형태로 추가가 됩니다.

In [None]:
# To get the intended result (one new row), add the row as shown below.
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]

columns = ['name', 'age', 'gender']
df = pd.DataFrame(data, columns=columns)

# Describe the new row as a dictionary keyed by column name.
new_person = {'name': 'Max', 'age': 33, 'gender': 'F'}
# DataFrame.append() was removed in pandas 2.0; wrap the dict in a list to build
# a one-row DataFrame and stack it under df with pd.concat().
# ignore_index=True renumbers the resulting rows from 0.
df = pd.concat([df, pd.DataFrame([new_person])], ignore_index=True)
print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M
4    Max   33      F


  df = df.append(new_person, ignore_index=True)


### DataFrame간의 합

In [None]:
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]

columns = ['name', 'age', 'gender']
df1 = pd.DataFrame(data, columns=columns)

data = [[90, 88, 89],
        [70, 75, 90],
        [83, 89, 85],
        [72, 95, 90]]

columns = ['korean', 'english', 'math']
df2 = pd.DataFrame(data, columns=columns)

In [None]:
print(df1)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M


In [None]:
print(df2)

   korean  english  math
0      90       88    89
1      70       75    90
2      83       89    85
3      72       95    90


In [None]:
con_df = pd.concat([df1, df2], axis=1)
print(con_df)

    name  age gender  korean  english  math
0   John   21      M      90       88    89
1   Sara   30      F      70       75    90
2    Tom   25      M      83       89    85
3  Peter   34      M      72       95    90


concat( 합성할 DataFrame들 리스트, axis=0 or 1)

axis=0 : 상, 하 방향 합성 / column명이 같으면 한 column으로 합성

axis=1 : 좌, 우 방향 합성 / index명이 같으면 한 index로 합성

axis 파라미터는 pandas의 많은 함수들에서 등장합니다. axis=0 이라면 ‘행(index)’관련, axis=1 이라면 ‘열(columns)’관련 일을 처리한다고 생각하시면 됩니다.

In [None]:
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]

columns = ['name', 'age', 'gender']
df1 = pd.DataFrame(data, columns=columns)

data = [[90, 88, 89],
        [70, 75, 90],
        [83, 89, 85],
        [72, 95, 90]]

columns = ['korean', 'english', 'math']
# df2에 인덱스를 변경
df2 = pd.DataFrame(data, columns=columns, index = [2, 3, 4, 5])

In [None]:
print(df1)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M


In [None]:
print(df2)

   korean  english  math
2      90       88    89
3      70       75    90
4      83       89    85
5      72       95    90


In [None]:
con_df = pd.concat([df1, df2], axis=1)
print(con_df)

# index 2, 3은 합성되고 나머지는 독립적
# 빈 칸은 NaN(=None =Null)으로 채워짐

    name   age gender  korean  english  math
0   John  21.0      M     NaN      NaN   NaN
1   Sara  30.0      F     NaN      NaN   NaN
2    Tom  25.0      M    90.0     88.0  89.0
3  Peter  34.0      M    70.0     75.0  90.0
4    NaN   NaN    NaN    83.0     89.0  85.0
5    NaN   NaN    NaN    72.0     95.0  90.0


In [None]:
data = [['John', 21, 'M'],
        ['Sara', 30, 'F'],
        ['Tom', 25, 'M'],
        ['Peter', 34, 'M']]

columns = ['name', 'age', 'gender']
df1 = pd.DataFrame(data, columns=columns)

data = [['Max', 33, 'M'],
        ['Bob', 27, 'F']]

columns = ['name', 'age', 'gender']
df2 = pd.DataFrame(data, columns=columns)

In [None]:
print(df1)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M


In [None]:
print(df2)

  name  age gender
0  Max   33      M
1  Bob   27      F


In [None]:
# axis = 0
con_df = pd.concat([df1, df2], axis=0)
print(con_df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M
0    Max   33      M
1    Bob   27      F


합성된 모양은 원하는 모양과 일치하지만, index가 이상한 것을 확인할 수 있습니다.

ignore_index=True 옵션을 전달하면 합성하려는 모든 DataFrame들의 index를 무시하고 0부터 차례대로 index 번호를 부여합니다. (default 값은 False)

In [None]:
con_df = pd.concat([df1, df2], axis=0, ignore_index=True)
print(con_df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M
4    Max   33      M
5    Bob   27      F


### DataFrame 내용 삭제

df.drop()

행 삭제 df.drop( [삭제하고자 하는 행 이름들], axis=0 )

열 삭제 df.drop( [삭제하고자 하는 열 이름들], axis=1 )

In [None]:
# 원본 훼손 방지를 위해 깊은 복사 이용
df = con_df.copy()

In [None]:
print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M
4    Max   33      M
5    Bob   27      F


In [None]:
# index 라벨이 2, 3인 행을 삭제하는 예제 (위치가 아닌 라벨 기준으로 삭제됩니다)
df = df.drop([2, 3], axis=0)
print(df)

   name  age gender
0  John   21      M
1  Sara   30      F
4   Max   33      M
5   Bob   27      F


In [None]:
# age열 삭제
# 삭제하고자 하는 대상이 한줄이라면 리스트에 담지 않아도 됩니다.
df = df.drop('age', axis=1)
print(df)

   name gender
0  John      M
1  Sara      F
4   Max      M
5   Bob      F


만약 column명을 하나하나 명시하기 귀찮다면 다음과 같이 할 수 있습니다.

df.columns ⇒ 컬럼명을 반복가능한 객체에 담아 반환

df.columns[n:m+1] ⇒ list 슬라이싱과 동일하게 n번째 컬럼~m번째 컬럼만을 슬라이싱하여 반환 (참고: 여기서 말하는 n번째는 list인덱스와 동일하게 0번부터 시작하는 번호입니다.)

In [None]:
df = con_df.copy()
print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M
4    Max   33      M
5    Bob   27      F


In [None]:
df = df.drop(df.columns[1:3], axis=1)  # 1, 2번째 column 삭제
print(df)

    name
0   John
1   Sara
2    Tom
3  Peter
4    Max
5    Bob


In [None]:
df = con_df.copy()
print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M
4    Max   33      M
5    Bob   27      F


In [None]:
df = df.drop(df.index[2:5], axis=0)  # 2 ~ 4번째 index 삭제
print(df)

   name  age gender
0  John   21      M
1  Sara   30      F
5   Bob   27      F


### csv 혹은 excel 데이터 DataFrame으로 불러오기

In [None]:
file_path = r'./pd_metadata.csv'
df = pd.read_csv(file_path, encoding='cp949')
print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M
4    Max   33      M
5    Bob   27      F
6    Kim   30      F


In [None]:
# 첫 줄(메타데이터)은 column명으로 자동으로 들어갑니다. 그러면 만약 메타데이터가 없는 형태라면?
file_path = r'./pd_no_metadata.csv'
df = pd.read_csv(file_path, encoding='cp949')
print(df)

# metadata가 없는 형태의 csv파일을 이전 코드와 동일한 방식으로 불러오게 되면 첫 줄이 column명으로 들어가는 문제가 발생합니다.

    John  21  M
0   Sara  30  F
1    Tom  25  M
2  Peter  34  M
3    Max  33  M
4    Bob  27  F
5    Kim  30  F


In [None]:
file_path = r'./pd_no_metadata.csv'
# header=None 옵션을 주면 column명이 자동으로 0부터 들어가게 됩니다.
df = pd.read_csv(file_path, encoding='cp949', header=None)
print(df)

       0   1  2
0   John  21  M
1   Sara  30  F
2    Tom  25  M
3  Peter  34  M
4    Max  33  M
5    Bob  27  F
6    Kim  30  F


In [None]:
# 만약 column명을 따로 지정해주고 싶다면 names=[] 으로 지정해주시면 됩니다.
file_path = r'./pd_no_metadata.csv'
df = pd.read_csv(file_path, encoding='cp949', header=None, names=['name', 'age', 'gender'])
print(df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M
4    Max   33      M
5    Bob   27      F
6    Kim   30      F


엑셀파일을 읽어오는 방식도 csv파일과 함수만 다를 뿐 사용법이 거의 동일합니다.
    
read_csv( ) ⇒ read_excel( ) 로 바꾸고 header, names 옵션은 동일하게 사용하시면 됩니다. (단, read_excel( )에는 encoding 옵션이 없으므로 encoding='cp949'는 빼야 합니다.)

### DataFrame을 csv 혹은 excel파일로 저장하기

In [None]:
print(con_df)

    name  age gender
0   John   21      M
1   Sara   30      F
2    Tom   25      M
3  Peter   34      M
4    Max   33      M
5    Bob   27      F


In [None]:
save_path = r'./my_first_pandas.csv'
con_df.to_csv(save_path)

In [None]:
# 생성된 csv 파일을 보면 index 번호도 포함되어 있습니다. 
# 이는 index=False 옵션을 주어 해결할 수 있습니다.
save_path = r'./my_first_pandas2.csv'
con_df.to_csv(save_path, index=False)

In [None]:
# 엑셀 파일로 바꾸는 방법은 to_excel() 함수를 사용하면 됩니다.

save_path = r'./my_first_pandas.xlsx'
con_df.to_excel(save_path, index=False)

### iloc함수로 입맛에 맞게 DataFrame 재구성하기

df.iloc[ 행 슬라이싱, 열 슬라이싱 ] ⇒ ( ) 가 아닌 [ ] 를 사용합니다.

‘행 슬라이싱’ 혹은 ‘열 슬라이싱’ 부분에는 파이썬의 슬라이싱 기법을 사용하시면 됩니다.

슬라이싱할 수 있는 형태는 2가지가 있음. **[start:end] 형태**와 **[start: end: step] 형태**

[start:end] 형태

  [:5] ⇒ 처음부터 4번째까지 즉, 0 1 2 3 4 를 뜻함

  [4: -2] ⇒ 4번부터 뒤에서 3번째까지 (마지막 2개는 제외)

[start: end: step] 형태

  [1:7:2] ⇒ 1부터 6까지 2씩 건너뛰며 슬라이싱 즉, 1 3 5 를 뜻함

  [:10:3] ⇒ 처음부터 9까지 3씩 건너뛰며 슬라이싱 즉, 0 3 6 9 를 뜻함

In [None]:
file_path = r'./breast_cancer.csv'
df = pd.read_csv(file_path, encoding='cp949')
print(df)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

In [None]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
df.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


In [None]:
df.head(10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [None]:
df['mean radius'].head(10)

0    17.99
1    20.57
2    19.69
3    11.42
4    20.29
5    12.45
6    18.25
7    13.71
8    13.00
9    12.46
Name: mean radius, dtype: float64

In [None]:
# 시리즈 반환
type(df['mean radius'].head(10))

pandas.core.series.Series

In [None]:
# 데이터프레임
df[['mean radius', 'mean texture']].head(10)

Unnamed: 0,mean radius,mean texture
0,17.99,10.38
1,20.57,17.77
2,19.69,21.25
3,11.42,20.38
4,20.29,14.34
5,12.45,15.7
6,18.25,19.98
7,13.71,20.83
8,13.0,21.82
9,12.46,24.04


df['mean radius', ].head(10)

In [None]:
type(df[['mean radius', 'mean texture']].head(10))

pandas.core.frame.DataFrame

In [None]:
# 시리즈이므로 index 라벨 0으로 값을 바로 꺼낼 수 있습니다
df['mean radius'].head(10)[0]

17.99

In [None]:
# 데이터프레임에서 [0]은 이름이 0인 열(column)을 찾기 때문에 KeyError가 발생합니다. 다른 방법(iloc 등)을 써야 합니다
df[['mean radius', 'mean texture']].head(10)[0]

KeyError: 0

In [None]:
# df.iloc[행 인덱스, 열 인덱스]

In [None]:
# 0행 출력
df.iloc[0]

mean radius                  17.990000
mean texture                 10.380000
mean perimeter              122.800000
mean area                  1001.000000
mean smoothness               0.118400
mean compactness              0.277600
mean concavity                0.300100
mean concave points           0.147100
mean symmetry                 0.241900
mean fractal dimension        0.078710
radius error                  1.095000
texture error                 0.905300
perimeter error               8.589000
area error                  153.400000
smoothness error              0.006399
compactness error             0.049040
concavity error               0.053730
concave points error          0.015870
symmetry error                0.030030
fractal dimension error       0.006193
worst radius                 25.380000
worst texture                17.330000
worst perimeter             184.600000
worst area                 2019.000000
worst smoothness              0.162200
worst compactness        

In [None]:
# 컬럼들
df.iloc[:0]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension


In [None]:
df.iloc[0,1]

10.38

In [None]:
# 슬라이싱이 가능합니다

# 0~2행 출력
df.iloc[0:3]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [None]:
# 0~2행을 출력하되, 0~1컬럼까지만 출력
df.iloc[0:3, 0:2]

Unnamed: 0,mean radius,mean texture
0,17.99,10.38
1,20.57,17.77
2,19.69,21.25


In [None]:
# 0~4행을 출력하되, 0~4컬럼까지만 출력
df.iloc[:5, :5]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness
0,17.99,10.38,122.8,1001.0,0.1184
1,20.57,17.77,132.9,1326.0,0.08474
2,19.69,21.25,130.0,1203.0,0.1096
3,11.42,20.38,77.58,386.1,0.1425
4,20.29,14.34,135.1,1297.0,0.1003


In [None]:
# [start: end: step] 형태
df.iloc[::2, :]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
6,18.25,19.98,119.60,1040.0,0.09463,0.10900,0.11270,0.07400,0.1794,0.05742,...,22.880,27.66,153.20,1606.0,0.14420,0.25760,0.3784,0.1932,0.3063,0.08368
8,13.00,21.82,87.50,519.8,0.12730,0.19320,0.18590,0.09353,0.2350,0.07389,...,15.490,30.73,106.20,739.3,0.17030,0.54010,0.5390,0.2060,0.4378,0.10720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,14.05,27.15,91.38,600.4,0.09929,0.11260,0.04462,0.04304,0.1537,0.06171,...,15.300,33.17,100.20,706.7,0.12410,0.22640,0.1326,0.1048,0.2250,0.08321
562,15.22,30.62,103.40,716.9,0.10480,0.20870,0.25500,0.09429,0.2128,0.07152,...,17.520,42.79,128.70,915.0,0.14170,0.79170,1.1700,0.2356,0.4089,0.14090
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820


In [None]:
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075
