In [2]:
import pandas as pd

# Sample employee table, written one tuple per employee in column order.
column_names = ["Name", "Position", "Office", "Age", "Salary"]
employee_rows = [
    ("John Doe", "Software Engineer", "New York", 28, 95000),
    ("Jane Smith", "Data Scientist", "San Francisco", 34, 125000),
    ("Peter Jones", "Project Manager", "Berlin", 45, 75000),
    ("Linda Taylor", "HR Specialist", "London", 32, 85000),
    ("Emily Clark", "Data Scientist", "San Francisco", 30, 115000),
    ("David Smith", "Software Engineer", "New York", 38, 105000),
    ("Michael Brown", "HR Specialist", "London", 41, 90000),
    ("Sarah Johnson", "Project Manager", "Berlin", 29, 80000),
]

# Transpose the rows into the column-oriented mapping (column name -> list of
# values) that the DataFrame is built from.
data = {
    column: list(values)
    for column, values in zip(column_names, zip(*employee_rows))
}

df = pd.DataFrame(data)

# Preview both ends of the table (five rows each).
print(df.head(5))
print(df.tail(5))

           Name           Position         Office  Age  Salary
0      John Doe  Software Engineer       New York   28   95000
1    Jane Smith     Data Scientist  San Francisco   34  125000
2   Peter Jones    Project Manager         Berlin   45   75000
3  Linda Taylor      HR Specialist         London   32   85000
4   Emily Clark     Data Scientist  San Francisco   30  115000
            Name           Position         Office  Age  Salary
3   Linda Taylor      HR Specialist         London   32   85000
4    Emily Clark     Data Scientist  San Francisco   30  115000
5    David Smith  Software Engineer       New York   38  105000
6  Michael Brown      HR Specialist         London   41   90000
7  Sarah Johnson    Project Manager         Berlin   29   80000


In [3]:
# Select the Age column: a single string key returns a Series.
age_column = df["Age"]
print(age_column)

# Select the Name and Age columns: a list of keys returns a DataFrame.
name_and_age = ["Name", "Age"]
print(df[name_and_age])

0    28
1    34
2    45
3    32
4    30
5    38
6    41
7    29
Name: Age, dtype: int64
            Name  Age
0       John Doe   28
1     Jane Smith   34
2    Peter Jones   45
3   Linda Taylor   32
4    Emily Clark   30
5    David Smith   38
6  Michael Brown   41
7  Sarah Johnson   29


In [4]:
# Label-based selection with .loc

# Single row: the row whose index label is 4.
row_labelled_4 = df.loc[4]
print(row_labelled_4)

# Row range: the numbers in the brackets are index labels (row names), not
# positions, so this does not behave like ordinary slicing and the end
# label (4) is included.
rows_2_through_4 = df.loc[2:4]
print(rows_2_through_4)

# Several rows: pass a list of index labels.
picked_row_labels = [0, 2, 4]
print(df.loc[picked_row_labels])

# Single column: the Name column for every row.
name_column = df.loc[:, "Name"]
print(name_column)

# Several columns: the Name and Age columns for every row.
name_age_labels = ["Name", "Age"]
print(df.loc[:, name_age_labels])

# Rows labelled 1, 3 and 5, restricted to the Name and Salary columns.
odd_row_labels = [1, 3, 5]
name_salary_labels = ["Name", "Salary"]
print(df.loc[odd_row_labels, name_salary_labels])

# Column range (label slice) for every row: Name through Office.
name_to_office = df.loc[:, "Name":"Office"]
print(name_to_office)

Name           Emily Clark
Position    Data Scientist
Office       San Francisco
Age                     30
Salary              115000
Name: 4, dtype: object
           Name         Position         Office  Age  Salary
2   Peter Jones  Project Manager         Berlin   45   75000
3  Linda Taylor    HR Specialist         London   32   85000
4   Emily Clark   Data Scientist  San Francisco   30  115000
          Name           Position         Office  Age  Salary
0     John Doe  Software Engineer       New York   28   95000
2  Peter Jones    Project Manager         Berlin   45   75000
4  Emily Clark     Data Scientist  San Francisco   30  115000
0         John Doe
1       Jane Smith
2      Peter Jones
3     Linda Taylor
4      Emily Clark
5      David Smith
6    Michael Brown
7    Sarah Johnson
Name: Name, dtype: object
            Name  Age
0       John Doe   28
1     Jane Smith   34
2    Peter Jones   45
3   Linda Taylor   32
4    Emily Clark   30
5    David Smith   38
6  Michael Brown  

In [5]:
# Position-based indexing with .iloc (works the same way as list indexing)

# Single row: the row at position 0.
first_row = df.iloc[0]
print(first_row)

# Row range: unlike .loc, and like ordinary slicing, the end position (5)
# is not included.
first_five_rows = df.iloc[0:5]
print(first_five_rows)

# Several rows: pass a list of positions.
picked_positions = [0, 2]
print(df.iloc[picked_positions])

# The first column (position 0) for every row.
first_column = df.iloc[:, 0]
print(first_column)

# Column range by position for every row (columns 0, 1 and 2).
first_three_columns = df.iloc[:, 0:3]
print(first_three_columns)

Name                 John Doe
Position    Software Engineer
Office               New York
Age                        28
Salary                  95000
Name: 0, dtype: object
           Name           Position         Office  Age  Salary
0      John Doe  Software Engineer       New York   28   95000
1    Jane Smith     Data Scientist  San Francisco   34  125000
2   Peter Jones    Project Manager         Berlin   45   75000
3  Linda Taylor      HR Specialist         London   32   85000
4   Emily Clark     Data Scientist  San Francisco   30  115000
          Name           Position    Office  Age  Salary
0     John Doe  Software Engineer  New York   28   95000
2  Peter Jones    Project Manager    Berlin   45   75000
0         John Doe
1       Jane Smith
2      Peter Jones
3     Linda Taylor
4      Emily Clark
5      David Smith
6    Michael Brown
7    Sarah Johnson
Name: Name, dtype: object
            Name           Position         Office
0       John Doe  Software Engineer       New Yor

In [6]:
# Filtering

# Extract only the rows whose Position is Data Scientist.

# Comparing the Position column with a value evaluates every row to
# True/False; in pandas this plays the role of a condition.
is_data_scientist = df["Position"] == "Data Scientist"
print(is_data_scientist)

# Indexing with the condition keeps only the rows that are True.
print(df[is_data_scientist])

# Keep only the rows whose Age is greater than 30.
older_than_30 = df["Age"] > 30
print(df[older_than_30])

# Multiple conditions
# Python's logical operators `and` / `or` are not used here.
# pandas combines conditions with & (ampersand) and | (pipe).

# Position is Software Engineer AND Age is greater than 30 (two conditions).
is_software_engineer = df["Position"] == "Software Engineer"
engineer_over_30 = is_software_engineer & older_than_30
print(engineer_over_30)

print(df[engineer_over_30])

# Office is New York OR Age is greater than 40.
in_new_york = df["Office"] == "New York"
older_than_40 = df["Age"] > 40
print(df[in_new_york | older_than_40])

0    False
1     True
2    False
3    False
4     True
5    False
6    False
7    False
Name: Position, dtype: bool
          Name        Position         Office  Age  Salary
1   Jane Smith  Data Scientist  San Francisco   34  125000
4  Emily Clark  Data Scientist  San Francisco   30  115000
            Name           Position         Office  Age  Salary
1     Jane Smith     Data Scientist  San Francisco   34  125000
2    Peter Jones    Project Manager         Berlin   45   75000
3   Linda Taylor      HR Specialist         London   32   85000
5    David Smith  Software Engineer       New York   38  105000
6  Michael Brown      HR Specialist         London   41   90000
0    False
1    False
2    False
3    False
4    False
5     True
6    False
7    False
dtype: bool
          Name           Position    Office  Age  Salary
5  David Smith  Software Engineer  New York   38  105000
            Name           Position    Office  Age  Salary
0       John Doe  Software Engineer  New York   28

In [7]:
# Filtering with isin
# Keep rows whose Position is Software Engineer, Data Scientist, or
# HR Specialist.
wanted_positions = ["Software Engineer", "Data Scientist", "HR Specialist"]
filtering = df["Position"].isin(wanted_positions)

print(df[filtering])

            Name           Position         Office  Age  Salary
0       John Doe  Software Engineer       New York   28   95000
1     Jane Smith     Data Scientist  San Francisco   34  125000
3   Linda Taylor      HR Specialist         London   32   85000
4    Emily Clark     Data Scientist  San Francisco   30  115000
5    David Smith  Software Engineer       New York   38  105000
6  Michael Brown      HR Specialist         London   41   90000
