## 2. DataFrame
2차원 데이터(Series들의 모음)

### Data 준비
사전(dict) 자료구조를 통해 생성

In [None]:
# Column-oriented sample data: each key is a column name and each list holds
# that column's values, one entry per row.
data = dict(
    name=['John', 'Jane', 'Doe'],
    age=[25, 30, 22],
    city=['New York', 'Los Angeles', 'Chicago'],
)
data['age']  # Look up the list stored under the 'age' key

[25, 30, 22]

### DataFrame 객체 생성

In [5]:
import pandas as pd
# Build a DataFrame from the dict: the keys become column names and the lists
# become the column values; rows get the default integer labels 0, 1, 2.
df = pd.DataFrame(data)
df  # Last expression in the cell, so the notebook renders the table

Unnamed: 0,name,age,city
0,John,25,New York
1,Jane,30,Los Angeles
2,Doe,22,Chicago


### 데이터 접근

In [None]:
# Select a single column by name; the result is a one-dimensional Series
df['name']

0    John
1    Jane
2     Doe
Name: name, dtype: object

In [None]:
# Select several columns by passing a list of names; the result is a DataFrame
df[['name', 'age']]

Unnamed: 0,name,age
0,John,25
1,Jane,30
2,Doe,22


### 데이터프레임 객체 생성(index 지정)

In [8]:
# Build the DataFrame with custom row labels instead of the default 0, 1, 2
df = pd.DataFrame(data, index=['a', 'b', 'c'])
df

Unnamed: 0,name,age,city
a,John,25,New York
b,Jane,30,Los Angeles
c,Doe,22,Chicago


### DataFrame 객체 생성(Columns 지정)
data 중에서 원하는 column만 선택하거나, 순서 변경 가능

In [None]:
# Keep only the listed columns; the resulting column order follows the order
# given in `columns` (here 'age' comes before 'name', and 'city' is left out).
df = pd.DataFrame(data, columns=['age', 'name'])
df

Unnamed: 0,age,name
0,25,John
1,30,Jane
2,22,Doe


## 3. Index
데이터에 접근할 수 있는 주소 값

In [12]:
# Rebuild the DataFrame with custom row labels
df = pd.DataFrame(data, index=['a', 'b', 'c'])
df.index  # The Index object holding the row labels

Index(['a', 'b', 'c'], dtype='object')

### Index 이름 설정

In [13]:
df.index.name = 'custom_index'  # Give the index a name
df

Unnamed: 0_level_0,name,age,city
custom_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,John,25,New York
b,Jane,30,Los Angeles
c,Doe,22,Chicago


### Index 초기화

In [None]:
# Reset to the default integer index; the previous index becomes a regular
# column. The result is only displayed here, not assigned back to df.
df.reset_index()

Unnamed: 0,custom_index,name,age,city
0,a,John,25,New York
1,b,Jane,30,Los Angeles
2,c,Doe,22,Chicago


In [18]:
# drop=True discards the old 'custom_index' index instead of keeping it as a
# column; inplace=True applies the change directly to df.
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,name,age,city
0,John,25,New York
1,Jane,30,Los Angeles
2,Doe,22,Chicago


### Index 설정
지정한 column으로 index 설정

In [19]:
# Use the 'name' column as the index; inplace=True applies the change directly to df
df.set_index('name', inplace=True)
df

Unnamed: 0_level_0,age,city
name,Unnamed: 1_level_1,Unnamed: 2_level_1
John,25,New York
Jane,30,Los Angeles
Doe,22,Chicago


### Index 정렬
index를 기준으로 오름차순, 내림차순 정렬

In [21]:
# Sort rows by index label in ascending order (result is displayed, not assigned)
df.sort_index()

Unnamed: 0_level_0,age,city
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Doe,22,Chicago
Jane,30,Los Angeles
John,25,New York


In [22]:
# Sort rows by index label in descending order (result is displayed, not assigned)
df.sort_index(ascending=False)

Unnamed: 0_level_0,age,city
name,Unnamed: 1_level_1,Unnamed: 2_level_1
John,25,New York
Jane,30,Los Angeles
Doe,22,Chicago


## 4. 파일 저장 및 열기
DataFrame 객체를 excel, csv, txt 등 형태의 파일로 저장 및 열기

In [23]:
# Larger sample data set (seven rows) used by the save/load examples below.
data = dict(
    name=['John', 'Jane', 'Doe', 'Alice', 'Bob', 'Charlie', 'Eve'],
    age=[25, 30, 22, 28, 35, 40, 29],
    city=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio'],
)
# Row labels 'a'..'g', with the index already named 'custom_index'.
df = pd.DataFrame(data, index=pd.Index(list('abcdefg'), name='custom_index'))
df

Unnamed: 0_level_0,name,age,city
custom_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,John,25,New York
b,Jane,30,Los Angeles
c,Doe,22,Chicago
d,Alice,28,Houston
e,Bob,35,Phoenix
f,Charlie,40,Philadelphia
g,Eve,29,San Antonio


### 저장하기
- CSV 파일로 저장
- TXT 파일로 저장
- Excel 파일로 저장

In [None]:
# CSV: comma-separated, encoded as 'utf-8-sig' (UTF-8 with a byte-order mark).
# index=False leaves the 'custom_index' index out of the file.
df.to_csv('data.csv', encoding='utf-8-sig', index=False)
# TXT: same writer with a tab separator; the index is written as the first column.
df.to_csv('data.txt', sep='\t', encoding='utf-8-sig')
# Excel: the index is written as the first column.
df.to_excel('data.xlsx')

### 열기
- CSV 파일 열기

In [29]:
# Read the CSV back; the file was saved without the index, so rows get the
# default integer labels.
df = pd.read_csv('data.csv')
df

Unnamed: 0,name,age,city
0,John,25,New York
1,Jane,30,Los Angeles
2,Doe,22,Chicago
3,Alice,28,Houston
4,Bob,35,Phoenix
5,Charlie,40,Philadelphia
6,Eve,29,San Antonio


In [30]:
# skiprows=1 skips the first line of the file, which is the header line, so
# the first data record (John) ends up being used as the column header.
df = pd.read_csv('data.csv', skiprows=1)
df

Unnamed: 0,John,25,New York
0,Jane,30,Los Angeles
1,Doe,22,Chicago
2,Alice,28,Houston
3,Bob,35,Phoenix
4,Charlie,40,Philadelphia
5,Eve,29,San Antonio


In [31]:
# A list skips specific file lines by 0-based position: line 0 is the header
# and line 2 is Jane's record, so John's record becomes the column header.
df = pd.read_csv('data.csv', skiprows=[0, 2])
df

Unnamed: 0,John,25,New York
0,Doe,22,Chicago
1,Alice,28,Houston
2,Bob,35,Phoenix
3,Charlie,40,Philadelphia
4,Eve,29,San Antonio


In [32]:
# nrows=3 reads only the first 3 data rows (the header line is not counted)
df = pd.read_csv('data.csv', nrows=3)
df

Unnamed: 0,name,age,city
0,John,25,New York
1,Jane,30,Los Angeles
2,Doe,22,Chicago


In [None]:
# Skip the first 2 lines of the file (the header and John's record); Jane's
# record is then used as the column header and the next 3 rows are read as data.
df = pd.read_csv('data.csv', skiprows=2, nrows=3)
df

Unnamed: 0,Jane,30,Los Angeles
0,Doe,22,Chicago
1,Alice,28,Houston
2,Bob,35,Phoenix


- TXT 파일 열기

In [34]:
# Read the tab-separated text file; the saved index comes back as an ordinary
# 'custom_index' column.
df = pd.read_csv('data.txt', sep='\t')
df

Unnamed: 0,custom_index,name,age,city
0,a,John,25,New York
1,b,Jane,30,Los Angeles
2,c,Doe,22,Chicago
3,d,Alice,28,Houston
4,e,Bob,35,Phoenix
5,f,Charlie,40,Philadelphia
6,g,Eve,29,San Antonio


In [35]:
# Read the file using the 'custom_index' column as the index
df = pd.read_csv('data.txt', sep='\t', index_col='custom_index')
df

Unnamed: 0_level_0,name,age,city
custom_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,John,25,New York
b,Jane,30,Los Angeles
c,Doe,22,Chicago
d,Alice,28,Houston
e,Bob,35,Phoenix
f,Charlie,40,Philadelphia
g,Eve,29,San Antonio


- Excel 파일 열기

In [36]:
# Read the Excel file; the saved index comes back as an ordinary
# 'custom_index' column.
df = pd.read_excel('data.xlsx')
df

Unnamed: 0,custom_index,name,age,city
0,a,John,25,New York
1,b,Jane,30,Los Angeles
2,c,Doe,22,Chicago
3,d,Alice,28,Houston
4,e,Bob,35,Phoenix
5,f,Charlie,40,Philadelphia
6,g,Eve,29,San Antonio


In [37]:
# Read the Excel file using the 'custom_index' column as the index
df = pd.read_excel('data.xlsx', index_col='custom_index')
df

Unnamed: 0_level_0,name,age,city
custom_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,John,25,New York
b,Jane,30,Los Angeles
c,Doe,22,Chicago
d,Alice,28,Houston
e,Bob,35,Phoenix
f,Charlie,40,Philadelphia
g,Eve,29,San Antonio
