# Pandas는 무엇인가요?
## 데이터 분석 및 가공에 사용되는 파이썬 라이브러리

In [3]:
import pandas as pd

pd.__version__

'1.5.3'

In [3]:
data_frame = pd.read_csv('data/friend_list.csv') #데이터프레임 형식으로 가져옴
data_frame

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


## 데이터프레임은 무엇인가요?
- 가로축과 세로축이 있는 엑셀과 유사한 데이터 구조
- 가로축은 로우(행), 세로축은 컬럼(열)

In [4]:
# 데이터프레임이 가지고 있는 함수의 예제
data_frame.head()

#default로 5개의 데이터를 보여줌

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


# 시리즈(series)는 무엇인가요?
- 데이터프레임의 컬럼(열)은 모두 시리즈임
- 위의 예제는 3개의 시리즈로 구성된 데이터프레임

In [6]:
print(type(data_frame))
print(type(data_frame.job))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [12]:
# 시리즈의 제공 메소드들

data_frame.job = data_frame.job.str.upper()
data_frame.head()

Unnamed: 0,name,age,job
0,John,20,STUDENT
1,Jenny,30,DEVELOPER
2,Nate,30,TEACHER
3,Julia,40,DENTIST
4,Brian,45,MANAGER


- 시리즈는 단순히 파이썬 리스트를 간직한 오브젝트
- 리스트를 파라미터로 주면 바로 시리즈가 생성
- 시리즈는 데이터 가공 및 분석이 파이썬 리스트보다 훨씬 쉽다

In [15]:
s1 = pd.core.series.Series(['one','two','three'])
s2 = pd.core.series.Series([1,2,3])

pd.DataFrame(data=dict(word=s1, num=s2))

Unnamed: 0,word,num
0,one,1
1,two,2
2,three,3


In [17]:
#구분자에 의해 컬럼이 구분되어 있는 데이터는 모두 지원

df=pd.read_csv('data/friend_list.txt')
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [2]:
df= pd.read_csv('data/friend_list_tab.txt', delimiter='\t')
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [4]:
df= pd.read_csv('data/friend_list_no_head.csv', header=None)
df.head()

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [6]:
df.columns = ['name','age','job']
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [7]:
df= pd.read_csv('data/friend_list_no_head.csv', header=None, names=['name','age','job'])
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


## 데이터프레임을 파이썬 코드로 생성하기
#### 딕셔너리로 데이터프레임 생성하기

In [8]:
friend_dict_list = [{'name' : 'John', 'age':'20', 'job':'student'},
                       {'name' : 'Jenny', 'age':'30', 'job':'developer'},
                       {'name' : 'Nate', 'age':'30', 'job':'teacher'}]

df = pd.DataFrame(friend_dict_list)
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [9]:
df[['name', 'job', 'age']]
df.head()

#저장의 순서를 유지하려고 함

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


### OrderedDict로 데이터프레임 생성하기
* 파이썬 3.6 이전 버전까지는 딕셔너리 데이터 저장 순서를 보장하지 않음
* OrderedDict 자료구조로 데이터프레임을 생성하면, 컬럼의 순서가 뒤바뀌지 않음
* 데이터의 저장 순서대로 읽어오도록 보장

In [10]:
from collections import OrderedDict

In [11]:
#튜플의 형태로
friend_ordered_dict = OrderedDict(
                                     [('name',['John','Jenny','Nate']),
                                      ('age',[20,30,30]),
                                      ('job',['student','developer','teacher']) ])

df = pd.DataFrame.from_dict(friend_ordered_dict) #from_dict()는 dict객체로부터 DataFrame 객체로 변환하는 메소드


In [12]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


#### list로 딕셔너리로 데이터프레임 생성하기

In [13]:
friend_list = [ ['John', 20, 'student'],
                  ['Jenny', 30, 'developer'],
                  ['Nate', 20, 'teacher'] ]

column_name = ['name', 'age', 'job']

df = pd.DataFrame.from_records(friend_list, columns=column_name)

In [14]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,20,teacher


In [16]:
#딕셔너리 구조, 키-밸류
friend_list = {
                'name' : ['John','Jenny','Nate'],
                'age' : [20,30,30],
                'job' : ['student','developer','teacher']
                }

df = pd.DataFrame.from_dict(friend_list)

In [17]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


### 파일로 데이터프레임을 저장하기

In [23]:
# 파일로 저장
df.to_csv('friend_list_from_df.csv')

In [21]:
# 헤더가 없는 데이터프레임
friend_list = [ ['John', 20, 'student'],
                  ['Jenny', 30, 'developer'],
                  ['Nate', 20, 'teacher'] ]

df = pd.DataFrame.from_records(friend_list)
df.head()

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,20,teacher


In [24]:
df.to_csv('friend_list_from_df1.csv')

In [25]:
# 사실 파일의 확장자명은 원하는대로 지정해도 무관
df.to_csv('friend_list_from_df.txt')

In [26]:
# 파라미터의 default값
df.to_csv('friend_list_from_df.txt', header=True, index=True)

In [27]:
df.to_csv('friend_list_from_df_no.txt', header=False, index=False)

In [28]:
df.to_csv('friend_list_from_df_11.txt', header=['name','age','job'])

In [31]:
# None 값이 있는 데이터프레임

friend_list = {
                'name' : ['John','Jenny','Nate'],
                'age' : [20,None,30],
                'job' : ['student','developer','teacher']
                }

df = pd.DataFrame.from_dict(friend_list)

In [32]:
df.head()

Unnamed: 0,name,age,job
0,John,20.0,student
1,Jenny,,developer
2,Nate,30.0,teacher


In [33]:
df.to_csv('friend_list_from_df_none.csv')

In [34]:
#na_rep을 사용하게 되면 None을 원하는 값으로 쉽게 변경 가능
df.to_csv('friend_list_from_df.csv', na_rep='-')

### 데이터프레임 접근 방법
*  row(행) 선택하기

In [35]:
friend_list = {
                'name' : ['John','Jenny','Nate'],
                'age' : [20,30,30],
                'job' : ['student','developer','teacher']
                }
print(type(friend_list))
df = pd.DataFrame.from_dict(friend_list)

<class 'dict'>


In [36]:
# row의 index를 사용한 접근
df[1:3] 

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [37]:
# 순차적이지 않은 행을 선택

#list 형으로 접근해야 함
df.loc[[0,2]]

#df.loc[0,2] --error

Unnamed: 0,name,age,job
0,John,20,student
2,Nate,30,teacher


In [38]:
# 주의
df.loc[[0:2]]

#invalid syntax
#loc 는 슬라이싱을 지원하지 않음

SyntaxError: invalid syntax (352345048.py, line 2)

## 컬럼값에 따른 로우 선택하기

In [39]:
# 특정 컬럼값을 충족하는 행 선택
df.head() 

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [40]:
df_filtered = df[df.age > 25]
df_filtered

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [41]:
df_filtered = df.query('age>25')
df_filtered

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [43]:
df_filtered = df[(df.age >25) & (df.name=='Nate')]
df_filtered

Unnamed: 0,name,age,job
2,Nate,30,teacher


### 컬럼 필터하기
* 인덱스로 필터하기

In [44]:
friend_list = [ ['John', 20, 'student'],
                  ['Jenny', 30, 'developer'],
                  ['Nate', 20, 'teacher'] ]

df = pd.DataFrame.from_records(friend_list)
df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,20,teacher


In [50]:
# 모든 행(row)을 보여주되, 컬럼은 0부터 1까지만 출력
df_filtered = df.iloc[:,0:2]
df_filtered

# iloc는 행렬접근

# 1차원(벡터) - 시리즈, 2차원-데이터프레임
# loc와 구분지어야 함. loc는 행의 접근

Unnamed: 0,0,1
0,John,20
1,Jenny,30
2,Nate,20


In [53]:
# 모든 행(row)을 보여주되, 컬럼은 0과 2만 출력
df_filtered = df.iloc[:,[0,2]]   #하나의 데이터 단위인 list로 전달해줘야 함
df_filtered

Unnamed: 0,0,2
0,John,student
1,Jenny,developer
2,Nate,teacher


* 컬럼 이름으로 필터하기

In [56]:
df = pd.read_csv('data/friend_list_no_head.csv', header=None, names=['name','age','job'])
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [57]:
df_filtered = df[['name','age']]
df_filtered

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [58]:
df_filtered = df.filter(items=['age','job'])
df_filtered

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher
3,40,dentist
4,45,manager
5,25,intern


In [59]:
# 원하는 글자를 가진 행(row)을 선택
df.filter(like='a', axis=1) #열 중심으로 a가 있는 항목

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


## row 드롭하기
* row index로 행을 드롭할 수 있음

In [60]:
friend_dict_list = [{'age':20, 'job':'student'},
                       {'age':30, 'job':'developer'},
                       {'age':30, 'job':'teacher'} ]

df = pd.DataFrame(friend_dict_list, index=['John','Jenny','Nate'])

In [61]:
df

Unnamed: 0,age,job
John,20,student
Jenny,30,developer
Nate,30,teacher


*drop된 결과는 데이터프레임에 저장되지 않는다. 저장하고 싶으면, 결과를 데이터프레임에 따로 저장해야 됨

In [62]:
df.drop(['John','Nate'])

Unnamed: 0,age,job
Jenny,30,developer


In [63]:
df

Unnamed: 0,age,job
John,20,student
Jenny,30,developer
Nate,30,teacher


In [64]:
# 드롭된 결과를 데이터프레임에 저장
df = df.drop(['John','Nate'])  #내 자신의 변수에 저장

In [65]:
df

Unnamed: 0,age,job
Jenny,30,developer


In [66]:
friend_dict_list = [{'age':20, 'job':'student'},
                       {'age':30, 'job':'developer'},
                       {'age':30, 'job':'teacher'} ]

df = pd.DataFrame(friend_dict_list, index=['John','Jenny','Nate'])

In [67]:
# drop된 결과를 데이터프레임에 저장
df.drop(['John','Nate'], inplace=True)
df

Unnamed: 0,age,job
Jenny,30,developer


### row index로 drop하기

In [68]:
friend_dict_list = [{'name' : 'John', 'age':'20', 'job':'student'},
                       {'name' : 'Jenny', 'age':'30', 'job':'developer'},
                       {'name' : 'Nate', 'age':'30', 'job':'teacher'}]

df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [72]:
df = df.drop(df.index[[0,2]])
df

Unnamed: 0,name,age,job
1,Jenny,30,developer


### 컬럼값으로 row drop하기

In [73]:
friend_dict_list = [{'name' : 'John', 'age':'20', 'job':'student'},
                       {'name' : 'Jenny', 'age':'30', 'job':'developer'},
                       {'name' : 'Nate', 'age':'30', 'job':'teacher'}]

df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [76]:
df[df.age != '30']

Unnamed: 0,name,age,job
0,John,20,student


### 컬럼 drop하기

In [77]:
df.drop('age', axis=1)

Unnamed: 0,name,job
0,John,student
1,Jenny,developer
2,Nate,teacher


### 컬럼 추가 또는 변경하기

In [78]:
#컬럼 추가 또는 변경하기

df['salary'] =0
df

Unnamed: 0,name,age,job,salary
0,John,20,student,0
1,Jenny,30,developer,0
2,Nate,30,teacher,0


* 넘파이를 사용해, 한줄에 새로운 컬럼값을 생성할 수도 있음

In [80]:
import numpy as np

df['salary'] = np.where(df['job'] != 'student', 'yes', 'no')
df

Unnamed: 0,name,age,job,salary
0,John,20,student,no
1,Jenny,30,developer,yes
2,Nate,30,teacher,yes


In [83]:
friend_dict_list = [{'name':'John', 'midterm':95, 'final':85},
                       {'name':'Jenny', 'midterm':85, 'final':80},
                       {'name':'Nate', 'midterm':10, 'final':30}]
df = pd.DataFrame(friend_dict_list, columns=['name','midterm','final'])
df

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,10,30


In [84]:
df['total'] = df['midterm'] + df['final']

In [85]:
df

Unnamed: 0,name,midterm,final,total
0,John,95,85,180
1,Jenny,85,80,165
2,Nate,10,30,40


In [86]:
df['average'] = df['total'] /2
df

Unnamed: 0,name,midterm,final,total,average
0,John,95,85,180,90.0
1,Jenny,85,80,165,82.5
2,Nate,10,30,40,20.0


In [87]:
# 리스트에 조건별 값을 담아서, 새로운 컬럼으로 추가
grades = []

for row in df['average']:
    if row>=90:
        grades.append('A')
    elif row >=80:
        grades.append('B')
    elif row >=70:
        grades.append('C')
    elif row >=60:
        grades.append('D')
    else :
        grades.append('F')
        
df['grade'] = grades
df

Unnamed: 0,name,midterm,final,total,average,grade
0,John,95,85,180,90.0,A
1,Jenny,85,80,165,82.5,B
2,Nate,10,30,40,20.0,F


### apply() 함수 사용 예제
* apply()를 사용하면, 깔끔하게 컬럼의 값을 변경하는 코드를 구현할 수 있음

In [88]:
def pass_or_fail(row):
    if row !='F' : 
        return 'Pass'
    else:
        return 'Fail'

In [91]:
df.grade = df.grade.apply(pass_or_fail) #함수만 전달
df

Unnamed: 0,name,midterm,final,total,average,grade
0,John,95,85,180,90.0,Pass
1,Jenny,85,80,165,82.5,Pass
2,Nate,10,30,40,20.0,Fail


In [92]:
# apply() 사용해서 연월일의 정보에서 연도만 추출

data_list = [{'yyyy-mm-dd':'2000-06-27'},
               {'yyyy-mm-dd':'2002-09-24'},
               {'yyyy-mm-dd':'2005-12-20'} ]

df = pd.DataFrame(data_list, columns=['yyyy-mm-dd'])
df

Unnamed: 0,yyyy-mm-dd
0,2000-06-27
1,2002-09-24
2,2005-12-20


In [93]:
def extract_year(row) :
    return row.split('-')[0]

In [94]:
df['year'] = df['yyyy-mm-dd'].apply(extract_year)
df

Unnamed: 0,yyyy-mm-dd,year
0,2000-06-27,2000
1,2002-09-24,2002
2,2005-12-20,2005


### apply() 메소드에 파라미터 전달하기
* 키워드 파라미터를 사용하면, apply()가 적용된 함수에 파라미터를 전달할 수 있음

In [96]:
def extract_year(year, current_year):
    return current_year - int(year)

In [97]:
df['age'] = df['year'].apply(extract_year, current_year=2023)
df

Unnamed: 0,yyyy-mm-dd,year,age
0,2000-06-27,2000,23
1,2002-09-24,2002,21
2,2005-12-20,2005,18


In [98]:
def get_introduce(age,prefix,suffix):
    return prefix + str(age) + suffix

In [99]:
df['introduce'] = df['age'].apply(get_introduce, prefix="I am", suffix=" years old")
df

Unnamed: 0,yyyy-mm-dd,year,age,introduce
0,2000-06-27,2000,23,I am23 years old
1,2002-09-24,2002,21,I am21 years old
2,2005-12-20,2005,18,I am18 years old


In [102]:
# 여러 개의 컬럼을 동시에 전달
def get_introduce2(row) : 
    return "I was born in " + str(row.year) + " my age is " + str(row.age)   #파이썬은 들여쓰기 자체가 문법이기 때문에

    # return "I was born in " + str(row.year) + " my age is" \
    #          + str(row.age)

In [103]:
df.introduce = df.apply(get_introduce2, axis=1) #모든 컬럼 항목을 전달
df

Unnamed: 0,yyyy-mm-dd,year,age,introduce
0,2000-06-27,2000,23,I was born in 2000 my age is 23
1,2002-09-24,2002,21,I was born in 2002 my age is 21
2,2005-12-20,2005,18,I was born in 2005 my age is 18


### map() 함수로 컬럼 추가 및 변경하기
* 파라미터로 함수를 전달하면 apply 함수와 동일하게 컬럼값을 추가 및 변경할 수 있음

In [104]:
def extract_year(row) : 
    return row.split('-')[0]

data_list = [{'yyyy-mm-dd':'2000-06-27'},
               {'yyyy-mm-dd':'2002-09-24'},
               {'yyyy-mm-dd':'2005-12-20'} ]

df = pd.DataFrame(data_list, columns=['yyyy-mm-dd'])
df

Unnamed: 0,yyyy-mm-dd
0,2000-06-27
1,2002-09-24
2,2005-12-20


In [105]:
df['year'] = df['yyyy-mm-dd'].map(extract_year)
df

Unnamed: 0,yyyy-mm-dd,year
0,2000-06-27,2000
1,2002-09-24,2002
2,2005-12-20,2005


* 파라미터로 딕셔너리를 전달하면 컬럼값을 쉽게 원하는 값으로 변경 가능함
* 기존의 컬럼값은 딕셔너리의 key로 사용되고, 해당되는 value의 값으로 컬럼값이 변경되게 됨

In [107]:
job_list = [{'age':20, 'job':'student'},
              {'age':30, 'job':'developer'},
              {'age':40, 'job':'teacher'}]

df = pd.DataFrame(job_list)
df

Unnamed: 0,age,job
0,20,student
1,30,developer
2,40,teacher


In [108]:
df.job = df.job.map({"student" : 1, "developer":2, "teacher":3})
df

Unnamed: 0,age,job
0,20,1
1,30,2
2,40,3


### applymap() 함수
* 데이터프레임 전체의 각각의 값을 한번에 변경시킬 때 사용하면 좋다

In [109]:
x_y = [ {'x' : 5.5, 'y':-5.6},
          {'x' : -5.2, 'y':5.5},
          {'x' : -1.6, 'y':-4.5}]

df = pd.DataFrame(x_y)
df

Unnamed: 0,x,y
0,5.5,-5.6
1,-5.2,5.5
2,-1.6,-4.5


In [111]:
df = df.applymap(np.around)
df

Unnamed: 0,x,y
0,6.0,-6.0
1,-5.0,6.0
2,-2.0,-4.0


### 데이터프레임에 row(행)  추가하기

In [119]:
friend_dict_list = [{'name':'John', 'midterm':95, 'final':85},
                       {'name':'Jenny', 'midterm':85, 'final':80},
                       {'name':'Nate', 'midterm':10, 'final':30}]
df = pd.DataFrame(friend_dict_list, columns=['name','midterm','final'])
df

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,10,30


In [120]:
df2 = pd.DataFrame([['Ben',50,50]], columns=['name','midterm','final']) #2차원으로 넣어야 하는 것을 1차원의 것을 넣으려고 함. 

#row 데이터 삽입시 시리즈 맞지 않음

df2

Unnamed: 0,name,midterm,final
0,Ben,50,50


In [121]:
df.append(df2, ignore_index=True)

#ignore_index=False : 기존의 인덱스에 추가되는 것이 아닌 index값 무시하고 삽입
#ignore_index=True : 기존의 인덱스 뒤에 삽입

  df.append(df2, ignore_index=True)


Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,10,30
3,Ben,50,50


### groupby() 함수
* 데이터에서 정보를 취하기위해서 그룹별로 묶는 방법에 대해 알아보자

In [4]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': "female"}
         ]

df = pd.DataFrame(student_list, columns=['name','major','sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [123]:
groupby_major = df.groupby('major')
groupby_major

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001AD8FC29970>

In [124]:
groupby_major.groups

{'Computer Science': [0, 1, 6, 7], 'Economics': [4, 5, 9], 'Physics': [2], 'Psychology': [3, 8, 10]}

In [125]:
for name, group in groupby_major : 
    print(name + " : " +str(len(group)))
    print(group)
    print()

Computer Science : 4
       name             major     sex
0      John  Computer Science    male
1      Nate  Computer Science    male
6  Jeniffer  Computer Science  female
7    Edward  Computer Science    male

Economics : 3
    name      major     sex
4  Janny  Economics  female
5   Yuna  Economics  female
9  Wendy  Economics  female

Physics : 1
      name    major   sex
2  Abraham  Physics  male

Psychology : 3
     name       major     sex
3   Brian  Psychology    male
8    Zara  Psychology  female
10   Sera  Psychology  female



In [126]:
# 그룹 객체를 다시 데이터프레임으로 생성하는 예제
df_major_cnt = pd.DataFrame({'count' : groupby_major.size()}).reset_index()
df_major_cnt

Unnamed: 0,major,count
0,Computer Science,4
1,Economics,3
2,Physics,1
3,Psychology,3


In [5]:
groupby_sex = df.groupby('sex')
groupby_sex

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017948E96FD0>

In [6]:
groupby_sex.groups

{'female': [4, 5, 6, 8, 9, 10], 'male': [0, 1, 2, 3, 7]}

In [8]:
for name, sex in groupby_sex :
    print(name+ ': '+str(len(sex)))
    print(sex)
    print()

female: 6
        name             major     sex
4      Janny         Economics  female
5       Yuna         Economics  female
6   Jeniffer  Computer Science  female
8       Zara        Psychology  female
9      Wendy         Economics  female
10      Sera        Psychology  female

male: 5
      name             major   sex
0     John  Computer Science  male
1     Nate  Computer Science  male
2  Abraham           Physics  male
3    Brian        Psychology  male
7   Edward  Computer Science  male



In [12]:
df_sex_cnt = pd.DataFrame({'count' : groupby_sex.size()}).reset_index()
df_sex_cnt

Unnamed: 0,sex,count
0,female,6
1,male,5


### 중복 데이터 drop 하기

In [13]:
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [14]:
# 중복된 데이터 확인하기

df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
dtype: bool

In [15]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': "female"},
                {'name': 'John', 'major': "Computer Science", 'sex': "male"}
         ]

df = pd.DataFrame(student_list, columns=['name','major','sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [16]:
df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11     True
dtype: bool

In [19]:
# 중복 데이터를 삭제
df = df.drop_duplicates()

In [20]:
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [25]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': None, 'sex': "female"},
                {'name': 'John', 'major': "Computer Science", 'sex': None}
         ]

df = pd.DataFrame(student_list, columns=['name','major','sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [23]:
df.duplicated(['name'])  #이름만 인덱스로 뽑아서 검색 가능

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11     True
dtype: bool

In [30]:
# keep값을 'first' 또는 'last'라고 값을 줘서 중복된 값 중, 어느 값을 유지할지 결정
# defualt 값은 first

df.drop_duplicates(['name'], keep='first') #이름만 필터링해서 삭제도 가능

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [29]:
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [31]:
df.drop_duplicates(['name'], keep='last')

Unnamed: 0,name,major,sex
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female
10,Sera,,female


In [36]:
# None 처리하기

school_id_list =  [{'name' : 'John', 'age':20, 'job':'student'},
                       {'name' : 'Jenny', 'age':30, 'job':'developer'},
                       {'name' : 'Nate', 'age':None, 'job':'teacher'}]

df = pd.DataFrame(school_id_list)
df

Unnamed: 0,name,age,job
0,John,20.0,student
1,Jenny,30.0,developer
2,Nate,,teacher


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
 1   age     3 non-null      object
 2   job     3 non-null      object
dtypes: object(3)
memory usage: 200.0+ bytes


In [37]:
df.isna()

Unnamed: 0,name,age,job
0,False,False,False
1,False,False,False
2,False,True,False


In [38]:
# 값을 무조건 삭제하기 보다는 0으로 채워주거나 평균값으로 채워서 조금 더 정확한 예측모델을 만들어주는 것이 좋음

### Null 또는 Nan 값 변경하기

In [39]:
# Null을 0으로 대체

tmp = df
tmp['age'] = tmp['age'].fillna(0)
tmp

Unnamed: 0,name,age,job
0,John,20.0,student
1,Jenny,30.0,developer
2,Nate,0.0,teacher


In [43]:
# 0으로 설정하기 보다는 선생님의 중간 나이, 학생의 중간 나이로 각각의 직업군에 맞게 Null값을 변경

school_id_list = [{'name' : 'John', 'job':'teacher', 'age':40}, 
                     {'name' : 'Nate', 'job':'teacher', 'age':35},
                     {'name' : 'Yuna', 'job':'teacher', 'age':37},
                     {'name' : 'Abraham', 'job':'student', 'age':10}, 
                     {'name' : 'Brian', 'job':'student', 'age':12},
                     {'name' : 'Jenny', 'job':'student', 'age':11},
                     {'name' : 'Nate', 'job':'teacher', 'age':None},
                     {'name' : 'John', 'job':'student', 'age':None}
                 ]

df = pd.DataFrame(school_id_list, columns=['name','job','age'])
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Yuna,teacher,37.0
3,Abraham,student,10.0
4,Brian,student,12.0
5,Jenny,student,11.0
6,Nate,teacher,
7,John,student,


In [49]:
df['age'].fillna(df.groupby('job')['age'].transform('median'), inplace=True)
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Yuna,teacher,37.0
3,Abraham,student,10.0
4,Brian,student,12.0
5,Jenny,student,11.0
6,Nate,teacher,37.0
7,John,student,11.0


### Unique
* 컬럼에 여러 값이 있을 때, 중복없이 어떤 값들이 있는지 확인하는 방법

In [50]:
job_list = [{'name': 'John', 'job': "teacher"},
                {'name': 'Nate', 'job': "teacher"},
                {'name': 'Fred', 'job': "teacher"},
                {'name': 'Abraham', 'job': "student"},
                {'name': 'Brian', 'job': "student"},
                {'name': 'Janny', 'job': "developer"},
                {'name': 'Nate', 'job': "teacher"},
                {'name': 'Obrian', 'job': "dentist"},
                {'name': 'Yuna', 'job': "teacher"},
                {'name': 'Rob', 'job': "lawyer"},
                {'name': 'Brian', 'job': "student"},
                {'name': 'Matt', 'job': "student"},
                {'name': 'Wendy', 'job': "banker"},
                {'name': 'Edward', 'job': "teacher"},
                {'name': 'Ian', 'job': "teacher"},
                {'name': 'Chris', 'job': "banker"},
                {'name': 'Philip', 'job': "lawyer"},
                {'name': 'Janny', 'job': "basketball player"},
                {'name': 'Gwen', 'job': "teacher"},
                {'name': 'Jessy', 'job': "student"}
            ]

df = pd.DataFrame(job_list, columns=['name','job'])
df

Unnamed: 0,name,job
0,John,teacher
1,Nate,teacher
2,Fred,teacher
3,Abraham,student
4,Brian,student
5,Janny,developer
6,Nate,teacher
7,Obrian,dentist
8,Yuna,teacher
9,Rob,lawyer


In [53]:
# 컬럼(Series)의 unique() 함수를 사용하여, 중복없이, 컬럼에 있는 모든 값들을 출력

print(df.job.unique())

['teacher' 'student' 'developer' 'dentist' 'lawyer' 'banker'
 'basketball player']


In [54]:
# 각 유니크한 값별로 몇개의 데이터가 속하는지 value_counts()함수로 확인
print(df.job.value_counts())

teacher              8
student              5
lawyer               2
banker               2
developer            1
dentist              1
basketball player    1
Name: job, dtype: int64


### 두 개의 데이터 프레임 합치기

In [56]:
l1 = [{'name':'John', 'job':'teacher'},
         {'name':'Nate', 'job':'student'},
         {'name':'Fred', 'job':'developer'}]


l2 = [{'name':'Ed', 'job':'dentist'},
         {'name':'Jake', 'job':'farmer'},
         {'name':'Ted', 'job':'designer'}]

df1 = pd.DataFrame(l1,columns=['name','job'])
df2 = pd.DataFrame(l2,columns=['name','job'])

In [57]:
# 두번째 데이터프레임을 첫번째 데이터프레임의 새로운 row(행)으로 합침

frame = [df1, df2]
result = pd.concat(frame, ignore_index=True)
result

Unnamed: 0,name,job
0,John,teacher
1,Nate,student
2,Fred,developer
3,Ed,dentist
4,Jake,farmer
5,Ted,designer


In [60]:
l1 = [{'name':'John', 'job':'teacher'},
         {'name':'Nate', 'job':'student'},
         {'name':'Fred', 'job':'developer'}]


l2 = [{'name':'Ed', 'job':'dentist'},
         {'name':'Jake', 'job':'farmer'},
         {'name':'Ted', 'job':'designer'}]

df1 = pd.DataFrame(l1,columns=['name','job'])
df2 = pd.DataFrame(l2,columns=['name','job'])

result = df1.append(df2,  ignore_index=True)
result

  result = df1.append(df2,  ignore_index=True)


Unnamed: 0,name,job
0,John,teacher
1,Nate,student
2,Fred,developer
3,Ed,dentist
4,Jake,farmer
5,Ted,designer


In [59]:
# 두번째 데이터프레임을 첫번째 데이터프레임의 새로운 컬럼(열)으로 합치기

l1 = [{'name':'John', 'job':'teacher'},
         {'name':'Nate', 'job':'student'},
         {'name':'Fred', 'job':'developer'}]


l2 = [{'age':25, 'country':'U.S'},
       {'age':30, 'country':'U.K'},
       {'age':35, 'country':'Korea'}]

df1 = pd.DataFrame(l1,columns=['name','job'])
df2 = pd.DataFrame(l2,columns=['age','country'])


In [61]:
#행단위로 합칠 때
#pd.concat([df1,df2])

#열단위로 합칠때
pd.concat([df1,df2], axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3
0,John,teacher,Ed,dentist
1,Nate,student,Jake,farmer
2,Fred,developer,Ted,designer


### 두 개의 리스트를 묶어서 데이터프레임으로 생성하기

In [62]:
label = [1,2,3,4,5]
predication = [1,2,2,5,5]

comparison = pd.DataFrame({'label' : label, 'predication' : predication})   #값들을 딕셔너리로 전달
comparison

Unnamed: 0,label,predication
0,1,1
1,2,2
2,3,2
3,4,5
4,5,5
