In [1]:
import pandas as pd

In [2]:
# Sample records: one dict per person; the dict keys become the column labels.
friend_dict_list = [
    dict(name='Jone', age=15, job='student'),
    dict(name='Jenny', age=30, job='developer'),
    dict(name='Nate', age=30, job='teacher'),
]
# Passing `columns` pins the column order explicitly.
df = pd.DataFrame(data=friend_dict_list, columns=['name', 'age', 'job'])

In [3]:
df

Unnamed: 0,name,age,job
0,Jone,15,student
1,Jenny,30,developer
2,Nate,30,teacher


## Add Column / Update Column

### Add new column with default value

In [4]:
# Assigning a scalar to a new column label creates the column and
# broadcasts the value to every row.
df['salary'] = 0

In [5]:
df

Unnamed: 0,name,age,job,salary
0,Jone,15,student,0
1,Jenny,30,developer,0
2,Nate,30,teacher,0


### One-liner: add a column from a true/false condition

In [6]:
import numpy as np

In [7]:
# np.where picks 'yes' where the condition holds and 'no' elsewhere,
# so every non-student row is marked 'yes'.
df['salary'] = np.where(df['job'].ne('student'), 'yes', 'no')

In [8]:
df

Unnamed: 0,name,age,job,salary
0,Jone,15,student,no
1,Jenny,30,developer,yes
2,Nate,30,teacher,yes


### column derived from adding two existing columns

In [9]:
# Exam scores: one record per student.
friend_dict_list = [
    dict(name='John', midterm=95, final=85),
    dict(name='Jenny', midterm=85, final=80),
    dict(name='Nate', midterm=10, final=30),
]
df = pd.DataFrame(data=friend_dict_list, columns=['name', 'midterm', 'final'])

In [10]:
df

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,10,30


In [11]:
# Element-wise sum of the two score columns, aligned on the row index.
df['total'] = df['midterm'].add(df['final'])

In [12]:
df

Unnamed: 0,name,midterm,final,total
0,John,95,85,180
1,Jenny,85,80,165
2,Nate,10,30,40


In [13]:
# True division by the number of exams (2), so the result is a float column.
df['average'] = df['total'].div(2)

In [14]:
df

Unnamed: 0,name,midterm,final,total,average
0,John,95,85,180,90.0
1,Jenny,85,80,165,82.5
2,Nate,10,30,40,20.0


### Add a column from a multi-branch condition

In [19]:
# Letter grade per average: 90 and above -> 'A', 80 and above -> 'B',
# everything else -> 'F'. The branches are checked in that order.
grades = [
    'A' if average >= 90 else 'B' if average >= 80 else 'F'
    for average in df['average']
]

In [20]:
grades

['A', 'B', 'F']

In [21]:
# Attach the list as a new column; values are placed in row order,
# so the list needs one entry per row.
df['grades'] = grades

In [22]:
df

Unnamed: 0,name,midterm,final,total,average,grades
0,John,95,85,180,90.0,A
1,Jenny,85,80,165,82.5,B
2,Nate,10,30,40,20.0,F


### how to use apply function

In [23]:
def pass_or_fail(row):
    """Translate a letter grade into 'Pass' or 'Fail'.

    Only the exact value 'F' is a fail; any other value counts as a pass.
    """
    return 'Fail' if row == 'F' else 'Pass'

In [24]:
# Overwrite the letter grades with the Pass/Fail label returned for each value.
df['grades'] = df['grades'].apply(pass_or_fail)

`apply`는 컬럼의 각 값을 함수의 인자로 넘기고, 함수가 반환한 값을 그 자리에 넣어준다.

In [25]:
df

Unnamed: 0,name,midterm,final,total,average,grades
0,John,95,85,180,90.0,Pass
1,Jenny,85,80,165,82.5,Pass
2,Nate,10,30,40,20.0,Fail


### Feature extraction : info extraction using df.apply

In [26]:
# One record per date string, all stored under the same 'yyyy-mm-dd' key.
date_list = [
    {'yyyy-mm-dd': date_text}
    for date_text in ('2000-06-27', '2002-09-24', '2005-12-20')
]
df = pd.DataFrame(data=date_list, columns=['yyyy-mm-dd'])

In [27]:
df

Unnamed: 0,yyyy-mm-dd
0,2000-06-27
1,2002-09-24
2,2005-12-20


In [28]:
def extract_year(row):
    """Return the text before the first '-' of a date string.

    For a 'yyyy-mm-dd' value this is the year, still as a string.
    A value without any '-' is returned unchanged.
    """
    year_text, _, _ = row.partition('-')
    return year_text

In [29]:
# Series.apply calls extract_year once per value of the date column;
# the extracted years are stored as strings.
df['year'] = df['yyyy-mm-dd'].apply(extract_year)

In [30]:
df

Unnamed: 0,yyyy-mm-dd,year
0,2000-06-27,2000
1,2002-09-24,2002
2,2005-12-20,2005


### passing keyword parameter to apply function

In [31]:
def extract_year(year, current_year):
    """Return the number of years between `year` and `current_year`.

    Despite its name, this function computes an age rather than extracting
    a year, and it replaces any earlier `extract_year` definition.

    year: birth year; converted with int(), so a numeric string is accepted.
    current_year: the reference year to subtract from.
    """
    birth_year = int(year)
    return current_year - birth_year

In [36]:
# Extra keyword arguments given to Series.apply are forwarded to the function
# on every call, so each value v is processed as extract_year(v, current_year=2018).
df['age'] = df['year'].apply(
    extract_year,
    current_year=2018,
)

In [37]:
df

Unnamed: 0,yyyy-mm-dd,year,age
0,2000-06-27,2000,18
1,2002-09-24,2002,16
2,2005-12-20,2005,13


### how to use map function

In [40]:
def extract_year(row):
    """Return the leading year part of a 'yyyy-mm-dd' string, as a string.

    The value is cut at the first '-'; with no '-' the whole value comes back.
    """
    leading_part = row.split('-', 1)[0]
    return leading_part

# Rebuild the date frame from scratch so this section starts without the
# columns added earlier.
date_list = [
    {"yyyy-mm-dd": "2000-06-27"},
    {"yyyy-mm-dd": "2002-09-24"},
    {"yyyy-mm-dd": "2005-12-20"},
]
df = pd.DataFrame(data=date_list, columns=["yyyy-mm-dd"])

In [39]:
df

Unnamed: 0,yyyy-mm-dd
0,2000-06-27
1,2002-09-24
2,2005-12-20


In [41]:
# Series.map with a function calls it once per value of the column and
# stores the returned values as the new 'year' column.
df['year'] = df['yyyy-mm-dd'].map(extract_year)

In [42]:
df

Unnamed: 0,yyyy-mm-dd,year
0,2000-06-27,2000
1,2002-09-24,2002
2,2005-12-20,2005


In [43]:
# No `columns` argument here: the column labels are taken from the dict keys.
job_list = [
    dict(age=20, job='student'),
    dict(age=30, job='developer'),
    dict(age=30, job='teacher'),
]
df = pd.DataFrame(data=job_list)

In [44]:
df

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher


In [45]:
# Series.map with a dict replaces each value by its dict entry
# (values missing from the dict would become NaN).
df['job'] = df['job'].map({
    'student': 1,
    'developer': 2,
    'teacher': 3,
})

In [46]:
df

Unnamed: 0,age,job
0,20,1
1,30,2
2,30,3


## Applymap
- DataFrame의 모든 요소에 함수를 한 번에 적용할 때 사용한다

In [47]:
# Small float frame with positive and negative values, including .5 cases.
x_y = [
    {'x': x_value, 'y': y_value}
    for x_value, y_value in ((5.5, -5.6), (-5.2, 5.5), (-1.6, -4.5))
]
df = pd.DataFrame(data=x_y)

In [48]:
df

Unnamed: 0,x,y
0,5.5,-5.6
1,-5.2,5.5
2,-1.6,-4.5


In [49]:
# Apply np.around to every element of the frame.
# `DataFrame.applymap` is deprecated since pandas 2.1 in favour of the
# equivalent `DataFrame.map`, so use `map` when this pandas version has it
# and fall back to `applymap` on older versions. The check is made on the
# class so that a column named 'map' cannot be mistaken for the method.
# Note: np.around rounds halves to the nearest even value (-4.5 -> -4.0).
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = elementwise(np.around)

In [50]:
df

Unnamed: 0,x,y
0,6.0,-6.0
1,-5.0,6.0
2,-2.0,-4.0


## Add row

In [59]:
# Fresh copy of the exam-score frame for the row-appending example.
friend_dict_list = [
    {'name': name, 'midterm': midterm, 'final': final}
    for name, midterm, final in (
        ('John', 95, 85),
        ('Jenny', 85, 80),
        ('Nate', 10, 30),
    )
]
df = pd.DataFrame(data=friend_dict_list, columns=['name', 'midterm', 'final'])

In [52]:
df

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,10,30


In [54]:
# Single-row frame with the same columns as `df`, holding the row to add.
df2 = pd.DataFrame(
    data={'name': ['Ben'], 'midterm': [50], 'final': [50]},
    columns=['name', 'midterm', 'final'],
)

In [55]:
df2

Unnamed: 0,name,midterm,final
0,Ben,50,50


In [60]:
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# where it raises AttributeError; `pd.concat` is the supported way to stack
# rows. `ignore_index=True` renumbers the result 0..n-1. Neither input frame
# is modified, and the combined frame is only displayed here, not assigned.
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,10,30
3,Ben,50,50
