In [None]:
import pandas as pd

##### Series

In [1]:
# 0~9까지 랜덤한 5개의 데이터를 Series 생성
data = pd.Series(np.random.randint(10, size=(5)))
data

0    6
1    1
2    5
3    4
4    9
dtype: int64

In [2]:
# index 설정
data = pd.Series(np.random.randint(10, size=5), index=["A","B","C","D","E"])
data

A    4
B    3
C    0
D    1
E    5
dtype: int64

In [3]:
data.index, data.values

(Index(['A', 'B', 'C', 'D', 'E'], dtype='object'), array([4, 3, 0, 1, 5]))

In [4]:
# value 값 확인
data.A, data.D

(4, 1)

In [5]:
data = pd.Series(np.random.randint(10, size=5), index=["1","2","3","4","5"])
data

1    2
2    5
3    0
4    3
5    3
dtype: int64

In [6]:
data.1

SyntaxError: invalid syntax (<ipython-input-6-d42d59a15bba>, line 1)

In [7]:
data

1    2
2    5
3    0
4    3
5    3
dtype: int64

In [8]:
# series에 이름과 인덱스에 이름을 설정할수 있습니다.
data.name = "random_number"
data.index.name = "index_number"
data

index_number
1    2
2    5
3    0
4    3
5    3
Name: random_number, dtype: int64

In [10]:
data = pd.Series(np.random.randint(10, size=5), index=["A","B","C","D","E"])
data

A    6
B    0
C    4
D    9
E    8
dtype: int64

In [11]:
data * 10

A    60
B     0
C    40
D    90
E    80
dtype: int64

In [14]:
data[["B","C","E"]]

B    0
C    4
E    8
dtype: int64

In [18]:
data[1::2]

B    0
D    9
dtype: int64

In [19]:
data[::-1]

E    8
D    9
C    4
B    0
A    6
dtype: int64

In [22]:
data > 5

A     True
B    False
C    False
D     True
E     True
dtype: bool

In [23]:
data[data > 5]

A    6
D    9
E    8
dtype: int64

In [25]:
# for문 사용 - list comprehention 으로도 사용이 가능
# [idx, val for idx, val in data.items() ]
for idx, val in data.items():
    print(idx, val)

A 6
B 0
C 4
D 9
E 8


In [30]:
# dictionary 데이터 타입의 데이터로 series 생성 가능
dic = {"D":3, "F":7, "E":5}
data2 = pd.Series(dic)
data2

D    3
E    5
F    7
dtype: int64

In [31]:
data

A    6
B    0
C    4
D    9
E    8
dtype: int64

In [32]:
data2

D    3
E    5
F    7
dtype: int64

In [33]:
result = data + data2
result

A     NaN
B     NaN
C     NaN
D    12.0
E    13.0
F     NaN
dtype: float64

In [36]:
# NaN 데이터 제거
print(result.notnull())
result[result.notnull()]

A    False
B    False
C    False
D     True
E     True
F    False
dtype: bool


D    12.0
E    13.0
dtype: float64

##### Dataframe
- row(index), value, column으로 이루져 있습니다.
- make
- insert
    - row
    - column
- append
- concat
- groupby, aggregate
- select
- merge

##### make

In [37]:
# 컬럼을 만들고 컬럼에 리스트 데이터를 추가해서 만드는 방법
df = pd.DataFrame(columns=["Email", "Name"])
df

Unnamed: 0,Email,Name


In [38]:
df["Name"] = ["fcamp", "dss"]
df["Email"] = ["fcamp@gmail.com", "dss@gmail.com"]
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [41]:
df["Name"]

0    fcamp
1      dss
Name: Name, dtype: object

In [42]:
df["Email"]

0    fcamp@gmail.com
1      dss@gmail.com
Name: Email, dtype: object

In [43]:
# 딕셔너리 데이터 타입을 Dataframe으로 만들기
name = ["fcamp", "dss"]
email = ["fcamp@gmail.com", "dss@gmail.com"]
dic = {"Name":name, "Email":email}
df = pd.DataFrame(dic)
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [44]:
dic

{'Email': ['fcamp@gmail.com', 'dss@gmail.com'], 'Name': ['fcamp', 'dss']}

In [46]:
# 인덱스를 추가해서 만들기
index_list = ["one", "two"]
df = pd.DataFrame(dic, index=index_list)
df

Unnamed: 0,Email,Name
one,fcamp@gmail.com,fcamp
two,dss@gmail.com,dss


In [49]:
df.index, df.columns, df.values

(Index(['one', 'two'], dtype='object'),
 Index(['Email', 'Name'], dtype='object'),
 array([['fcamp@gmail.com', 'fcamp'],
        ['dss@gmail.com', 'dss']], dtype=object))

##### Insert
- row
- column

In [50]:
# row
name = ["fcamp", "dss"]
email = ["fcamp@gmail.com", "dss@gmail.com"]
dic = {"Name":name, "Email":email}
df = pd.DataFrame(dic)
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [51]:
df.loc[0]

Email    fcamp@gmail.com
Name               fcamp
Name: 0, dtype: object

In [52]:
df.loc[1]

Email    dss@gmail.com
Name               dss
Name: 1, dtype: object

In [53]:
# loc 지정해서 데이터를 넣는 방법
df.loc[2] = {"Email":"data@gmail.com", "Name":"data"}
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss
2,data@gmail.com,data


In [56]:
# loc 이용해서 항상 가장 마지막에 넣는 방법
print(len(df))
df.loc[len(df)] = {"Email":"data2@gmail.com", "Name":"data2"}
df

4


Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss
2,data@gmail.com,data
3,data2@gmail.com,data2
4,data2@gmail.com,data2


In [60]:
# column
df["Address"] = ""
df

Unnamed: 0,Email,Name,Address
0,fcamp@gmail.com,fcamp,
1,dss@gmail.com,dss,
2,data@gmail.com,data,
3,data2@gmail.com,data2,
4,data2@gmail.com,data2,


In [63]:
df["Address"] = ["Seoul", "Busan", "Jeju", "Deagu", "Seoul"]
df

Unnamed: 0,Email,Name,Address
0,fcamp@gmail.com,fcamp,Seoul
1,dss@gmail.com,dss,Busan
2,data@gmail.com,data,Jeju
3,data2@gmail.com,data2,Deagu
4,data2@gmail.com,data2,Seoul


In [65]:
# apply
# 함수를 사용해서 함수의 리턴값이 데이터로 들어갑니다.
def count_char(name):
    return "{}({})".format(name, len(name))

df["Name_Count"] = df["Name"].apply(count_char)
df

Unnamed: 0,Email,Name,Address,Name_Count
0,fcamp@gmail.com,fcamp,Seoul,fcamp(5)
1,dss@gmail.com,dss,Busan,dss(3)
2,data@gmail.com,data,Jeju,data(4)
3,data2@gmail.com,data2,Deagu,data2(5)
4,data2@gmail.com,data2,Seoul,data2(5)


In [66]:
df["Address_Count"] = df["Address"].apply(lambda addr:"{}({})".format(addr, len(addr)))
df

Unnamed: 0,Email,Name,Address,Name_Count,Address_Count
0,fcamp@gmail.com,fcamp,Seoul,fcamp(5),Seoul(5)
1,dss@gmail.com,dss,Busan,dss(3),Busan(5)
2,data@gmail.com,data,Jeju,data(4),Jeju(4)
3,data2@gmail.com,data2,Deagu,data2(5),Deagu(5)
4,data2@gmail.com,data2,Seoul,data2(5),Seoul(5)


In [119]:
# append
# 사람으 이름과 나이가 들어간 데이터를 만듭니다.
import random, string

def get_name():
    names = ["Adam", "Alan", "Alex", "Alvin", "Andrew", "Anthony", "Arnold", "Jin", "Billy", "Anchal"]
    return random.choice(names)
    
get_name()

'Adam'

In [120]:
def get_age(start=20, end=40):
    return random.randint(start, end)

get_age()

32

In [121]:
# list
def make_data(rows=10):
    datas = []
    for _ in range(rows):
        data = {"Age":get_age(), "Name":get_name()}
        datas.append(data)
    return datas

make_data()

[{'Age': 37, 'Name': 'Arnold'},
 {'Age': 37, 'Name': 'Anchal'},
 {'Age': 31, 'Name': 'Adam'},
 {'Age': 35, 'Name': 'Billy'},
 {'Age': 21, 'Name': 'Anthony'},
 {'Age': 24, 'Name': 'Adam'},
 {'Age': 30, 'Name': 'Billy'},
 {'Age': 22, 'Name': 'Anchal'},
 {'Age': 29, 'Name': 'Anchal'},
 {'Age': 38, 'Name': 'Adam'}]

In [123]:
data1 = make_data()
df1 = pd.DataFrame(data1)
df1

Unnamed: 0,Age,Name
0,21,Alex
1,30,Anchal
2,38,Arnold
3,25,Billy
4,24,Billy
5,38,Anchal
6,38,Jin
7,31,Anthony
8,30,Alan
9,26,Billy


In [124]:
data2 = make_data()
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,Age,Name
0,38,Arnold
1,30,Jin
2,30,Anchal
3,30,Arnold
4,29,Jin
5,29,Alvin
6,30,Adam
7,30,Alvin
8,35,Adam
9,36,Jin


In [125]:
# df1과 df2를 합치고 싶을때 append를 이용할수 있습니다.
df3 = df1.append(df2)
df3

Unnamed: 0,Age,Name
0,21,Alex
1,30,Anchal
2,38,Arnold
3,25,Billy
4,24,Billy
5,38,Anchal
6,38,Jin
7,31,Anthony
8,30,Alan
9,26,Billy


In [132]:
# index 리셋하기
# drop(True) - 새롭게 생성되는 인덱스 컬럼을 삭제합니다.
# inplace(True) - 함수를 사용하는 객체 자체 인덱스를 리셋합니다.
df3.reset_index(drop=True, inplace=True)
df3

Unnamed: 0,Age,Name
0,21,Alex
1,30,Anchal
2,38,Arnold
3,25,Billy
4,24,Billy
5,38,Anchal
6,38,Jin
7,31,Anthony
8,30,Alan
9,26,Billy


In [133]:
# append를 할때 인덱스를 리셋
df3 = df1.append(df2, ignore_index=True)
df3

Unnamed: 0,Age,Name
0,21,Alex
1,30,Anchal
2,38,Arnold
3,25,Billy
4,24,Billy
5,38,Anchal
6,38,Jin
7,31,Anthony
8,30,Alan
9,26,Billy


##### concat
- rows
- columns

In [134]:
# concat rows
df1

Unnamed: 0,Age,Name
0,21,Alex
1,30,Anchal
2,38,Arnold
3,25,Billy
4,24,Billy
5,38,Anchal
6,38,Jin
7,31,Anthony
8,30,Alan
9,26,Billy


In [135]:
df2

Unnamed: 0,Age,Name
0,38,Arnold
1,30,Jin
2,30,Anchal
3,30,Arnold
4,29,Jin
5,29,Alvin
6,30,Adam
7,30,Alvin
8,35,Adam
9,36,Jin


In [137]:
df3 = pd.concat([df1, df2]).reset_index(drop=True)
df3

Unnamed: 0,Age,Name
0,21,Alex
1,30,Anchal
2,38,Arnold
3,25,Billy
4,24,Billy
5,38,Anchal
6,38,Jin
7,31,Anthony
8,30,Alan
9,26,Billy


In [138]:
# concat colums
# axis = 1 설정하면 가로로 합쳐집니다.
pd.concat([df3, df1], axis=1)

Unnamed: 0,Age,Name,Age.1,Name.1
0,21,Alex,21.0,Alex
1,30,Anchal,30.0,Anchal
2,38,Arnold,38.0,Arnold
3,25,Billy,25.0,Billy
4,24,Billy,24.0,Billy
5,38,Anchal,38.0,Anchal
6,38,Jin,38.0,Jin
7,31,Anthony,31.0,Anthony
8,30,Alan,30.0,Alan
9,26,Billy,26.0,Billy


In [140]:
df4 = pd.concat([df1, df3], axis=1, join='inner')
df4

Unnamed: 0,Age,Name,Age.1,Name.1
0,21,Alex,21,Alex
1,30,Anchal,30,Anchal
2,38,Arnold,38,Arnold
3,25,Billy,25,Billy
4,24,Billy,24,Billy
5,38,Anchal,38,Anchal
6,38,Jin,38,Jin
7,31,Anthony,31,Anthony
8,30,Alan,30,Alan
9,26,Billy,26,Billy


##### Group by
- 이름별 평균 나이를 나타내는 데이터 프레임을 만들겁니다.

In [144]:
# 20명에 대한 이름과 나이를 나타내는 데이터 프레임을 만듭니다.
g_df = pd.DataFrame(make_data(20))
g_df.tail()

Unnamed: 0,Age,Name
15,27,Arnold
16,36,Anchal
17,32,Andrew
18,26,Alan
19,32,Billy


In [158]:
# 이름을 unique로 출력
result1 = np.array(list(set(g_df["Name"].values)))
len(result1), result1

(9, array(['Alan', 'Alex', 'Anthony', 'Adam', 'Arnold', 'Billy', 'Jin',
        'Andrew', 'Anchal'], dtype='<U7'))

In [152]:
# pandas의 unique를 이용하여 유니크한 이름을 출력
result2 = g_df["Name"].unique()
len(result2), result2

(9, array(['Anthony', 'Anchal', 'Alex', 'Billy', 'Andrew', 'Jin', 'Alan',
        'Arnold', 'Adam'], dtype=object))

In [164]:
# groupby - size
result_df = g_df.groupby("Name").size().reset_index(name="counts")
result_df

Unnamed: 0,Name,counts
0,Adam,1
1,Alan,2
2,Alex,1
3,Anchal,2
4,Andrew,3
5,Anthony,4
6,Arnold,2
7,Billy,3
8,Jin,2


In [170]:
# sort values
result_df = result_df.sort_values(by=["counts"], ascending=False)
result_df.reset_index(drop=True, inplace=True)
result_df

Unnamed: 0,Name,counts
0,Anthony,4
1,Andrew,3
2,Billy,3
3,Alan,2
4,Anchal,2
5,Arnold,2
6,Jin,2
7,Adam,1
8,Alex,1


In [173]:
# agg : min
# 나이가 제일 어린 나이로 name 그룹핑 합니다.
g_df.groupby("Name").agg("min").reset_index()

Unnamed: 0,Name,Age
0,Adam,20
1,Alan,26
2,Alex,40
3,Anchal,36
4,Andrew,32
5,Anthony,20
6,Arnold,27
7,Billy,23
8,Jin,27


In [174]:
# 가장 나이가 많은 이름으로 그룹핑
g_df.groupby("Name").agg("max").reset_index()

Unnamed: 0,Name,Age
0,Adam,20
1,Alan,33
2,Alex,40
3,Anchal,38
4,Andrew,40
5,Anthony,34
6,Arnold,32
7,Billy,40
8,Jin,28


In [175]:
# agg : mean
g_df.groupby("Name").agg("mean").reset_index()

Unnamed: 0,Name,Age
0,Adam,20.0
1,Alan,29.5
2,Alex,40.0
3,Anchal,37.0
4,Andrew,35.666667
5,Anthony,24.75
6,Arnold,29.5
7,Billy,31.666667
8,Jin,27.5


In [176]:
# agg : sum
g_df.groupby("Name").agg("sum").reset_index()

Unnamed: 0,Name,Age
0,Adam,20
1,Alan,59
2,Alex,40
3,Anchal,74
4,Andrew,107
5,Anthony,99
6,Arnold,59
7,Billy,95
8,Jin,55


In [177]:
# agg : median
g_df.groupby("Name").agg("median").reset_index()

Unnamed: 0,Name,Age
0,Adam,20.0
1,Alan,29.5
2,Alex,40.0
3,Anchal,37.0
4,Andrew,35.0
5,Anthony,22.5
6,Arnold,29.5
7,Billy,32.0
8,Jin,27.5


In [180]:
# agg으로 여러개 컬럼 생성
df = g_df.groupby("Name").agg(["min","max","mean"]).reset_index()
df

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Adam,20,20,20.0
1,Alan,26,33,29.5
2,Alex,40,40,40.0
3,Anchal,36,38,37.0
4,Andrew,32,40,35.666667
5,Anthony,20,34,24.75
6,Arnold,27,32,29.5
7,Billy,23,40,31.666667
8,Jin,27,28,27.5


In [179]:
# select

In [182]:
df.head()

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Adam,20,20,20.0
1,Alan,26,33,29.5
2,Alex,40,40,40.0
3,Anchal,36,38,37.0
4,Andrew,32,40,35.666667


In [185]:
df.tail(3)

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
6,Arnold,27,32,29.5
7,Billy,23,40,31.666667
8,Jin,27,28,27.5


In [186]:
df.tail(n=7)

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
2,Alex,40,40,40.0
3,Anchal,36,38,37.0
4,Andrew,32,40,35.666667
5,Anthony,20,34,24.75
6,Arnold,27,32,29.5
7,Billy,23,40,31.666667
8,Jin,27,28,27.5


In [187]:
df[3:6]

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
3,Anchal,36,38,37.0
4,Andrew,32,40,35.666667
5,Anthony,20,34,24.75


In [191]:
df[3:]

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
3,Anchal,36,38,37.0
4,Andrew,32,40,35.666667
5,Anthony,20,34,24.75
6,Arnold,27,32,29.5
7,Billy,23,40,31.666667
8,Jin,27,28,27.5


In [192]:
df.loc[3]

Name          Anchal
Age   min         36
      max         38
      mean        37
Name: 3, dtype: object

In [193]:
df

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Adam,20,20,20.0
1,Alan,26,33,29.5
2,Alex,40,40,40.0
3,Anchal,36,38,37.0
4,Andrew,32,40,35.666667
5,Anthony,20,34,24.75
6,Arnold,27,32,29.5
7,Billy,23,40,31.666667
8,Jin,27,28,27.5


In [194]:
df.loc[2]["Age"]["min"]

40

In [196]:
df.loc[3]["Name"][""]

'Anchal'

In [198]:
data = {
    "Name":df["Name"],
    "Min":df["Age"]["min"],
    "Max":df["Age"]["max"],
    "Mean":df["Age"]["mean"],
}
n_df = pd.DataFrame(data)
n_df

Unnamed: 0,Max,Mean,Min,Name
0,20,20.0,20,Adam
1,33,29.5,26,Alan
2,40,40.0,40,Alex
3,38,37.0,36,Anchal
4,40,35.666667,32,Andrew
5,34,24.75,20,Anthony
6,32,29.5,27,Arnold
7,40,31.666667,23,Billy
8,28,27.5,27,Jin


In [201]:
# 평균나이가 30세 이상이 데이터를 내림차순으로 정렬하고 인덱스를 재설정
n_df[n_df["Mean"]>30].sort_values(by=["Mean"], ascending=False).reset_index(drop=True)

Unnamed: 0,Max,Mean,Min,Name
0,40,40.0,40,Alex
1,38,37.0,36,Anchal
2,40,35.666667,32,Andrew
3,40,31.666667,23,Billy


In [204]:
g_df

Unnamed: 0,Age,Name
0,34,Anthony
1,38,Anchal
2,40,Alex
3,40,Billy
4,35,Andrew
5,27,Jin
6,40,Andrew
7,33,Alan
8,23,Anthony
9,28,Jin


In [205]:
n_df

Unnamed: 0,Max,Mean,Min,Name
0,20,20.0,20,Adam
1,33,29.5,26,Alan
2,40,40.0,40,Alex
3,38,37.0,36,Anchal
4,40,35.666667,32,Andrew
5,34,24.75,20,Anthony
6,32,29.5,27,Arnold
7,40,31.666667,23,Billy
8,28,27.5,27,Jin


In [206]:
n_df["Count"] = list(g_df.groupby("Name").size())
n_df

Unnamed: 0,Max,Mean,Min,Name,Count
0,20,20.0,20,Adam,1
1,33,29.5,26,Alan,2
2,40,40.0,40,Alex,1
3,38,37.0,36,Anchal,2
4,40,35.666667,32,Andrew,3
5,34,24.75,20,Anthony,4
6,32,29.5,27,Arnold,2
7,40,31.666667,23,Billy,3
8,28,27.5,27,Jin,2


In [208]:
# drop - mean 데이터를 가장 뒤로 이동시키겠습니다.
mean = n_df["Mean"]
n_df.drop("Mean", axis=1, inplace=True)
n_df

Unnamed: 0,Max,Min,Name,Count
0,20,20,Adam,1
1,33,26,Alan,2
2,40,40,Alex,1
3,38,36,Anchal,2
4,40,32,Andrew,3
5,34,20,Anthony,4
6,32,27,Arnold,2
7,40,23,Billy,3
8,28,27,Jin,2


In [209]:
n_df["Mean"] = mean

In [210]:
n_df

Unnamed: 0,Max,Min,Name,Count,Mean
0,20,20,Adam,1,20.0
1,33,26,Alan,2,29.5
2,40,40,Alex,1,40.0
3,38,36,Anchal,2,37.0
4,40,32,Andrew,3,35.666667
5,34,20,Anthony,4,24.75
6,32,27,Arnold,2,29.5
7,40,23,Billy,3,31.666667
8,28,27,Jin,2,27.5


In [212]:
# rename colum
n_df.rename(columns={"Max":"Maximum","Name":"Unique_Name"})

Unnamed: 0,Maximum,Min,Unique_Name,Count,Mean
0,20,20,Adam,1,20.0
1,33,26,Alan,2,29.5
2,40,40,Alex,1,40.0
3,38,36,Anchal,2,37.0
4,40,32,Andrew,3,35.666667
5,34,20,Anthony,4,24.75
6,32,27,Arnold,2,29.5
7,40,23,Billy,3,31.666667
8,28,27,Jin,2,27.5


##### Merge = sql(join)
- user_df : 아이디, 이름, 나이 데이터 프레임 생성
- money_df : 아이디, 돈 데이터 프레임을 생성

In [213]:
user_df = pd.DataFrame(columns=["UserID", "Name", "Age"])
for idx in range(1,9):
    name = get_name()
    
    # 중복 이름 제거
    while name in list(user_df["Name"]):
        name = get_name()
        
    # 데이터 name_df insert
    data = {"Name":name, "UserID":idx, "Age":get_age()}
    user_df.loc[len(user_df)] = data
    
user_df

Unnamed: 0,UserID,Name,Age
0,1,Anthony,38
1,2,Billy,33
2,3,Alex,27
3,4,Adam,36
4,5,Arnold,21
5,6,Andrew,32
6,7,Alvin,26
7,8,Jin,38


In [214]:
money_df = pd.DataFrame(columns=["ID", "Money"])

for idx in range(15):
    money = random.randint(1, 20) * 1000
    data = {"Money":money, "ID":random.randint(1, 8)}
    money_df.loc[len(money_df)] = data
    
money_df    

Unnamed: 0,ID,Money
0,8,9000
1,3,15000
2,5,10000
3,6,3000
4,5,3000
5,4,14000
6,6,7000
7,7,14000
8,3,10000
9,7,19000


In [216]:
# merge - user_df, money_df - key:ID, UserID
# money 데이터 기준으로 merge
money_df.merge(user_df, left_on="ID", right_on="UserID")

Unnamed: 0,ID,Money,UserID,Name,Age
0,8,9000,8,Jin,38
1,8,7000,8,Jin,38
2,3,15000,3,Alex,27
3,3,10000,3,Alex,27
4,5,10000,5,Arnold,21
5,5,3000,5,Arnold,21
6,6,3000,6,Andrew,32
7,6,7000,6,Andrew,32
8,6,13000,6,Andrew,32
9,4,14000,4,Adam,36


In [218]:
# money 데이터 기준으로 merge
user_df.merge(money_df, left_on="UserID", right_on="ID")

Unnamed: 0,UserID,Name,Age,ID,Money
0,2,Billy,33,2,14000
1,2,Billy,33,2,14000
2,3,Alex,27,3,15000
3,3,Alex,27,3,10000
4,4,Adam,36,4,14000
5,4,Adam,36,4,2000
6,5,Arnold,21,5,10000
7,5,Arnold,21,5,3000
8,6,Andrew,32,6,3000
9,6,Andrew,32,6,7000


In [220]:
user_df.rename(columns={"UserID":"ID"}, inplace=True)
user_df

Unnamed: 0,ID,Name,Age
0,1,Anthony,38
1,2,Billy,33
2,3,Alex,27
3,4,Adam,36
4,5,Arnold,21
5,6,Andrew,32
6,7,Alvin,26
7,8,Jin,38


In [224]:
result_df = pd.merge(money_df, user_df)
result_df

Unnamed: 0,ID,Money,Name,Age
0,8,9000,Jin,38
1,8,7000,Jin,38
2,3,15000,Alex,27
3,3,10000,Alex,27
4,5,10000,Arnold,21
5,5,3000,Arnold,21
6,6,3000,Andrew,32
7,6,7000,Andrew,32
8,6,13000,Andrew,32
9,4,14000,Adam,36


In [231]:
money_list = result_df.groupby("Name").sum()["Money"].reset_index()
money_list

Unnamed: 0,Name,Money
0,Adam,16000
1,Alex,25000
2,Alvin,33000
3,Andrew,23000
4,Arnold,13000
5,Billy,28000
6,Jin,16000


In [230]:
df

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Adam,20,20,20.0
1,Alan,26,33,29.5
2,Alex,40,40,40.0
3,Anchal,36,38,37.0
4,Andrew,32,40,35.666667
5,Anthony,20,34,24.75
6,Arnold,27,32,29.5
7,Billy,23,40,31.666667
8,Jin,27,28,27.5


In [234]:
df1

Unnamed: 0,Age,Name
0,21,Alex
1,30,Anchal
2,38,Arnold
3,25,Billy
4,24,Billy
5,38,Anchal
6,38,Jin
7,31,Anthony
8,30,Alan
9,26,Billy


In [237]:
# 38세 데이터만 필터링
df1[df1["Age"] == 38]

Unnamed: 0,Age,Name
2,38,Arnold
5,38,Anchal
6,38,Jin


In [239]:
money_list

Unnamed: 0,Name,Money
0,Adam,16000
1,Alex,25000
2,Alvin,33000
3,Andrew,23000
4,Arnold,13000
5,Billy,28000
6,Jin,16000


In [241]:
# merge - outer
# fillna - NaN을 특정 데이터로 채워줍니다.
result = pd.merge(user_df, money_list, how='outer').fillna(value=0)
result

Unnamed: 0,ID,Name,Age,Money
0,1,Anthony,38,0.0
1,2,Billy,33,28000.0
2,3,Alex,27,25000.0
3,4,Adam,36,16000.0
4,5,Arnold,21,13000.0
5,6,Andrew,32,23000.0
6,7,Alvin,26,33000.0
7,8,Jin,38,16000.0


In [244]:
# change data type
result["Money"] = result["Money"].astype("int")
result

Unnamed: 0,ID,Name,Age,Money
0,1,Anthony,38,0
1,2,Billy,33,28000
2,3,Alex,27,25000
3,4,Adam,36,16000
4,5,Arnold,21,13000
5,6,Andrew,32,23000
6,7,Alvin,26,33000
7,8,Jin,38,16000


##### Dataframe Input / Output
- csv, excel
- `$ pip3 install xlrd`
- `$ pip3 install openpyxl`

In [247]:
result

Unnamed: 0,ID,Name,Age,Money
0,1,Anthony,38,0
1,2,Billy,33,28000
2,3,Alex,27,25000
3,4,Adam,36,16000
4,5,Arnold,21,13000
5,6,Andrew,32,23000
6,7,Alvin,26,33000
7,8,Jin,38,16000


In [249]:
# save csv
result.to_csv('foo.csv', index=False)

In [250]:
# load csv
df = pd.read_csv('foo.csv')
df

Unnamed: 0,ID,Name,Age,Money
0,1,Anthony,38,0
1,2,Billy,33,28000
2,3,Alex,27,25000
3,4,Adam,36,16000
4,5,Arnold,21,13000
5,6,Andrew,32,23000
6,7,Alvin,26,33000
7,8,Jin,38,16000


In [257]:
!pwd

/Users/rada/Documents/fastcampus/dss8/02_numpy_pandas/B


In [258]:
# excel 은 저장 되는 인코딩 타입을 확인해야 합니다.(utf-8을 사용하지 않습니다.)
df.to_excel('../ttt/foo.xlsx', sheet_name='Sheet1')

In [259]:
path = "/Users/rada/Documents/fastcampus/dss8/02_numpy_pandas/B/test/"
df.to_excel(path+'foo.xlsx', sheet_name='Sheet1')

In [255]:
# load excel
df = pd.read_excel('foo.xlsx', 'Sheet1')
df

Unnamed: 0,ID,Name,Age,Money
0,1,Anthony,38,0
1,2,Billy,33,28000
2,3,Alex,27,25000
3,4,Adam,36,16000
4,5,Arnold,21,13000
5,6,Andrew,32,23000
6,7,Alvin,26,33000
7,8,Jin,38,16000


##### pivot
- 데이터 프레임의 컬럼 데이터에서 index, columns, values를 선택해서 데이터 프레임을 만드는 방법
- `df.pivot(index, columns, values)`

In [1]:
# 타이타닉 데이터
# Survived - 0:no, 1:yes
titanic = pd.read_csv("train.csv")
titanic.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


```
등급      1    2    3 
성별
----------------------
female  000  000  000
male    000  000  000
```

In [3]:
# groupby
titanic_df1 = titanic.groupby(["Sex", "Pclass"]).size().reset_index(name="Counts")
titanic_df1

Unnamed: 0,Sex,Pclass,Counts
0,female,1,94
1,female,2,76
2,female,3,144
3,male,1,122
4,male,2,108
5,male,3,347


In [4]:
titanic_df1 = titanic_df1.pivot("Sex", "Pclass", "Counts")
titanic_df1

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,94,76,144
male,122,108,347


In [5]:
# 생존 데이터
# 성별에 따른 생존자 수를 나타내는 데이터 프레임을 만드세요

```
생존      0    1   
성별
------------------
female  000  000  
male    000  000  
```

In [7]:
df2 = titanic.groupby(["Sex","Survived"]).size().reset_index(name="Counts")
df2

Unnamed: 0,Sex,Survived,Counts
0,female,0,81
1,female,1,233
2,male,0,468
3,male,1,109


In [8]:
df2.pivot("Sex","Survived","Counts")

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [9]:
# 객실등급에 따른 생존자수 
df3 = titanic.groupby(["Pclass","Survived"]).size().reset_index(name="Counts")
df3

Unnamed: 0,Pclass,Survived,Counts
0,1,0,80
1,1,1,136
2,2,0,97
3,2,1,87
4,3,0,372
5,3,1,119


In [11]:
df3.pivot("Pclass","Survived","Counts")

Survived,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80,136
2,97,87
3,372,119


##### pivot table
- `pivot_table(values, index, columns, aggfunc)`
- fill_value : NaN 데이터를 우리가 설정한 데이터로 치환해주는 파라미터 입니다.
- dropna(True) : NaN 데이터 컬럼을 놔둘지 제거할지 결정할때 사용됩니다.

In [13]:
titanic["Counts"] = 1
titanic.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Counts
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C,1
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q,1


In [14]:
# 객실등급에 따른 남녀 승객수
titanic.pivot_table("Counts", ["Sex"], ["Pclass"], aggfunc=np.sum)

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,94,76,144
male,122,108,347


In [18]:
# 성별에 따른 생존자 수
titanic.pivot_table("Counts", ["Sex"], ["Survived"], aggfunc=np.sum)

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [17]:
# 객실등급(Pclass)에 따른 생존자 수 
titanic.pivot_table("Counts", ["Pclass"], ["Survived"], aggfunc=np.sum)

Survived,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80,136
2,97,87
3,372,119


In [19]:
titanic.pivot_table("Counts", ["Sex","Pclass"], ["Survived"], aggfunc=np.sum)

Unnamed: 0_level_0,Survived,0,1
Sex,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
female,1,3,91
female,2,6,70
female,3,72,72
male,1,77,45
male,2,91,17
male,3,300,47


In [21]:
df = titanic.pivot_table("Counts", ["Survived"], ["Sex"], aggfunc=np.sum)
df

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,468
1,233,109


In [25]:
df["total"] = df["female"] + df["male"]
df

Sex,female,male,total
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,81,468,549
1,233,109,342


In [27]:
df.loc["total"] = df.loc[0] +  df.loc[1]
df

Sex,female,male,total
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,81,468,549
1,233,109,342
total,314,577,891


In [28]:
# axis = 0 : row 를 삭제합니다.
df.drop("total", inplace=True)
df

Sex,female,male,total
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,81,468,549
1,233,109,342


In [29]:
# axis = 1 : column 를 삭제합니다.
df.drop("total", axis=1, inplace=True)
df

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,468
1,233,109


In [31]:
# fill_value
df = titanic.pivot_table("Counts", ["Survived"], ["Parch","Pclass"],\
                         aggfunc=np.sum, fill_value=0)
df

Parch,0,0,0,1,1,1,2,2,2,3,3,4,4,5,6
Pclass,1,2,3,1,2,3,1,2,3,2,3,1,3,3,3
Survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
0,64,86,295,10,8,35,5,3,32,0,2,1,3,4,1
1,99,48,86,21,24,20,16,13,11,2,1,0,0,1,0


In [36]:
# dropna
df = titanic.pivot_table("Counts", ["Parch","Pclass"], ["Survived"],\
                         aggfunc=np.sum, dropna=False, fill_value=0)
df

Unnamed: 0_level_0,Survived,0,1
Parch,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,64,99
0,2,86,48
0,3,295,86
1,1,10,21
1,2,8,24
1,3,35,20
2,1,5,16
2,2,3,13
2,3,32,11
3,1,0,0
