# Kombinera dataset

Pandas har flera sätt att kombinera dataset

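- Concat: lägg ihop DataFrames efter varandra längs en axel (rader eller kolumner)
- Join: kombinera kolumner från två DataFrames baserat på deras index
- Merge: kombinera DataFrames baserat på gemensamma kolumner eller index (motsvarar JOIN i SQL)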

In [1]:
import pandas as pd
from helpers import sample_df, hdisplay, nowrap_display



## Concatenate

In [2]:
# Two sample frames with the same row labels (0-3) and column labels (A-D);
# the prefix only changes the cell values, which makes it easy to see where
# each value came from once the frames are combined.
left = sample_df("A0", "D3", prefix="L_")
right = sample_df("A0", "D3", prefix="R_")

# hdisplay is a local helper; presumably it renders the frames next to each
# other under the given titles -- confirm in helpers.py.
hdisplay([left, right], ["left", "right"])

Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3

Unnamed: 0,A,B,C,D
0,R_A0,R_B0,R_C0,R_D0
1,R_A1,R_B1,R_C1,R_D1
2,R_A2,R_B2,R_C2,R_D2
3,R_A3,R_B3,R_C3,R_D3


In [3]:
# axis="columns" places the frames side by side, matching rows by index label;
# the default, axis="index", stacks them on top of each other instead.
pd.concat([left, right], axis="columns")

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,L_A0,L_B0,L_C0,L_D0,R_A0,R_B0,R_C0,R_D0
1,L_A1,L_B1,L_C1,L_D1,R_A1,R_B1,R_C1,R_D1
2,L_A2,L_B2,L_C2,L_D2,R_A2,R_B2,R_C2,R_D2
3,L_A3,L_B3,L_C3,L_D3,R_A3,R_B3,R_C3,R_D3


In [4]:
# Four variants to compare. Jupyter only renders the value of the last
# expression in a cell, so the first three results are computed and then
# discarded; move a line to the bottom of the cell to see its result.

# Stack the rows, then replace the repeated 0-3 labels with a fresh 0..n-1 index.
pd.concat([left, right]).reset_index(drop=True)
# Stack the rows, then use column C as the index.
pd.concat([left, right]).set_index("C")
# Side by side, with an extra outer column level naming the source frame.
pd.concat([left, right], axis="columns", keys=["left", "right"])
# Stack the rows and renumber the index in one step (this is the result shown).
pd.concat([left, right], ignore_index=True)

Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3
4,R_A0,R_B0,R_C0,R_D0
5,R_A1,R_B1,R_C1,R_D1
6,R_A2,R_B2,R_C2,R_D2
7,R_A3,R_B3,R_C3,R_D3


In [5]:
# Frames that only partly overlap: left covers columns A-D / rows 0-3, while
# right covers columns C-F / rows 2-5 (see the rendered tables).
left = sample_df("A0", "D3", prefix="L_")
right = sample_df("C2", "F5", prefix="R_")

hdisplay([left, right], ["left", "right"])

Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3

Unnamed: 0,C,D,E,F
2,R_C2,R_D2,R_E2,R_F2
3,R_C3,R_D3,R_E3,R_F3
4,R_C4,R_D4,R_E4,R_F4
5,R_C5,R_D5,R_E5,R_F5


In [6]:
# concat keeps the union of the row labels by default (join="outer"), so a
# label that is missing from one frame gets NaN in that frame's columns.
# Both inputs contribute their own C and D columns; nothing is merged by name.
pd.concat([left, right], axis="columns")

Unnamed: 0,A,B,C,D,C.1,D.1,E,F
0,L_A0,L_B0,L_C0,L_D0,,,,
1,L_A1,L_B1,L_C1,L_D1,,,,
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3
4,,,,,R_C4,R_D4,R_E4,R_F4
5,,,,,R_C5,R_D5,R_E5,R_F5


In [7]:
# keys= adds an outer index level that labels which input each row came from,
# so the stacked result can be split back apart by source.
df = pd.concat([left, right], keys=["left", "right"])
# Select the rows that came from `left`; columns E and F only exist in
# `right`, so they are NaN for these rows.
df.loc['left']

Unnamed: 0,A,B,C,D,E,F
0,L_A0,L_B0,L_C0,L_D0,,
1,L_A1,L_B1,L_C1,L_D1,,
2,L_A2,L_B2,L_C2,L_D2,,
3,L_A3,L_B3,L_C3,L_D3,,


## Join
- Tänk på att vänster/höger har betydelse i själva syntaxen (t.ex. `how="left"` / `how="right"`), inte bara i våra tillfälliga variabelnamn.

In [8]:
# Same partly overlapping frames as before, but every column name gets a
# _left / _right suffix so the two frames share no column labels.
# DataFrame.join raises on overlapping column names unless lsuffix/rsuffix
# are supplied, so the renaming keeps the join call below minimal.
left = sample_df("A0", "D3", prefix="L_").add_suffix("_left")
right = sample_df("C2", "F5", prefix="R_").add_suffix("_right")

hdisplay([left, right], ["left", "right"])

Unnamed: 0,A_left,B_left,C_left,D_left
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3

Unnamed: 0,C_right,D_right,E_right,F_right
2,R_C2,R_D2,R_E2,R_F2
3,R_C3,R_D3,R_E3,R_F3
4,R_C4,R_D4,R_E4,R_F4
5,R_C5,R_D5,R_E5,R_F5


In [9]:
# join aligns the two frames on their row index. how="right" keeps exactly the
# index labels of `right` (2-5): left-only rows 0-1 are dropped and rows 4-5
# get NaN in the left columns. Other options:
#   "left"  (default) keep the calling frame's labels
#   "inner" keep only labels present in both frames
#   "outer" keep labels from either frame
#   "cross" pair every left row with every right row (len(left) * len(right) rows)
left.join(right, how="right")

Unnamed: 0,A_left,B_left,C_left,D_left,C_right,D_right,E_right,F_right
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3
4,,,,,R_C4,R_D4,R_E4,R_F4
5,,,,,R_C5,R_D5,R_E5,R_F5


## Merge
- Lite som join, men med fler valmöjligheter för hur DataFrames ska sättas ihop (t.ex. vilka kolumner som används som nycklar)

- how= finns här också men här är istället default "inner"

- Lätt att blanda ihop med SQL-syntax: det som heter `merge` i pandas motsvarar `JOIN` i SQL

In [10]:
# left: the A-D sample frame extended with an F column (F8..F11).
# right: a frame spanning columns F-J, so F is the only column the two share
# and only the values F10 and F11 occur on both sides.
left = sample_df("A0", "D3").assign(F=["F8", "F9", "F10", "F11"])
right = sample_df("F10", "J13")
hdisplay([left, right], ["left", "right"])

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,F8
1,A1,B1,C1,D1,F9
2,A2,B2,C2,D2,F10
3,A3,B3,C3,D3,F11

Unnamed: 0,F,G,H,I,J
10,F10,G10,H10,I10,J10
11,F11,G11,H11,I11,J11
12,F12,G12,H12,I12,J12
13,F13,G13,H13,I13,J13


In [11]:
# With no on=/left_on=/right_on=, merge matches on the columns the two frames
# have in common -- here only F. how="outer" keeps unmatched rows from both
# sides (filled with NaN), and the result gets a fresh 0..n-1 index.
left.merge(right, how="outer")

Unnamed: 0,A,B,C,D,F,G,H,I,J
0,A0,B0,C0,D0,F8,,,,
1,A1,B1,C1,D1,F9,,,,
2,A2,B2,C2,D2,F10,G10,H10,I10,J10
3,A3,B3,C3,D3,F11,G11,H11,I11,J11
4,,,,,F12,G12,H12,I12,J12
5,,,,,F13,G13,H13,I13,J13


## Lite riktiga exempel

In [12]:
# Load the two example tables. Employees reference their department by name
# in the "department" column; departments carry the same name as
# "department_name" (see the rendered tables).
employees = pd.read_json("../Data/employees.json")
departments = pd.read_json("../Data/departments.json")
display(employees)
display(departments)

Unnamed: 0,first_name,last_name,job_title,salary,department
0,John,Doe,Sales Director,120000,Sales
1,Jane,Smith,HR Coordinator,60000,Human Resources
2,Michael,Johnson,Software Engineer,110000,IT
3,Sarah,Williams,Marketing Specialist,75000,Sales
4,David,Brown,HR Manager,90000,Human Resources
5,Emily,Davis,IT Support Specialist,55000,IT
6,Jacob,Wilson,Sales Representative,80000,Sales
7,Olivia,Moore,Marketing Manager,100000,Sales
8,Ethan,Lee,Financial Analyst,85000,Sales
9,Sophia,Taylor,HR Assistant,50000,Human Resources


Unnamed: 0,department_name,department_head,location,office_number,budget
0,Sales,Sarah Williams,New York,101,1000000
1,Human Resources,David Brown,Chicago,202,800000
2,IT,Michael Johnson,San Francisco,303,1200000


In [13]:
# Attach the department details to every employee. The key column is named
# differently on each side, hence left_on/right_on. how="left" keeps every
# employee row and sort=True orders the result by the join key. The
# right-hand key column is dropped afterwards because it duplicates
# "department".
df = (
    employees
    .merge(
        departments,
        how="left",
        left_on="department",
        right_on="department_name",
        sort=True,
    )
    .drop(columns="department_name")
)
nowrap_display(df)

Unnamed: 0,first_name,last_name,job_title,salary,department,department_head,location,office_number,budget
0,Jane,Smith,HR Coordinator,60000,Human Resources,David Brown,Chicago,202,800000
1,David,Brown,HR Manager,90000,Human Resources,David Brown,Chicago,202,800000
2,Sophia,Taylor,HR Assistant,50000,Human Resources,David Brown,Chicago,202,800000
3,Michael,Johnson,Software Engineer,110000,IT,Michael Johnson,San Francisco,303,1200000
4,Emily,Davis,IT Support Specialist,55000,IT,Michael Johnson,San Francisco,303,1200000
5,John,Doe,Sales Director,120000,Sales,Sarah Williams,New York,101,1000000
6,Sarah,Williams,Marketing Specialist,75000,Sales,Sarah Williams,New York,101,1000000
7,Jacob,Wilson,Sales Representative,80000,Sales,Sarah Williams,New York,101,1000000
8,Olivia,Moore,Marketing Manager,100000,Sales,Sarah Williams,New York,101,1000000
9,Ethan,Lee,Financial Analyst,85000,Sales,Sarah Williams,New York,101,1000000


In [14]:
# Load the user accounts table.
users = pd.read_json("../Data/users.json")
# NOTE(review): the rendered table includes password hashes, e-mail addresses
# and phone numbers. This looks like mock data (mockcompany.com), but avoid
# displaying such columns when working with real accounts.
users

Unnamed: 0,username,password,email,phone,host
0,johdoe,7e684c07e48bf68c0181306c2dab1a0e2b8298e9a59a37...,john.doe@mockcompany.com,(212) 591-7254,192.168.1.1
1,jansmi,a6ab128c25d59951f9eb29a0b63806a0a2df924a384f61...,jane.smith@mockcompany.com,(312) 623-3364,192.168.1.2
2,micjoh,9f01f5cfa05a6b14367d4d0e6c788c5977f094ad4e2381...,michael.johnson@mockcompany.com,(415) 602-6872,192.168.1.3
3,sarwil,fdb1e5c757d39e5f27065f12f27fbc94d50f47a66e8cdd...,sarah.williams@mockcompany.com,(212) 623-6568,192.168.1.1
4,davbro,1d99b7d777fb1b04ac1bea3d4b04a4ea5f654f9b304da4...,david.brown@mockcompany.com,(312) 709-8933,192.168.1.2
5,emidav,1953b5a2e8cc313b6db9f1708e9d0b84a84c6b3a60706e...,emily.davis@mockcompany.com,(415) 775-6149,192.168.1.3
6,jacwil,8d5ee4f5e3a5e9202a2f21d3b7c30a10c9c51d0133a731...,jacob.wilson@mockcompany.com,(212) 285-2027,192.168.1.1
7,olimoo,79cfd142536a5d27528d3306ef28e26d038c625764167b...,olivia.moore@mockcompany.com,(212) 335-5297,192.168.1.1
8,ethlee,5f51c17c9b81d68a1c9b99f1a8627a457bd0a4048d594c...,ethan.lee@mockcompany.com,(212) 509-8922,192.168.1.1
9,soptay,a429b013ebfb9f86a3cb8cf2311e2bfa3b00b238d439eb...,sophia.taylor@mockcompany.com,(312) 826-6711,192.168.1.2


In [15]:
# Derive each employee's username: the first three letters of the first name
# followed by the first three letters of the last name, lower-cased
# (e.g. John Doe -> "johdoe"), matching the usernames in the users table.
#
# assign() appends "username" as the last column on the first run and simply
# overwrites it on a re-run. The previous df.insert(9, ...) raised ValueError
# when the cell was executed a second time and hard-coded the column position.
# The vectorised .str accessor replaces the per-row lambdas and propagates
# missing names as NaN instead of raising.
df = df.assign(
    username=(
        df["first_name"].str[:3].str.lower()
        + df["last_name"].str[:3].str.lower()
    )
)

df

Unnamed: 0,first_name,last_name,job_title,salary,department,department_head,location,office_number,budget,username
0,Jane,Smith,HR Coordinator,60000,Human Resources,David Brown,Chicago,202,800000,jansmi
1,David,Brown,HR Manager,90000,Human Resources,David Brown,Chicago,202,800000,davbro
2,Sophia,Taylor,HR Assistant,50000,Human Resources,David Brown,Chicago,202,800000,soptay
3,Michael,Johnson,Software Engineer,110000,IT,Michael Johnson,San Francisco,303,1200000,micjoh
4,Emily,Davis,IT Support Specialist,55000,IT,Michael Johnson,San Francisco,303,1200000,emidav
5,John,Doe,Sales Director,120000,Sales,Sarah Williams,New York,101,1000000,johdoe
6,Sarah,Williams,Marketing Specialist,75000,Sales,Sarah Williams,New York,101,1000000,sarwil
7,Jacob,Wilson,Sales Representative,80000,Sales,Sarah Williams,New York,101,1000000,jacwil
8,Olivia,Moore,Marketing Manager,100000,Sales,Sarah Williams,New York,101,1000000,olimoo
9,Ethan,Lee,Financial Analyst,85000,Sales,Sarah Williams,New York,101,1000000,ethlee


In [17]:
# Look up each employee's account details via the derived username. The left
# merge keeps every row of df; the merged frame is only displayed here and is
# not stored in a variable.
nowrap_display(df.merge(users, on="username", how="left"))

Unnamed: 0,first_name,last_name,job_title,salary,department,department_head,location,office_number,budget,username,password,email,phone,host
0,Jane,Smith,HR Coordinator,60000,Human Resources,David Brown,Chicago,202,800000,jansmi,a6ab128c25d59951f9eb29a0b63806a0a2df924a384f61ed6a3f5b58d10b197a,jane.smith@mockcompany.com,(312) 623-3364,192.168.1.2
1,David,Brown,HR Manager,90000,Human Resources,David Brown,Chicago,202,800000,davbro,1d99b7d777fb1b04ac1bea3d4b04a4ea5f654f9b304da4a4d2c6c5f5f3a7c5f5,david.brown@mockcompany.com,(312) 709-8933,192.168.1.2
2,Sophia,Taylor,HR Assistant,50000,Human Resources,David Brown,Chicago,202,800000,soptay,a429b013ebfb9f86a3cb8cf2311e2bfa3b00b238d439ebd0c24b1714de4d75f4,sophia.taylor@mockcompany.com,(312) 826-6711,192.168.1.2
3,Michael,Johnson,Software Engineer,110000,IT,Michael Johnson,San Francisco,303,1200000,micjoh,9f01f5cfa05a6b14367d4d0e6c788c5977f094ad4e2381d632692c27b8a5c5d7,michael.johnson@mockcompany.com,(415) 602-6872,192.168.1.3
4,Emily,Davis,IT Support Specialist,55000,IT,Michael Johnson,San Francisco,303,1200000,emidav,1953b5a2e8cc313b6db9f1708e9d0b84a84c6b3a60706ea8c1db8f6f24e18e62,emily.davis@mockcompany.com,(415) 775-6149,192.168.1.3
5,John,Doe,Sales Director,120000,Sales,Sarah Williams,New York,101,1000000,johdoe,7e684c07e48bf68c0181306c2dab1a0e2b8298e9a59a37b9f30d327c1b5ed9cd,john.doe@mockcompany.com,(212) 591-7254,192.168.1.1
6,Sarah,Williams,Marketing Specialist,75000,Sales,Sarah Williams,New York,101,1000000,sarwil,fdb1e5c757d39e5f27065f12f27fbc94d50f47a66e8cddcb59c0b1d18b1e2369,sarah.williams@mockcompany.com,(212) 623-6568,192.168.1.1
7,Jacob,Wilson,Sales Representative,80000,Sales,Sarah Williams,New York,101,1000000,jacwil,8d5ee4f5e3a5e9202a2f21d3b7c30a10c9c51d0133a731d6e542536a8d63d1f3,jacob.wilson@mockcompany.com,(212) 285-2027,192.168.1.1
8,Olivia,Moore,Marketing Manager,100000,Sales,Sarah Williams,New York,101,1000000,olimoo,79cfd142536a5d27528d3306ef28e26d038c625764167b4e8d44b35ab02b4df6,olivia.moore@mockcompany.com,(212) 335-5297,192.168.1.1
9,Ethan,Lee,Financial Analyst,85000,Sales,Sarah Williams,New York,101,1000000,ethlee,5f51c17c9b81d68a1c9b99f1a8627a457bd0a4048d594c93e9e20c8f98f26e21,ethan.lee@mockcompany.com,(212) 509-8922,192.168.1.1


In [19]:
# Independent copy of df (employees + department details + username),
# presumably so that later experiments on new_df leave df untouched.
new_df = df.copy()

In [25]:
# Mean salary per department. The additional grouping keys (head, location,
# office number, budget) take a single value within each department in this
# data, so they do not split the groups further; they are carried into the
# result's index so the department details appear next to the mean.
(
    new_df
    .groupby(["department", "department_head", "location", "office_number", "budget"])
    ["salary"]
    .mean()
)

department       department_head  location       office_number  budget 
Human Resources  David Brown      Chicago        202            800000     66666.666667
IT               Michael Johnson  San Francisco  303            1200000    82500.000000
Sales            Sarah Williams   New York       101            1000000    92000.000000
Name: salary, dtype: float64