Notes from the DataCamp course "Manipulating DataFrames with pandas": https://www.datacamp.com/courses/manipulating-dataframes-with-pandas
# 1. Pivoting

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the first trial data set, then echo it as the cell output.
trials_csv = 'datasets/trials_01.csv'
trial1 = pd.read_csv(trials_csv)
trial1

Unnamed: 0,id,treatment,gender,response
0,1,A,F,5
1,2,A,M,3
2,3,B,F,8
3,4,B,M,9


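## 1) Reshaping by pivoting: df.pivot( )
- index = 'xx': column whose unique values become the row labels
- columns = 'xx': column whose unique values become the column labels
- values = 'xx': column used to fill the cells (if omitted, all remaining columns are used)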

In [4]:
# One row per treatment, one column per gender; each cell holds the response.
trial1.pivot(
    index='treatment',
    columns='gender',
    values='response',
)

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,3
B,8,9


In [5]:
# Without `values`, every remaining column (id, response) is pivoted,
# which produces two-level column labels.
trial1.pivot(
    index='treatment',
    columns='gender',
)

Unnamed: 0_level_0,id,id,response,response
gender,F,M,F,M
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,1,2,5,3
B,3,4,8,9


# Practice 1

In [79]:
# Load the users data set and display it.
users_csv = 'datasets/users.csv'
users = pd.read_csv(users_csv)
users

Unnamed: 0.1,Unnamed: 0,weekday,city,visitors,signups
0,0,Sun,Austin,139,7
1,1,Sun,Dallas,237,12
2,2,Mon,Austin,326,3
3,3,Mon,Dallas,456,5


In [80]:
# Drop the leftover 'Unnamed: 0' column (it just repeats the row numbers).
# errors='ignore' keeps this cell safe to re-run after the column is gone,
# whereas `del users['Unnamed: 0']` raises KeyError on the second run.
users = users.drop('Unnamed: 0', axis=1, errors='ignore')

In [81]:
# Visitors laid out as a weekday x city grid.
users.pivot(
    index='weekday',
    columns='city',
    values='visitors',
)

city,Austin,Dallas
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,326,456
Sun,139,237


In [82]:
# Signups laid out as a weekday x city grid.
signup_grid = dict(index='weekday', columns='city', values='signups')
users.pivot(**signup_grid)

city,Austin,Dallas
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,3,5
Sun,7,12


In [83]:
# No `values` given: both visitors and signups are pivoted at once,
# so the columns become (measure, city) pairs.
users.pivot(
    index='weekday',
    columns='city',
)

Unnamed: 0_level_0,visitors,visitors,signups,signups
city,Austin,Dallas,Austin,Dallas
weekday,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Mon,326,456,3,5
Sun,139,237,7,12


# 2. Stacking & unstacking DataFrames

In [26]:
# Re-read the raw trial data so this section starts from the flat table.
raw_trials_path = 'datasets/trials_01.csv'
trial1 = pd.read_csv(raw_trials_path)
trial1

Unnamed: 0,id,treatment,gender,response
0,1,A,F,5
1,2,A,M,3
2,3,B,F,8
3,4,B,M,9


## 1) Build a multi-level (hierarchical) row index: df.set_index(['xx', 'xx'])

In [27]:
# Move treatment and gender into a two-level row index.
index_levels = ['treatment', 'gender']
trial1 = trial1.set_index(index_levels)
trial1

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
treatment,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
A,F,1,5
A,M,2,3
B,F,3,8
B,M,4,9


## 2) Unstack: df.unstack(level='xx') moves an index level into the columns; the level can be given by name or by position (here level='gender' is the same as level=1)

In [39]:
# Pivot the 'gender' index level (selected by name) into the columns.
level_name = 'gender'
trial1.unstack(level=level_name)

Unnamed: 0_level_0,id,id,response,response
gender,F,M,F,M
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,1,2,5,3
B,3,4,8,9


In [34]:
# Same operation, selecting the index level by position instead of by name.
level_position = 1
trial1.unstack(level=level_position)

Unnamed: 0_level_0,id,id,response,response
gender,F,M,F,M
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,1,2,5,3
B,3,4,8,9


## 3) Stacking DataFrames

In [48]:
# Keep the wide (unstacked) table so it can be stacked back in the next cell.
trial1_df = trial1.unstack(
    level='gender',
)
trial1_df

Unnamed: 0_level_0,id,id,response,response
gender,F,M,F,M
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,1,2,5,3
B,3,4,8,9


In [50]:
# Move the 'gender' column level back into the row index.
nice = trial1_df.stack(
    level='gender',
)
nice

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
treatment,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
A,F,1,5
A,M,2,3
B,F,3,8
B,M,4,9


## 4) Swapping levels: df.swaplevel(0, 1)

In [52]:
# Exchange the outer (0) and inner (1) index levels; row order is unchanged.
outer, inner = 0, 1
nice.swaplevel(outer, inner)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
gender,treatment,Unnamed: 2_level_1,Unnamed: 3_level_1
F,A,1,5
M,A,2,3
F,B,3,8
M,B,4,9


In [53]:
# Swap the two index levels, then sort so rows are grouped by the new outer level.
swapped = nice.swaplevel(0, 1)
swapped.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
gender,treatment,Unnamed: 2_level_1,Unnamed: 3_level_1
F,A,1,5
F,B,3,8
M,A,2,3
M,B,4,9


# 3. Melting DataFrames: pd.melt(df)

In [90]:
# Start again from the raw file and show the wide (pivoted) layout for reference.
trial1 = pd.read_csv('datasets/trials_01.csv')
wide_layout = dict(index='treatment', columns='gender', values='response')
trial1.pivot(**wide_layout)

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,3
B,8,9


In [95]:
# With no id_vars, every column is unpivoted into (variable, value) pairs.
pd.melt(
    trial1,
)

Unnamed: 0,variable,value
0,id,1
1,id,2
2,id,3
3,id,4
4,treatment,A
5,treatment,A
6,treatment,B
7,treatment,B
8,gender,F
9,gender,M
10,gender,F
11,gender,M
12,response,5
13,response,3
14,response,8
15,response,9


In [112]:
# Fresh copy of the trial data, shown in its pivoted form.
trial1 = pd.read_csv('datasets/trials_01.csv')
trial1.pivot(
    index='treatment',
    columns='gender',
    values='response',
)

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,3
B,8,9


In [113]:
# Remove the 'id' column so only treatment, gender and response remain.
# errors='ignore' keeps this cell safe to re-run after the column is gone,
# whereas `del trial1['id']` raises KeyError on the second run.
trial1 = trial1.drop('id', axis=1, errors='ignore')

In [114]:
# Keep treatment and gender as identifiers; the remaining column is unpivoted.
identifier_columns = ['treatment', 'gender']
pd.melt(trial1, id_vars=identifier_columns)

Unnamed: 0,treatment,gender,variable,value
0,A,F,response,5
1,A,M,response,3
2,B,F,response,8
3,B,M,response,9


In [125]:
# Load the second trial data set, which is stored in wide form (one column per gender).
trial2_csv = 'datasets/trials_02.csv'
trial2 = pd.read_csv(trial2_csv)
trial2

Unnamed: 0,treatment,F,M
0,A,5,3
1,B,8,9


In [126]:
# Unpivot only the listed value columns, keeping treatment as the identifier.
pd.melt(
    trial2,
    id_vars=['treatment'],
    value_vars=['F', 'M'],
)

Unnamed: 0,treatment,variable,value
0,A,F,5
1,B,F,8
2,A,M,3
3,B,M,9


In [127]:
# Same unpivot, but give the generated columns meaningful names
# instead of the default 'variable' / 'value'.
pd.melt(
    trial2,
    id_vars=['treatment'],
    var_name='gender',
    value_name='response',
)

Unnamed: 0,treatment,gender,response
0,A,F,5
1,B,F,8
2,A,M,3
3,B,M,9


# Practice 3

In [139]:
# Wide table of visitors: weekdays as rows, cities as columns.
new_users = users.pivot(
    index='weekday',
    columns='city',
    values='visitors',
)
new_users

city,Austin,Dallas
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,326,456
Sun,139,237


In [145]:
# Turn the weekday index back into an ordinary column so melt can use it as an id.
good = new_users.reset_index(
)
good

city,weekday,Austin,Dallas
0,Mon,326,456
1,Sun,139,237


In [146]:
# Back to long form; the variable column picks up the name 'city'
# from the column axis, and the values are labelled 'visitors'.
pd.melt(
    good,
    id_vars=['weekday'],
    value_name='visitors',
)

Unnamed: 0,weekday,city,visitors
0,Mon,Austin,326
1,Sun,Austin,139
2,Mon,Dallas,456
3,Sun,Dallas,237


In [148]:
# Long ("skinny") form: one row per weekday/city/measure combination.
key_columns = ['weekday', 'city']
skinny = pd.melt(users, id_vars=key_columns)
skinny

Unnamed: 0,weekday,city,variable,value
0,Sun,Austin,visitors,139
1,Sun,Dallas,visitors,237
2,Mon,Austin,visitors,326
3,Mon,Dallas,visitors,456
4,Sun,Austin,signups,7
5,Sun,Dallas,signups,12
6,Mon,Austin,signups,3
7,Mon,Dallas,signups,5


In [152]:
# Index the users table by (city, weekday).
users_index = ['city', 'weekday']
users2 = users.set_index(users_index)
users2

Unnamed: 0_level_0,Unnamed: 1_level_0,visitors,signups
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1
Austin,Sun,139,7
Dallas,Sun,237,12
Austin,Mon,326,3
Dallas,Mon,456,5


In [153]:
# Melting an indexed frame: the (city, weekday) index does not appear in
# the result, only the variable/value pairs.
pd.melt(
    users2,
    col_level=0,
)

Unnamed: 0,variable,value
0,visitors,139
1,visitors,237
2,visitors,326
3,visitors,456
4,signups,7
5,signups,12
6,signups,3
7,signups,5


# 4. Pivot tables: df.pivot_table( )
- df.pivot_table( ) handles multiple values for the same index/column pair by combining them with a reduction (aggfunc; the default is the mean).
- df.pivot( ) requires every index/column pair to be unique and raises an error on duplicates.

In [155]:
# Load the third trial data set; it repeats some treatment/gender pairs.
trial3_csv = 'datasets/trials_03.csv'
trial3 = pd.read_csv(trial3_csv)
trial3

Unnamed: 0,id,treatment,gender,response
0,1,A,F,5
1,2,A,M,3
2,3,A,M,8
3,4,A,F,9
4,5,B,F,1
5,6,B,M,8
6,7,B,F,4
7,8,B,F,6


In [159]:
# Duplicate treatment/gender pairs are reduced with the default aggregation (mean).
trial3.pivot_table(
    index='treatment',
    columns='gender',
    values='response',
)

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,7.0,5.5
B,3.666667,8.0


In [160]:
# aggfunc='count' reports how many responses fall in each treatment/gender cell.
trial3.pivot_table(
    index='treatment',
    columns='gender',
    values='response',
    aggfunc='count',
)

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2,2
B,3,1


# Practice 4

In [161]:
# Display the users table as it stands before building pivot tables from it.
users

Unnamed: 0,weekday,city,visitors,signups
0,Sun,Austin,139,7
1,Sun,Dallas,237,12
2,Mon,Austin,326,3
3,Mon,Dallas,456,5


In [162]:
# Summarise every numeric column by weekday (rows) and city (columns).
users.pivot_table(
    index='weekday',
    columns='city',
)

Unnamed: 0_level_0,signups,signups,visitors,visitors
city,Austin,Dallas,Austin,Dallas
weekday,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Mon,3,5,326,456
Sun,7,12,139,237


In [163]:
# Number of entries per weekday for each remaining column.
users.pivot_table(
    index='weekday',
    aggfunc='count',
)

Unnamed: 0_level_0,city,signups,visitors
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mon,2,2,2
Sun,2,2,2


In [164]:
# Same table using the built-in len as the reducer; here it matches 'count'.
row_counter = len
users.pivot_table(index='weekday', aggfunc=row_counter)

Unnamed: 0_level_0,city,signups,visitors
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mon,2,2,2
Sun,2,2,2


In [167]:
# Total signups and visitors per weekday.
# Use the string 'sum' rather than the built-in sum: recent pandas warns
# when handed the built-in. The numeric columns are named explicitly so
# the text column 'city' is never passed to the aggregation.
users.pivot_table(
    index='weekday',
    values=['signups', 'visitors'],
    aggfunc='sum',
)

Unnamed: 0_level_0,signups,visitors
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,8,782
Sun,19,376


In [170]:
# margins=True appends an 'All' row holding the grand totals.
# Use the string 'sum' rather than the built-in sum: recent pandas warns
# when handed the built-in. The numeric columns are named explicitly so
# the text column 'city' is never passed to the aggregation.
users.pivot_table(
    index='weekday',
    values=['signups', 'visitors'],
    aggfunc='sum',
    margins=True,
)

Unnamed: 0_level_0,signups,visitors
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,8,782
Sun,19,376
All,27,1158
