# Reshaping[33]
<h3 style="font-family:Courier;font-size:10px;">Converted for Python3</h3>

In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [2]:
#2x4 frame holding 0..7, with a named row index (city) and named columns (letter)
dframe1 = DataFrame(
    data=np.arange(8).reshape((2, 4)),
    index=pd.Index(['LA', 'SF'], name='city'),
    columns=pd.Index(list('ABCD'), name='letter'),
)

In [3]:
dframe1

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [4]:
#Use stack to pivot columns into rows
#The column labels ('letter') become the inner level of the row index,
#giving a Series indexed by (city, letter)
dframe_st = dframe1.stack()
dframe_st

city  letter
LA    A         0
      B         1
      C         2
      D         3
SF    A         4
      B         5
      C         6
      D         7
dtype: int32

In [5]:
#With no argument, unstack moves the innermost index level ('letter') back to the columns
dframe_st.unstack()

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [6]:
#Naming the level explicitly: unstacking 'letter' gives the same layout as the default
dframe_st.unstack('letter')

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [7]:
#Unstacking 'city' instead puts the cities in the columns and leaves 'letter' as the rows
dframe_st.unstack('city')

city,LA,SF
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,5
C,2,6
D,3,7


In [8]:
#Two Series whose labels only partly overlap (X and Y are shared)
ser1 = Series([0, 1, 2], index=list('QXY'))
ser2 = Series([4, 5, 6], index=list('XYZ'))
#Concatenate under outer keys; note that despite its name, dframe is a Series here
dframe = pd.concat(objs=[ser1, ser2], keys=['Alpha', 'Beta'])
dframe

Alpha  Q    0
       X    1
       Y    2
Beta   X    4
       Y    5
       Z    6
dtype: int64

In [9]:
#Move the inner labels (Q/X/Y/Z) into columns; label pairs missing from the
#Series show up as NaN, which is why the values are displayed as floats
dframe.unstack()

Unnamed: 0,Q,X,Y,Z
Alpha,0.0,1.0,2.0,
Beta,,4.0,5.0,6.0


In [10]:
#Unstacking introduces NaN for the missing label pairs;
#stacking back filters those null values out again
dframe.unstack().stack()

Alpha  Q    0.0
       X    1.0
       Y    2.0
Beta   X    4.0
       Y    5.0
       Z    6.0
dtype: float64

In [11]:
#Rebind dframe to its unstacked (wide) form for the cells below
#NOTE(review): this overwrites the original Series, so re-running this cell on its own
#would unstack the already-unstacked frame - presumably meant to run once, top to bottom
dframe = dframe.unstack()

In [12]:
dframe

Unnamed: 0,Q,X,Y,Z
Alpha,0.0,1.0,2.0,
Beta,,4.0,5.0,6.0


In [13]:
#By default stack drops null values....
dframe.stack()

Alpha  Q    0.0
       X    1.0
       Y    2.0
Beta   X    4.0
       Y    5.0
       Z    6.0
dtype: float64

In [14]:
#Unless we pass in the dropna argument, which keeps the NaN entries
#NOTE(review): recent pandas releases are replacing this stack implementation
#(future_stack=True), under which dropna must not be passed - confirm against the installed version
dframe.stack(dropna=False)

Alpha  Q    0.0
       X    1.0
       Y    2.0
       Z    NaN
Beta   Q    NaN
       X    4.0
       Y    5.0
       Z    6.0
dtype: float64

# Pivoting

In [15]:
##Rebuilding the instructor's example dataframe below
#pandas.util.testing is deprecated and makeTimeDataFrame is not available in
#current pandas, so the wide frame is built directly instead of through tm
#Create an unpivoted function
def unpivot(frame):
    """Convert a wide DataFrame into long (date, variable, value) records.

    Parameters
    ----------
    frame : DataFrame
        Wide frame of shape (N, K). Its index supplies the 'date' column and
        its column labels supply the 'variable' column.

    Returns
    -------
    DataFrame
        N*K rows ordered column by column, with the columns
        ['date', 'variable', 'value'].
    """
    N, K = frame.shape

    data = {
        #Column-major ('F') flattening lists all rows of the first column, then the second, ...
        'value': frame.values.ravel('F'),
        #... so each column label is repeated N times in a row ...
        'variable': np.asarray(frame.columns).repeat(N),
        #... and the whole index is cycled once per column
        'date': np.tile(np.asarray(frame.index), K),
    }

    return DataFrame(data, columns=['date', 'variable', 'value'])

#Set the DataFrame we'll be using: 3 business days x 4 columns (A-D) of
#standard-normal noise, seeded so a fresh run reproduces the same values
dframe = unpivot(DataFrame(np.random.RandomState(0).randn(3, 4),
                           index=pd.date_range('2000-01-03', periods=3, freq='B'),
                           columns=list('ABCD')))

In [16]:
dframe

Unnamed: 0,date,variable,value
0,2000-01-03,A,-0.018603
1,2000-01-04,A,-0.278774
2,2000-01-05,A,1.412917
3,2000-01-03,B,-0.170073
4,2000-01-04,B,0.071105
5,2000-01-05,B,-0.950416
6,2000-01-03,C,0.981459
7,2000-01-04,C,0.211507
8,2000-01-05,C,0.953566
9,2000-01-03,D,-0.445683


In [17]:
#Let's pivot the data
#Decide rows (index), columns and fill values; they are passed by keyword
#because pandas 2.0 no longer accepts them as positional arguments
dframe_piv = dframe.pivot(index='date', columns='variable', values='value')
dframe_piv

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,-0.018603,-0.170073,0.981459,-0.445683
2000-01-04,-0.278774,0.071105,0.211507,0.438972
2000-01-05,1.412917,-0.950416,0.953566,0.715738


# Duplicates in DataFrames[35]

In [18]:
#Finding duplicates in DataFrames
#Rows 1 and 4 are exact repeats of the row just before them
dframe = DataFrame({
    'key1': ['A', 'A', 'B', 'B', 'B'],
    'key2': [2, 2, 2, 3, 3],
})
dframe


Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [19]:
#How can we determine whether or not rows are duplicated
#duplicated() marks a row True when it repeats an earlier row in full;
#the first occurrence stays False
dframe.duplicated()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [20]:
#Get rid of duplicates
#Returns a new frame without the repeated rows; dframe itself is left unchanged
dframe.drop_duplicates()

Unnamed: 0,key1,key2
0,A,2
2,B,2
3,B,3


In [21]:
#Filter duplicates by a single column
#Only 'key1' is compared, so the first row seen for each key1 value is kept
dframe.drop_duplicates(['key1'])

Unnamed: 0,key1,key2
0,A,2
2,B,2


In [22]:
#Show original
dframe

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [23]:
#By default first value is taken from duplicates
#Last value can be taken instead with keep='last'
##The old "take_last=True" argument was deprecated and has since been removed
##from pandas (passing it raises a TypeError), so keep='last' is used here
dframe.drop_duplicates(['key1'],keep='last')



Unnamed: 0,key1,key2
1,A,2
4,B,3


In [24]:
#Below is the new, non-deprecated argument: keep='last'
#Again, keeps the last row instance for each key1 value
dframe.drop_duplicates(['key1'],keep='last')

Unnamed: 0,key1,key2
1,A,2
4,B,3


# Mapping[36]

## Mapping will allow us to add columns

In [25]:
#Three cities and their altitudes, one row per city
dframe = DataFrame({
    'city': ['Alma', 'Brian Head', 'Fox Park'],
    'altitude': [3158, 3000, 2762],
})
dframe

Unnamed: 0,altitude,city
0,3158,Alma
1,3000,Brian Head
2,2762,Fox Park


In [26]:
#Add a column for each state the city is in using mapping
#state_map is the lookup table: city name -> state name
state_map = dict([
    ('Alma', 'Colorado'),
    ('Brian Head', 'Utah'),
    ('Fox Park', 'Wyoming'),
])

In [27]:
#employ mapping
#Each city is looked up in state_map and the result is stored as a new 'state' column
#(this adds the column to dframe in place)
dframe['state'] = dframe['city'].map(state_map)
dframe

Unnamed: 0,altitude,city,state
0,3158,Alma,Colorado
1,3000,Brian Head,Utah
2,2762,Fox Park,Wyoming


# Replace Values[37]

In [28]:
#The run 1,2,3,4 repeated twice, so every value occurs at two positions
ser1 = Series([1, 2, 3, 4] * 2)
ser1

0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64

In [29]:
#!!Select value you want replaced, and the new value to replace it with
#Replacing with NaN turns the result into a float Series; ser1 itself is not modified
ser1.replace(1,np.nan)

0    NaN
1    2.0
2    3.0
3    4.0
4    NaN
5    2.0
6    3.0
7    4.0
dtype: float64

In [30]:
#We can input lists for multiple entries
#The lists are matched by position: 1 -> 100 and 4 -> 400
ser1.replace([1,4],[100,400])

0    100
1      2
2      3
3    400
4    100
5      2
6      3
7    400
dtype: int64

In [31]:
#Input dictionary, key will be replaced by value
#Here every 4 becomes NaN
ser1.replace({4:np.nan})

0    1.0
1    2.0
2    3.0
3    NaN
4    1.0
5    2.0
6    3.0
7    NaN
dtype: float64