In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Notebook-wide setup: fix the legacy NumPy seed so random draws repeat,
# set the default figure size, and print NumPy floats compactly.
np.random.seed(12345)
plt.rcParams['figure.figsize'] = (10, 6)
np.set_printoptions(suppress=True, precision=4)

## Explode dataframe

### 1. Explode a list to multiple columns

- First convert the `pd.Series` (i.e. the dataframe column) to a list; 
- Then build a dataframe from the list

In [3]:
# Toy frame whose 'B' column holds Python lists of differing lengths.
data1 = {
    'A': list('abcd'),
    'B': [
        [1, 1],
        [2, 2, 2],
        [3],
        [4, 4],
    ],
}
df1 = pd.DataFrame(data=data1)
df1

Unnamed: 0,A,B
0,a,"[1, 1]"
1,b,"[2, 2, 2]"
2,c,[3]
3,d,"[4, 4]"


In [4]:
# Spread each list in 'B' across columns, one column per list position;
# shorter lists are padded with NaN.
# - The original index is reused so a later column-wise concat with df1
#   aligns row by row even if df1 does not have a default RangeIndex.
# - Column names B_1..B_n are derived from the widest list rather than being
#   hard-coded, so the cell keeps working when the list lengths change.
temp1 = pd.DataFrame(df1['B'].to_list(), index=df1.index)
temp1.columns = [f'B_{pos}' for pos in range(1, temp1.shape[1] + 1)]
temp1

Unnamed: 0,B_1,B_2,B_3
0,1,1.0,
1,2,2.0,2.0
2,3,,
3,4,4.0,


In [5]:
# Put the expanded columns next to the original frame, then discard the
# list-valued column they were built from.
pd.concat([df1, temp1], axis='columns').drop(columns='B')

Unnamed: 0,A,B_1,B_2,B_3
0,a,1,1.0,
1,b,2,2.0,2.0
2,c,3,,
3,d,4,4.0,


In [6]:
# Toy frame whose 'hobbies' column packs several values into one
# ';'-separated string.
df2 = pd.DataFrame({
    'name': ['Han', 'Leia', 'Luke'],
    'hobbies': [
        'Art;Soccer;Writing',
        'Art;Baking;Golf;Singing',
        'Baking;Writing',
    ],
})
df2

Unnamed: 0,name,hobbies
0,Han,Art;Soccer;Writing
1,Leia,Art;Baking;Golf;Singing
2,Luke,Baking;Writing


If the column holds strings, first use the string method `str.split()` to split each string into a list. Pass `expand=True` to expand that list into multiple columns.

In [7]:
# Split every hobbies string on ';' and expand the pieces into separate
# columns (rows with fewer pieces are padded with missing values).
hobbies = df2['hobbies']
temp2 = hobbies.str.split(pat=';', expand=True)
temp2

Unnamed: 0,0,1,2,3
0,Art,Soccer,Writing,
1,Art,Baking,Golf,Singing
2,Baking,Writing,,


In [8]:
# Show the split-out hobby columns side by side with the original frame.
pd.concat([df2, temp2], axis='columns')

Unnamed: 0,name,hobbies,0,1,2,3
0,Han,Art;Soccer;Writing,Art,Soccer,Writing,
1,Leia,Art;Baking;Golf;Singing,Art,Baking,Golf,Singing
2,Luke,Baking;Writing,Baking,Writing,,


### 2. Explode a list to multiple rows

Use the `explode()` method. Before pandas 1.3 it could only explode one column at a time; since 1.3 it also accepts a list of columns.

In [9]:
# One output row per list element; the original index label is repeated
# for every element that came from the same row.
df1.explode(column='B')

Unnamed: 0,A,B
0,a,1
0,a,1
1,b,2
1,b,2
1,b,2
2,c,3
3,d,4
3,d,4


### 3. Explode a dictionary to multiple columns

If the column holds dictionaries, there are two ways to expand it into multiple columns.

1. Use the same approach as for a list column: convert the column (`pd.Series`) to a list and then create a dataframe from it.
2. Use `.apply(pd.Series)`. This method seems to be slow.

In [10]:
# Toy frame whose 'B' column holds dictionaries with partly overlapping keys.
data3 = {
    'A': list('abcd'),
    'B': [
        {'m1': 0, 'm2': 0},
        {'m0': 1, 'm1': 1, 'm2': 1},
        {'m1': 2, 'm2': 2, 'm3': 2},
        {'m1': 3, 'm2': 3},
    ],
}
df3 = pd.DataFrame(data=data3)
df3

Unnamed: 0,A,B
0,a,"{'m1': 0, 'm2': 0}"
1,b,"{'m0': 1, 'm1': 1, 'm2': 1}"
2,c,"{'m1': 2, 'm2': 2, 'm3': 2}"
3,d,"{'m1': 3, 'm2': 3}"


In [11]:
# Method 1: pull the dicts out as a plain Python list and let the DataFrame
# constructor turn the keys into columns (absent keys become NaN).
dict_rows = df3['B'].tolist()
temp31 = pd.DataFrame(dict_rows)
temp31

Unnamed: 0,m1,m2,m0,m3
0,0,0,,
1,1,1,1.0,
2,2,2,,2.0
3,3,3,,


In [12]:
# Method 2: call pd.Series on each dict in the column and let pandas assemble
# the per-row Series into a dataframe. Note that the displayed result holds
# floats (0.0, 1.0, ...) where the list-based approach kept integers.
temp3 = df3['B'].apply(pd.Series)  # it seems that this method is very slow
temp3

Unnamed: 0,m1,m2,m0,m3
0,0.0,0.0,,
1,1.0,1.0,1.0,
2,2.0,2.0,,2.0
3,3.0,3.0,,


## Handling Missing Values

### Filtering out missing values

In [13]:
from numpy import nan as NA

In [14]:
# Series with two missing entries; dropna() keeps only the non-missing ones
# and preserves their original index labels.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [15]:
# Same result as dropna(): keep the entries where the value is present.
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [16]:
# The complement: keep only the entries that are missing.
data.loc[data.isna()]

1   NaN
3   NaN
dtype: float64

In [17]:
# 4x3 frame with a fully populated row, two partially missing rows and one
# row that is entirely missing.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3.],
])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
# Boolean mask: True wherever a value is missing.
data.isna()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,True
2,True,True,True
3,True,False,False


In [19]:
# Boolean mask: True wherever a value is present.
data.notna()

Unnamed: 0,0,1,2
0,True,True,True
1,True,False,False
2,False,False,False
3,False,True,True


```
data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
```

In [20]:
# With the default how='any', a row is dropped as soon as it contains a
# single missing value.
cleaned = data.dropna(axis='index')
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [21]:
# Drop a row only when every value in it is missing.
data.dropna(axis='index', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [22]:
# Column-wise variant: drop only the columns that have no meaningful values
# at all (every entry missing). Not commonly needed.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [23]:
# Keep only the rows that have at least two non-missing values.
data.dropna(axis='index', thresh=2)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
3,,6.5,3.0


### Filling missing values

```
df.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)
```
- `value`: scalar, dict, Series or DataFrame
- `method`: `{'backfill', 'bfill', 'pad', 'ffill', None}`, default `None` (deprecated since pandas 2.1; prefer `df.ffill()` / `df.bfill()`)

In [24]:
# Replace every missing value with the same scalar.
data.fillna(value=0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [25]:
# Per-column fill values, keyed by column label; columns that are not listed
# (here column 0) keep their missing values.
fill_by_column = {1: 0.5, 2: 0}
data.fillna(value=fill_by_column)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.5,0.0
2,,0.5,0.0
3,,6.5,3.0


In [26]:
# 6x3 frame of standard-normal draws (taken from the seeded global NumPy
# generator), with the tail of columns 1 and 2 blanked out so there are
# runs of missing values to fill.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = np.nan  # rows 2..5 of column 1
df.iloc[4:, 2] = np.nan  # rows 4..5 of column 2
df

Unnamed: 0,0,1,2
0,-0.204708,0.478943,-0.519439
1,-0.55573,1.965781,1.393406
2,0.092908,,0.769023
3,1.246435,,-1.296221
4,0.274992,,
5,0.886429,,


In [27]:
# Forward-fill: carry the last valid observation down each column, filling
# at most two consecutive missing values per gap (limit=2).
# DataFrame.ffill() is used because fillna(method=...) is deprecated in
# recent pandas releases and emits a FutureWarning; the result is the same.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.204708,0.478943,-0.519439
1,-0.55573,1.965781,1.393406
2,0.092908,1.965781,0.769023
3,1.246435,1.965781,-1.296221
4,0.274992,,-1.296221
5,0.886429,,-1.296221


## Replacing Values

In [28]:
# Float series in which -999 and -1000 act as sentinel codes for bad readings.
data = pd.Series([1, -999, 2, -999, -1000, 3], dtype='float64')
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [29]:
# Turn a single sentinel value into a proper missing value.
data.replace(to_replace=-999, value=np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [30]:
# Lists work too: the i-th old value maps to the i-th new value, so when both
# arguments are lists they must have the same length.
old_values = [-999, -1000]
new_values = [np.nan, 0]
data.replace(to_replace=old_values, value=new_values)

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [31]:
# A dictionary of {old value: new value} expresses the same mapping.
replacements = {-999: np.nan, -1000: 0}
data.replace(to_replace=replacements)

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## Removing duplicates

In [32]:
# Small frame whose last row repeats the row before it.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [33]:
# Flag each row that repeats an earlier row; the first occurrence itself is
# not flagged (keep='first' is the default).
data.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

```
data.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
```
- `subset`: column names, optional
- `keep`: `{'first', 'last', False}`, default `'first'`

In [34]:
# Remove rows that repeat an earlier row across all columns, keeping the
# first occurrence.
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [35]:
# Judge duplicates by the 'k1' column only, so one row survives per distinct
# k1 value (the first one encountered).
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,1
