## Pandas - Data Preparation and Cleaning
### Handling Missing Values

In [2]:
import pandas as pd
import numpy  as np

In [2]:
# Seed NumPy's global RNG so the random integers below are reproducible.
np.random.seed(321)
# 5x3 frame of integers drawn from [1, 100); the index labels deliberately
# skip letters so that a later reindex introduces missing rows.
df = pd.DataFrame(
    data=np.random.randint(low=1, high=100, size=(5, 3)),
    columns=['one', 'two', 'three'],
    index=['a', 'c', 'e', 'f', 'h'],
)
df

Unnamed: 0,one,two,three
a,27,32,42
c,73,18,41
e,27,89,73
f,84,86,2
h,28,27,62


In [3]:
# Add a string, a boolean and a timestamp column so the frame has mixed dtypes.
df['four'] = 'abc'
df['five'] = df['one'].gt(50)  # True where column 'one' exceeds 50
df['six'] = pd.Timestamp('20190101')
df

Unnamed: 0,one,two,three,four,five,six
a,27,32,42,abc,False,2019-01-01
c,73,18,41,abc,True,2019-01-01
e,27,89,73,abc,False,2019-01-01
f,84,86,2,abc,True,2019-01-01
h,28,27,62,abc,False,2019-01-01


In [4]:
# Reindex onto a..h; labels absent from df ('b', 'd', 'g') become all-missing rows.
df2 = df.reindex(index=list('abcdefgh'))
df2

Unnamed: 0,one,two,three,four,five,six
a,27.0,32.0,42.0,abc,False,2019-01-01
b,,,,,,NaT
c,73.0,18.0,41.0,abc,True,2019-01-01
d,,,,,,NaT
e,27.0,89.0,73.0,abc,False,2019-01-01
f,84.0,86.0,2.0,abc,True,2019-01-01
g,,,,,,NaT
h,28.0,27.0,62.0,abc,False,2019-01-01


In [5]:
df2['one']

a    27.0
b     NaN
c    73.0
d     NaN
e    27.0
f    84.0
g     NaN
h    28.0
Name: one, dtype: float64

In [6]:
pd.isna(df2['one'])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [7]:
pd.isnull(df2['one'])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [8]:
pd.notna(df2['one'])

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool

In [9]:
df2

Unnamed: 0,one,two,three,four,five,six
a,27.0,32.0,42.0,abc,False,2019-01-01
b,,,,,,NaT
c,73.0,18.0,41.0,abc,True,2019-01-01
d,,,,,,NaT
e,27.0,89.0,73.0,abc,False,2019-01-01
f,84.0,86.0,2.0,abc,True,2019-01-01
g,,,,,,NaT
h,28.0,27.0,62.0,abc,False,2019-01-01


In [10]:
df2.isna()

Unnamed: 0,one,two,three,four,five,six
a,False,False,False,False,False,False
b,True,True,True,True,True,True
c,False,False,False,False,False,False
d,True,True,True,True,True,True
e,False,False,False,False,False,False
f,False,False,False,False,False,False
g,True,True,True,True,True,True
h,False,False,False,False,False,False


In [12]:
df2.isnull()

Unnamed: 0,one,two,three,four,five,six
a,False,False,False,False,False,False
b,True,True,True,True,True,True
c,False,False,False,False,False,False
d,True,True,True,True,True,True
e,False,False,False,False,False,False
f,False,False,False,False,False,False
g,True,True,True,True,True,True
h,False,False,False,False,False,False


In [13]:
df2['one'] 

a    27.0
b     NaN
c    73.0
d     NaN
e    27.0
f    84.0
g     NaN
h    28.0
Name: one, dtype: float64

In [14]:
df2['one'].sum(), df2['one'].prod()

(239.0, 125166384.0)

In [15]:
df2['one'].cumsum()

a     27.0
b      NaN
c    100.0
d      NaN
e    127.0
f    211.0
g      NaN
h    239.0
Name: one, dtype: float64

In [16]:
df2['one'].mean()

47.8

In [17]:
df2['one'].mean(skipna=False)

nan

### Filling missing values

In [None]:
df2

In [None]:
df2.fillna(0)

In [None]:
 df2['one']

In [None]:
df2['one'].fillna('missing')

In [None]:
df2

In [None]:
# Forward-fill: propagate the last valid value downwards.
# fillna(method='pad') is deprecated since pandas 2.1; ffill() is the supported spelling.
df2.ffill()

In [None]:
df2.ffill()

In [None]:
df2

In [None]:
# Backward-fill: propagate the next valid value upwards.
# fillna(method='bfill') is deprecated since pandas 2.1; bfill() is the supported spelling.
df2.bfill()

In [None]:
df2.bfill()

In [None]:
df2

In [None]:
# Fill each column's gaps with that column's mean.  numeric_only=True is needed
# because pandas >= 2.0 raises TypeError when asked to average the string column 'four'.
df2.fillna(df.mean(numeric_only=True))

In [None]:
df2['five']

In [None]:
df2['five'].sum(), df2['five'].count(), df2['five'].mean()

In [None]:
df2

In [None]:
# Column means of the numeric (and boolean) columns only; without numeric_only=True
# pandas >= 2.0 raises TypeError on the string column 'four'.
df.mean(numeric_only=True)

In [None]:
# Keep only the means of 'one'..'three' (label slice, inclusive on both ends).
# numeric_only=True avoids the TypeError pandas >= 2.0 raises on the string column 'four'.
df.mean(numeric_only=True)['one':'three']

In [None]:
# Fill only columns 'one'..'three' with their means; other columns keep their gaps.
# numeric_only=True avoids the TypeError pandas >= 2.0 raises on the string column 'four'.
df2.fillna(df.mean(numeric_only=True)['one':'three'])

### dropna

In [None]:
df2

In [None]:
df2.dropna()

In [None]:
df2.dropna(axis=0)

In [None]:
df2.dropna(axis=1)

In [None]:
df2

### Removing Duplicates

In [None]:
# Small frame with two fully duplicated rows: (one, 2) and (two, 4).
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 2, 4, 4],
})
data

In [None]:
data.duplicated()

In [None]:
data.drop_duplicates()

In [None]:
data['v1'] = range(7)
data

In [None]:
data.drop_duplicates(['k1'])

In [None]:
data.drop_duplicates(['k1'], keep = 'last')

In [None]:
data

In [None]:
data.drop_duplicates(['k1', 'k2'], keep = 'last')

### Data Transformation with Function Mapping

In [None]:
data = pd.DataFrame({'course': ['cs1', 'cs2', 'CS1', 'Cs2'],
                     'grade': ['A', 'B', 'A-', 'B+']})
data

In [None]:
course_to_name = {'cs1' : 'Python', 'cs2' : 'Java'}

In [None]:
data.course

In [None]:
lc = data.course.str.lower()
lc

In [None]:
data['language'] = lc.map(course_to_name)
data

In [None]:
# or
data['language'] = data['course'].map(
                        lambda x : course_to_name[x.lower()])
data

### Replacing Values

In [7]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [4]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [5]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [8]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [9]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [10]:
data = pd.DataFrame({'course': ['cs1', 'cs2', 'CS1', 'Cs2'],
                     'grade': ['A', 'B', 'A-', 'B+']})
data

Unnamed: 0,course,grade
0,cs1,A
1,cs2,B
2,CS1,A-
3,Cs2,B+


In [None]:
data.replace({'A': 100, 'A-' : 90, 'B+': 80, 'B': 70}, inplace=True)
data

In [None]:
# Replace every 100 / 90 with the next remaining value below it.
# replace(..., method='bfill') is deprecated since pandas 2.1, so blank out the
# matching cells and back-fill them instead.
data.mask(data.isin([100, 90])).bfill()

### Renaming Axis Indexes

In [None]:
# 3x4 frame holding 0..11 row by row, labelled by state name and by number word.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Massachusetts'],
)
data

In [None]:
data.index = data.index.map(lambda x: x[:2].upper())
data

In [None]:
data.rename(index=str.title, columns=str.upper)

In [None]:
data

In [None]:
data.rename(index={'OH': 'Ohio', 'MA': 'Mass'},
            columns={'two': '2', 'four': '4'})

In [None]:
data

In [None]:
data.rename(index={'OH': 'Ohio', 'MA': 'Mass'},
            columns={'two': '2', 'four': '4'}, inplace=True)
data

### Discretization and Binning
 - pandas.cut() - Bin values into discrete intervals.

In [None]:
ages = [37, 22, 25, 27, 21, 23, 20, 31, 61, 45, 41, 32]

In [None]:
# Distribute into 3 bins

cats = pd.cut(ages, 3)
cats

In [None]:
# Specify bin ranges

bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

In [None]:
cats.codes

In [None]:
cats.categories

In [None]:
# Count how many ages fall into each bin, most frequent bin first.
# The top-level pd.value_counts() is deprecated since pandas 2.1; wrapping the
# Categorical in a Series keeps the same sort-by-count ordering.
pd.Series(cats).value_counts()

In [None]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

In [None]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

In [None]:
x,y = pd.cut(ages, bins, labels=group_names, retbins=True)
x

In [None]:
y

In [None]:
x.categories

In [None]:
x.to_list()

In [None]:
pd.DataFrame({'age': ages, 'class':pd.cut(ages, bins, labels=group_names)})

### qcut - Quantile-based discretization function
 - Discretize variable into equal-sized buckets based on rank or based on sample quantiles

In [None]:
# Seed the global RNG so the sample (and therefore the bin edges) is reproducible.
np.random.seed(123)

data = pd.Series(np.random.randn(1000))  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
# Series.value_counts() replaces the top-level pd.value_counts(), deprecated in pandas 2.1.
cats.value_counts()

In [None]:
data.describe()

In [None]:
# Alternatively, pass the quantile edges explicitly (same quartiles as qcut(data, 4)).

cats = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.])
# Series.value_counts() replaces the top-level pd.value_counts(), deprecated in pandas 2.1.
cats.value_counts()

In [None]:
# Deciles: ten equal-sized buckets.

cats = pd.qcut(data, 10)
# Series.value_counts() replaces the top-level pd.value_counts(), deprecated in pandas 2.1.
cats.value_counts()

In [None]:
# Quartiles again, but with the four buckets labelled 'A'..'D' from lowest to highest.
cats = pd.qcut(data, 4, labels=['A', 'B', 'C', 'D'])
# Series.value_counts() replaces the top-level pd.value_counts(), deprecated in pandas 2.1.
cats.value_counts()

In [None]:
# Reseed so the random integers below are the same on every run.
np.random.seed(123)

# 100 rows x 3 columns of integers drawn from [40, 100] (upper bound 101 is exclusive).
df = pd.DataFrame(
    data=np.random.randint(low=40, high=101, size=(100, 3)),
    columns=['Q1', 'Q2', 'Q3'],
)
df.head()

In [None]:
df['Average'] = np.round(df.mean(axis=1))
df.head()

In [None]:
df['Average']

In [None]:
# Bin the averages into right-closed intervals (0,40], (40,60], (60,80], (80,100]
# and label them 'D'..'A' from lowest to highest.
cats = pd.cut(df['Average'], bins=[0, 40, 60, 80, 100], labels=['D', 'C', 'B', 'A'])
# sort=False keeps the counts in category order (D, C, B, A) rather than by frequency.
# Series.value_counts() replaces the top-level pd.value_counts(), deprecated in pandas 2.1.
cats.value_counts(sort=False)

In [None]:
df['Grade'] = cats
df

In [None]:
df.sort_values(by='Average', ascending=False)