 # Data Cleaning

 Data cleaning is one of the most important and time consuming process of any data analysis project. Such tasks are often reported to take up 80% or more of an analyst's time.

 ### Handling Missing Data

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
ser1 = pd.Series([1.2,4.1,None,3.8,np.nan])
ser1

0    1.2
1    4.1
2    NaN
3    3.8
4    NaN
dtype: float64

In [3]:
ser1.isna()

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [4]:
ser1.dropna()

0    1.2
1    4.1
3    3.8
dtype: float64

In [5]:
ser1.fillna(value=0)

0    1.2
1    4.1
2    0.0
3    3.8
4    0.0
dtype: float64

In [6]:
ser1.fillna(method="ffill")

0    1.2
1    4.1
2    4.1
3    3.8
4    3.8
dtype: float64

In [7]:
ser1.fillna(value=ser1.mean()).round(1)

0    1.2
1    4.1
2    3.0
3    3.8
4    3.0
dtype: float64

In [8]:
# filter only nan values
ser1[ser1.isna()]

2   NaN
4   NaN
dtype: float64

In [9]:
# same as dropna
ser1[ser1.notna()]

0    1.2
1    4.1
3    3.8
dtype: float64

In [10]:
df1 = pd.DataFrame([[1.2,2.1,4.6],[np.nan,None,np.nan],[6.2,np.nan,9.1],[np.nan,8.2,np.nan]])
df1

Unnamed: 0,0,1,2
0,1.2,2.1,4.6
1,,,
2,6.2,,9.1
3,,8.2,


In [11]:
# Drops rows by default
df1.dropna()

Unnamed: 0,0,1,2
0,1.2,2.1,4.6


In [12]:
df1.dropna(axis=1)

0
1
2
3


In [13]:
df1.dropna(how="all")

Unnamed: 0,0,1,2
0,1.2,2.1,4.6
2,6.2,,9.1
3,,8.2,


In [14]:
df1.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,1.2,2.1,4.6
1,,,
2,6.2,,9.1
3,,8.2,


In [15]:
df2 = pd.DataFrame(np.random.standard_normal((7,3)))
df2.iloc[:2,2] = np.nan
df2.iloc[:4,1] = np.nan
df2

Unnamed: 0,0,1,2
0,-0.210651,,
1,-0.215459,,
2,0.797443,,-0.078286
3,1.201123,,0.388459
4,0.866801,0.361565,-0.31785
5,-0.787326,1.027706,-1.641184
6,-0.002316,0.691208,0.818731


In [16]:
df2.dropna()

Unnamed: 0,0,1,2
4,0.866801,0.361565,-0.31785
5,-0.787326,1.027706,-1.641184
6,-0.002316,0.691208,0.818731


 We can specify a threshold, max number of rows or cols to be dropped:

In [17]:
# drop 2 rows max
df2.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.797443,,-0.078286
3,1.201123,,0.388459
4,0.866801,0.361565,-0.31785
5,-0.787326,1.027706,-1.641184
6,-0.002316,0.691208,0.818731


 ### Filling Missing Data

 Rather than filtering out missing data (and potentially discarding other data along with it), you may want to fill in the “holes” in any number of ways.

In [18]:
# Don't mutate the original
# df2.columns = list("abc")
# df2

# Use this instead
df3 = df2.set_axis(list("abc"),axis=1).round(2)
df3

Unnamed: 0,a,b,c
0,-0.21,,
1,-0.22,,
2,0.8,,-0.08
3,1.2,,0.39
4,0.87,0.36,-0.32
5,-0.79,1.03,-1.64
6,-0.0,0.69,0.82


 Fill with column specific values:

In [19]:
df3.fillna({"a":0.2,"b":0.4,"c":0.8})

Unnamed: 0,a,b,c
0,-0.21,0.4,0.8
1,-0.22,0.4,0.8
2,0.8,0.4,-0.08
3,1.2,0.4,0.39
4,0.87,0.36,-0.32
5,-0.79,1.03,-1.64
6,-0.0,0.69,0.82


In [20]:
# Fill backwards:
df3.fillna(method="bfill")

Unnamed: 0,a,b,c
0,-0.21,0.36,-0.08
1,-0.22,0.36,-0.08
2,0.8,0.36,-0.08
3,1.2,0.36,0.39
4,0.87,0.36,-0.32
5,-0.79,1.03,-1.64
6,-0.0,0.69,0.82


In [21]:
# Fill across columns
df3.fillna(method="ffill", axis=1)

Unnamed: 0,a,b,c
0,-0.21,-0.21,-0.21
1,-0.22,-0.22,-0.22
2,0.8,0.8,-0.08
3,1.2,1.2,0.39
4,0.87,0.36,-0.32
5,-0.79,1.03,-1.64
6,-0.0,0.69,0.82


In [22]:
# Can also have a threshold:
df3.fillna(method="ffill", axis=1, limit=1)

Unnamed: 0,a,b,c
0,-0.21,-0.21,
1,-0.22,-0.22,
2,0.8,0.8,-0.08
3,1.2,1.2,0.39
4,0.87,0.36,-0.32
5,-0.79,1.03,-1.64
6,-0.0,0.69,0.82


In [23]:
df3.fillna(df3.mean(axis=0)).round(2)

Unnamed: 0,a,b,c
0,-0.21,0.69,-0.17
1,-0.22,0.69,-0.17
2,0.8,0.69,-0.08
3,1.2,0.69,0.39
4,0.87,0.36,-0.32
5,-0.79,1.03,-1.64
6,-0.0,0.69,0.82


 ### Data Transformation

In [24]:
df4 = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"], "k2": [1,1,2,3,3,4,4]})
df4

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [25]:
# True if record is a duplicate
df4.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [26]:
df4.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [27]:
df4["v1"] = np.arange(7)
df4

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [28]:
df4.drop_duplicates(subset="k1")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [29]:
df4.drop_duplicates(subset=["k1"], keep="last")

Unnamed: 0,k1,k2,v1
4,one,3,4
6,two,4,6


In [30]:
meat_df = pd.DataFrame({"food": ["bacon","pulled pork","bacon","pastrami","corned beef","bacon","pastrami","honey ham","nova lox"], "ounces": np.abs((np.random.standard_normal(9)*8).round(1))})
meat_df

Unnamed: 0,food,ounces
0,bacon,8.7
1,pulled pork,2.5
2,bacon,1.5
3,pastrami,5.2
4,corned beef,3.9
5,bacon,3.2
6,pastrami,7.4
7,honey ham,5.5
8,nova lox,10.4


 Let's say you want to map each meat with the animal it came from:

In [31]:
# Method #1:
meat_to_ani = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}

meat_df["animal"] = meat_df["food"].map(meat_to_ani)
meat_df

Unnamed: 0,food,ounces,animal
0,bacon,8.7,pig
1,pulled pork,2.5,pig
2,bacon,1.5,pig
3,pastrami,5.2,cow
4,corned beef,3.9,cow
5,bacon,3.2,pig
6,pastrami,7.4,cow
7,honey ham,5.5,pig
8,nova lox,10.4,salmon


In [32]:
# Method #2
meat_df["food"].map(lambda food: meat_to_ani[food])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [33]:
def food_mapper(food):
  animal_prods = {
    "pig": ["bacon", "pulled pork", "honey ham"],
    "cow": ["pastrami", "corned beef"],
    "salmon": ["nova lox"]
  }

  for ani, foods in animal_prods.items():
    if food in foods:
      return ani

In [34]:
meat_df["food"].map(food_mapper)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [35]:
s1 = pd.Series([12.1, 8, -999, -1000, 0.8, -999])
s1

0      12.1
1       8.0
2    -999.0
3   -1000.0
4       0.8
5    -999.0
dtype: float64

In [36]:
# Value -999 might be a sentinel:
s1.replace(-999, np.nan)

0      12.1
1       8.0
2       NaN
3   -1000.0
4       0.8
5       NaN
dtype: float64

In [37]:
s1.replace([-999,-1000], np.nan)

0    12.1
1     8.0
2     NaN
3     NaN
4     0.8
5     NaN
dtype: float64

In [38]:
s1.replace([-999,-1000], [np.nan, 0.0])

0    12.1
1     8.0
2     NaN
3     0.0
4     0.8
5     NaN
dtype: float64

In [39]:
# Can also provide a dictionary
s1.replace({-999: np.nan, 1000: 0.0})

0      12.1
1       8.0
2       NaN
3   -1000.0
4       0.8
5       NaN
dtype: float64

 ### Renaming Axes

In [40]:
df5 = pd.DataFrame(
  data=(np.random.standard_normal((3,4))*8).round(2),
  columns=list("abcd"),
  index=["Ohio", "Colorado", "New York"]
)
df5

Unnamed: 0,a,b,c,d
Ohio,-0.54,3.32,-1.39,-6.31
Colorado,-5.45,0.46,-6.1,3.12
New York,1.92,-0.29,-28.39,7.22


In [41]:
df5.index = df5.index.map(lambda city: city[:4].upper().strip())
df5

Unnamed: 0,a,b,c,d
OHIO,-0.54,3.32,-1.39,-6.31
COLO,-5.45,0.46,-6.1,3.12
NEW,1.92,-0.29,-28.39,7.22


In [42]:
# If you don't want to mutate the original:
df5.rename(mapper=lambda city: city.title(), axis=0)

Unnamed: 0,a,b,c,d
Ohio,-0.54,3.32,-1.39,-6.31
Colo,-5.45,0.46,-6.1,3.12
New,1.92,-0.29,-28.39,7.22


In [43]:
# Change both index and column together
df5.rename(
  index=lambda i: i.title(),
  columns=lambda c: c.upper()
)

Unnamed: 0,A,B,C,D
Ohio,-0.54,3.32,-1.39,-6.31
Colo,-5.45,0.46,-6.1,3.12
New,1.92,-0.29,-28.39,7.22


In [44]:
df5.rename(columns={"a": "foo", "c": "fizz"}, index={"NEW": "GOLD"}).rename(lambda i: i.title(), axis=0)

Unnamed: 0,foo,b,fizz,d
Ohio,-0.54,3.32,-1.39,-6.31
Colo,-5.45,0.46,-6.1,3.12
Gold,1.92,-0.29,-28.39,7.22


 ### Discretization & Binning

 Continuous data is often discretized or otherwise separated into 'bins' for analysis. Eg: [1.2, 3.2, 2.8] => (1, 4] -> range len: 3

 **Note**: Excludes lower and includes upper by default.

In [45]:
ages = [28, 20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32, 67]
bins = [18,25,35,60,100]

age_cate = pd.cut(ages,bins)
age_cate

[(25, 35], (18, 25], (18, 25], (18, 25], (25, 35], ..., (60, 100], (35, 60], (35, 60], (25, 35], (60, 100]]
Length: 14
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [46]:
type(age_cate)

pandas.core.arrays.categorical.Categorical

In [47]:
age_cate.codes

array([1, 0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1, 3], dtype=int8)

In [48]:
age_cate.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [49]:
# Frequencies of categorical data in a Series
age_cate.value_counts()
# OR
pd.value_counts(age_cate)

(18, 25]     5
(25, 35]     4
(35, 60]     3
(60, 100]    2
dtype: int64

 **Note**: In the string representation of an interval, a parenthesis means that side is open (exclusive), while the square bracket means it's closed (inclusive).

 (18, 25] => 18 isn't included, but 25 is.

In [50]:
# Right side becomes exclusive:
pd.cut(ages,bins,right=False) # [18, 25)...

[[25, 35), [18, 25), [18, 25), [25, 35), [25, 35), ..., [60, 100), [35, 60), [35, 60), [25, 35), [60, 100)]
Length: 14
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [51]:
age_group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]

age_cat = pd.cut(ages, bins, labels=age_group_names, right=False)
age_cat

['YoungAdult', 'Youth', 'Youth', 'YoungAdult', 'YoungAdult', ..., 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult', 'Senior']
Length: 14
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [52]:
# Returns pd.Index
age_cat.categories

Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')

In [58]:
uni = np.random.uniform(size=100)
# Gives 4 equally ranage cuts:
pd.cut(uni,4,precision=2).value_counts()
# precision=2, limits the precision to 2 decimal places

(0.023, 0.27]    31
(0.27, 0.51]     21
(0.51, 0.75]     23
(0.75, 0.99]     25
dtype: int64

In [59]:
pd.qcut(uni,4,precision=2).value_counts()

(0.014, 0.19]    25
(0.19, 0.47]     25
(0.47, 0.75]     25
(0.75, 0.99]     25
dtype: int64

In [61]:
# Using qcuts gives 4 equally allocated ranges:
pd.qcut(np.random.standard_normal(1000),4,precision=2).value_counts()

(-2.8, -0.75]      250
(-0.75, -0.039]    250
(-0.039, 0.6]      250
(0.6, 3.25]        250
dtype: int64

 **Note**: `.qcut` represents quartile cut

 Similar to `.cut`, we can pass custom quartiles from 0 to 1:

In [62]:
pd.qcut(np.random.standard_normal(1000), [0, 0.2, 0.5, 0.7, 1], precision=2).value_counts()

(-3.4, -0.86]     200
(-0.86, 0.017]    300
(0.017, 0.57]     200
(0.57, 2.78]      300
dtype: int64

 **Note**: These discretization functions are useful for quartile & group analysis.

 ### Detecting & Filtering Outliers

 Find values in one of the columns exceeding 3 in absolute value:

In [66]:
df6 = pd.DataFrame(np.random.standard_normal((1000,4)))
df6.head()

Unnamed: 0,0,1,2,3
0,0.990002,0.596223,-0.128379,0.100184
1,1.450916,-0.173325,1.428712,-0.233186
2,1.13382,-0.320932,-0.009519,1.313134
3,-1.585892,-0.067293,0.918519,-2.333441
4,0.061174,-0.394256,1.198163,0.099848


In [67]:
df6.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.035417,-0.017417,-0.007619,0.059124
std,1.006779,1.038202,0.989057,0.99424
min,-3.120093,-3.899576,-3.320554,-2.618011
25%,-0.777111,-0.701278,-0.640088,-0.614028
50%,-0.019539,-0.026583,-0.030902,0.025094
75%,0.654993,0.66802,0.626231,0.749863
max,2.966016,3.822949,2.63137,3.800494


 Find values in one of the columns exceeding 3 in absolute value:

In [68]:
col = df6[2]
col[col.abs() > 3]

129   -3.004406
145   -3.320554
Name: 2, dtype: float64

 Select the entire row that satisfy the above condition:

In [70]:
df6[(df6.abs() > 3).any(axis=1)]
# The conditional gives us a boolean df, then any returns True if any True found in a row (axis=1), then filter the data.

Unnamed: 0,0,1,2,3
40,-0.665151,-3.899576,1.637669,1.172745
46,-0.835341,-0.007824,-0.71414,3.064668
55,-0.915547,-0.579042,1.124267,3.10583
129,-0.231221,0.865852,-3.004406,-0.014779
145,-2.070884,-0.158795,-3.320554,-1.064087
216,0.212932,0.630559,-1.281067,3.062019
314,-0.145983,3.201613,-0.987171,0.266442
441,0.119219,3.155594,1.098378,-1.861103
644,0.306656,3.822949,-1.33116,-2.209439
711,0.463785,0.233165,-2.038178,3.2235


In [71]:
# Return rows where all elements, in that row, is greater than 1.1 in abs value:
df6[(df6.abs() > 1.1).all(axis=1)]

Unnamed: 0,0,1,2,3
81,1.283594,1.426499,1.118452,-1.30326
179,1.260835,1.660628,-1.138708,-1.171844
362,-1.347389,1.842733,-1.628728,-2.300129
585,1.611829,1.243233,1.411001,-1.220186
716,1.26409,-1.154082,-1.695001,-1.201512
756,2.153797,1.388727,1.774277,-1.155847
786,-2.377738,-1.995753,-1.682248,1.456932
898,-1.267168,1.467145,1.410619,-1.463256


 Cap the values to -3 to +3

In [72]:
df7 = df6.copy()
# np.sign => (-1 if x < 0), (0 if x == 0), (1 if x > 0)
df7[df7.abs() > 3] = np.sign(df7) * 3

In [73]:
df7.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.035297,-0.017686,-0.007294,0.057651
std,1.006418,1.031378,0.98802,0.989496
min,-3.0,-3.0,-3.0,-2.618011
25%,-0.777111,-0.701278,-0.640088,-0.614028
50%,-0.019539,-0.026583,-0.030902,0.025094
75%,0.654993,0.66802,0.626231,0.749863
max,2.966016,3.0,2.63137,3.0


In [74]:
np.sign(df7).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,1.0
1,1.0,-1.0,1.0,-1.0
2,1.0,-1.0,-1.0,1.0
3,-1.0,-1.0,1.0,-1.0
4,1.0,-1.0,1.0,1.0


 ### Permutation & Random Sampling

In [81]:
df8 = pd.DataFrame(np.arange(5*7).reshape((5,7)))
df8

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [82]:
# Same as row length
row_sampler = np.random.permutation(5)
row_sampler

array([4, 1, 2, 0, 3])

In [83]:
df8.take(row_sampler)
# OR
df8.iloc[row_sampler]

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27


In [84]:
col_sampler = np.random.permutation(7)
col_sampler

array([4, 6, 2, 5, 1, 3, 0])

In [85]:
df8.take(col_sampler, axis=1)
#OR
df8[col_sampler]

Unnamed: 0,4,6,2,5,1,3,0
0,4,6,2,5,1,3,0
1,11,13,9,12,8,10,7
2,18,20,16,19,15,17,14
3,25,27,23,26,22,24,21
4,32,34,30,33,29,31,28


In [86]:
# Mixing it up
df8.take(row_sampler).take(col_sampler, axis=1)

Unnamed: 0,4,6,2,5,1,3,0
4,32,34,30,33,29,31,28
1,11,13,9,12,8,10,7
2,18,20,16,19,15,17,14
0,4,6,2,5,1,3,0
3,25,27,23,26,22,24,21


 Select random subset without replacement (no same rows):

In [89]:
df8.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34


In [90]:
ser2 = pd.Series([1,0,-4,2,7])
ser2

0    1
1    0
2   -4
3    2
4    7
dtype: int64

 Random subset with replacement, pass replace=True:

In [91]:
ser2.sample(n=10, replace=True)

1    0
4    7
4    7
1    0
0    1
2   -4
3    2
3    2
1    0
2   -4
dtype: int64

 ### Dummy Variables

In [92]:
df9 = pd.DataFrame({"key": ["a", "c", "b", "b", "a", "a"], "data": np.random.standard_normal(6)})
df9

Unnamed: 0,key,data
0,a,-0.581197
1,c,-0.725189
2,b,-1.874413
3,b,-1.323828
4,a,0.462608
5,a,0.008121


In [93]:
pd.get_dummies(df9["key"])

Unnamed: 0,a,b,c
0,1,0,0
1,0,0,1
2,0,1,0
3,0,1,0
4,1,0,0
5,1,0,0


In [94]:
dummies = pd.get_dummies(df9["key"], prefix="key")
# Cannot join with a Series
df9[["data"]].join(dummies)

Unnamed: 0,data,key_a,key_b,key_c
0,-0.581197,1,0,0
1,-0.725189,0,0,1
2,-1.874413,0,1,0
3,-1.323828,0,1,0
4,0.462608,1,0,0
5,0.008121,1,0,0


In [95]:
movies = pd.read_table("datasets/movielens/movies.dat", sep="::", engine="python", header=None, names=["movie_id", "title", "genre"])
# !cat "datasets/movielens/movies.dat"
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [96]:
mov_dummies = movies["genre"].str.get_dummies(sep="|")
mov_dummies.iloc[:5, :4]

Unnamed: 0,Action,Adventure,Animation,Children's
0,0,0,1,1
1,0,1,0,1
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [104]:
movs_df = movies.join(mov_dummies.add_prefix("Genre_"))
movs_df = movs_df.rename(mapper=lambda name: name.lower(), axis=1)

In [105]:
movs_df.drop("genre",axis=1).head()

Unnamed: 0,movie_id,title,genre_action,genre_adventure,genre_animation,genre_children's,genre_comedy,genre_crime,genre_documentary,genre_drama,genre_fantasy,genre_film-noir,genre_horror,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_thriller,genre_war,genre_western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [106]:
vals = np.random.uniform(size=10)
print(vals)
test_bins = [0,0.2,0.4,0.6,0.8,1]
# pd.cut(vals,test_bins)

[0.37035502 0.29519062 0.39766179 0.25637147 0.28547708 0.93139144
 0.46677726 0.86824302 0.97318579 0.66482057]


In [107]:
pd.get_dummies(pd.cut(vals,test_bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0
5,0,0,0,0,1
6,0,0,1,0,0
7,0,0,0,0,1
8,0,0,0,0,1
9,0,0,0,1,0


In [108]:
df10 = pd.DataFrame({"flat": np.random.standard_normal(6).round(2), "sex": ["male", "male", "female", "male", "female", "female"]})
df10

Unnamed: 0,flat,sex
0,-1.56,male
1,0.71,male
2,1.75,female
3,0.56,male
4,1.61,female
5,-1.25,female


In [109]:
# df10["sex"].map({"female":0,"male":1})
# OR
pd.get_dummies(df10["sex"],prefix="sex")

Unnamed: 0,sex_female,sex_male
0,0,1
1,0,1
2,1,0
3,0,1
4,1,0
5,1,0


 **Note**: Generally, if you have k possible values for a categorical variable, in this case sex can be 2 possible values: male and female; we use k-1 dummy variables to represent it.

In [113]:
df10[["flat"]].join(pd.get_dummies(df10["sex"], prefix="sex").iloc[:, 1:])

Unnamed: 0,flat,sex_male
0,-1.56,1
1,0.71,1
2,1.75,0
3,0.56,1
4,1.61,0
5,-1.25,0


In [114]:
df11 = pd.concat([df9, df10], axis=1)
df11

Unnamed: 0,key,data,flat,sex
0,a,-0.581197,-1.56,male
1,c,-0.725189,0.71,male
2,b,-1.874413,1.75,female
3,b,-1.323828,0.56,male
4,a,0.462608,1.61,female
5,a,0.008121,-1.25,female


In [117]:
# If we pass the entire DataFrame, use drop_first to get that k-1 dummy variables
pd.get_dummies(df11, columns=["key", "sex"], drop_first=True)

Unnamed: 0,data,flat,key_b,key_c,sex_male
0,-0.581197,-1.56,0,0,1
1,-0.725189,0.71,0,1,1
2,-1.874413,1.75,1,0,0
3,-1.323828,0.56,1,0,1
4,0.462608,1.61,0,0,0
5,0.008121,-1.25,0,0,0


 ### Extensions Data Types

In [119]:
# nan is makes the entire Series dtype: float64 instead of int64
# Mainly because of backward compatibility reasons
pd.Series([1,2,3,np.nan])

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [120]:
# dtype=pd.Int64Dtype() or "Int64"
s1 = pd.Series([1,2,3,None],dtype="Int64")
s1

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [121]:
s1.dtype

Int64Dtype()

In [122]:
s1[s1.notna()]

0    1
1    2
2    3
dtype: Int64

In [123]:
s1[3] is pd.NA

True

In [124]:
df12 = pd.DataFrame({
  "a": [1,None,3,4],
  "b": ["one", "two", None, "four"],
  "c": [None, False, True, True]
})
df12

Unnamed: 0,a,b,c
0,1.0,one,
1,,two,False
2,3.0,,True
3,4.0,four,True


In [125]:
for t,col_name in zip(["Int64", "string", "boolean"], list("abc")):
  df12[col_name] = df12[col_name].astype(t)

df12

Unnamed: 0,a,b,c
0,1.0,one,
1,,two,False
2,3.0,,True
3,4.0,four,True


In [126]:
df12.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       3 non-null      Int64  
 1   b       3 non-null      string 
 2   c       3 non-null      boolean
dtypes: Int64(1), boolean(1), string(1)
memory usage: 204.0 bytes


 ### String Manipulation

In [127]:
a = "a,b,  vik"
b = [s.strip() for s in a.split(",")]
b

['a', 'b', 'vik']

In [128]:
"::".join(b)

'a::b::vik'

In [129]:
print(a.find(":")) # Returns -1 if not found
print(a.index(",")) # Throws a ValueError

-1
1


In [130]:
"vik" in a

True

In [131]:
# Counts the occurances of the substring
a.count("z")

0

In [132]:
a.replace(",", "||")

'a||b||  vik'

 ### RegEx

In [133]:
re.split(r"\s+", "foo    bar\t baz  \tqux")

['foo', 'bar', 'baz', 'qux']

 Create a regex object to use the same expression to many strings:

In [134]:
text = """Bob bob25@proton.me
Vik vik.negi@gmail.com
Robb robb-stark@winter.got
Ryan ryan_reynolds@hollywood.la"""

regex = re.compile(r"[\w\.\-\_]+@\w+\.\w{2,4}")
regex.findall(text)

['bob25@proton.me',
 'vik.negi@gmail.com',
 'robb-stark@winter.got',
 'ryan_reynolds@hollywood.la']

In [135]:
print(regex.sub("exposed", text))

Bob exposed
Vik exposed
Robb exposed
Ryan exposed


In [136]:
regex_str = r"([\w\.\-\_]+)@(\w+)\.(\w{2,4})"
sep_reg = re.compile(regex_str)

In [137]:
m = sep_reg.match("vikram.s.negi@proton.me")
m.groups()

('vikram.s.negi', 'proton', 'me')

In [138]:
sep_reg.findall(text)

[('bob25', 'proton', 'me'),
 ('vik.negi', 'gmail', 'com'),
 ('robb-stark', 'winter', 'got'),
 ('ryan_reynolds', 'hollywood', 'la')]

In [139]:
# Why no zero count? We may never know!
print(sep_reg.sub(r"=> username: \1, domain: \2, suffix: \3", text))

Bob => username: bob25, domain: proton, suffix: me
Vik => username: vik.negi, domain: gmail, suffix: com
Robb => username: robb-stark, domain: winter, suffix: got
Ryan => username: ryan_reynolds, domain: hollywood, suffix: la


In [140]:
name_mail_dict = {name: mail_id for name, mail_id in [re.split(r"\s+", t) for t in text.split("\n")]}
name_mail_dict

{'Bob': 'bob25@proton.me',
 'Vik': 'vik.negi@gmail.com',
 'Robb': 'robb-stark@winter.got',
 'Ryan': 'ryan_reynolds@hollywood.la'}

In [141]:
name_mail_dict["Wes"] = np.nan
mail_ser = pd.Series(name_mail_dict)
mail_ser

Bob                bob25@proton.me
Vik             vik.negi@gmail.com
Robb         robb-stark@winter.got
Ryan    ryan_reynolds@hollywood.la
Wes                            NaN
dtype: object

In [142]:
mail_ser.isna()

Bob     False
Vik     False
Robb    False
Ryan    False
Wes      True
dtype: bool

In [143]:
mail_ser.str.contains("gmail")

Bob     False
Vik      True
Robb    False
Ryan    False
Wes       NaN
dtype: object

In [144]:
matches = mail_ser.str.findall(regex_str).str[0]
matches

Bob                (bob25, proton, me)
Vik             (vik.negi, gmail, com)
Robb         (robb-stark, winter, got)
Ryan    (ryan_reynolds, hollywood, la)
Wes                                NaN
dtype: object

In [145]:
matches.str.get(1)

Bob        proton
Vik         gmail
Robb       winter
Ryan    hollywood
Wes           NaN
dtype: object

In [146]:
mail_df = mail_ser.str.extract(regex_str)
mail_df.columns = pd.Index(["username", "domain", "suffix"])
mail_df.dropna()

Unnamed: 0,username,domain,suffix
Bob,bob25,proton,me
Vik,vik.negi,gmail,com
Robb,robb-stark,winter,got
Ryan,ryan_reynolds,hollywood,la


 ### Categorical Data

In [147]:
# The array of distinct values can be called the categories, dictionary, or levels of the data.
fruits = ["apple", "banana", "grapes"]
ser3 = pd.Series(fruits*3)
ser3

0     apple
1    banana
2    grapes
3     apple
4    banana
5    grapes
6     apple
7    banana
8    grapes
dtype: object

In [148]:
ser3.value_counts()

apple     3
banana    3
grapes    3
dtype: int64

In [149]:
ser4 = pd.Series([0,1,2,1,0]*2)
ser3.take(ser4)

0     apple
1    banana
2    grapes
1    banana
0     apple
0     apple
1    banana
2    grapes
1    banana
0     apple
dtype: object

In [150]:
fruits1 = fruits * 2
n_fruits = len(fruits1)
rng = np.random.default_rng(seed=12345)

fruits_df = pd.DataFrame({
  "basket_id": np.arange(n_fruits),
  "fruit": fruits1,
  "count": rng.integers(3,12,size=n_fruits),
  "weight": rng.uniform(0,5,size=n_fruits).round(2)
})
fruits_df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,9,3.38
1,1,banana,5,1.96
2,2,grapes,10,1.66
3,3,apple,5,2.99
4,4,banana,4,0.93
5,5,grapes,10,3.36


In [151]:
fruit_cate = fruits_df["fruit"].astype("category").array
fruit_cate

['apple', 'banana', 'grapes', 'apple', 'banana', 'grapes']
Categories (3, object): ['apple', 'banana', 'grapes']

In [152]:
type(fruit_cate)

pandas.core.arrays.categorical.Categorical

In [153]:
fruit_cate.categories

Index(['apple', 'banana', 'grapes'], dtype='object')

In [154]:
fruit_cate.codes

array([0, 1, 2, 0, 1, 2], dtype=int8)

In [155]:
dict(enumerate(fruit_cate.categories))

{0: 'apple', 1: 'banana', 2: 'grapes'}

In [157]:
cate1 = pd.Categorical(["doo", "bar", "zap", "doo", "zap"])
cate1

['doo', 'bar', 'zap', 'doo', 'zap']
Categories (3, object): ['bar', 'doo', 'zap']

In [158]:
pd.Categorical.from_codes([2,1,0,1,2],cate1.categories,ordered=True)

['zap', 'doo', 'bar', 'doo', 'zap']
Categories (3, object): ['bar' < 'doo' < 'zap']

In [164]:
draws = rng.standard_normal(1000)
bins1 = pd.qcut(draws,4,labels=[f"Q{i}" for i in range(1,5)])
bins1

['Q3', 'Q4', 'Q4', 'Q2', 'Q3', ..., 'Q2', 'Q2', 'Q1', 'Q4', 'Q3']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [165]:
bins1.dtype

CategoricalDtype(categories=['Q1', 'Q2', 'Q3', 'Q4'], ordered=True)

In [166]:
bins1.codes[:5]

array([2, 3, 3, 1, 2], dtype=int8)

In [167]:
bins1 = pd.Series(bins1,name="quartile")
df13 = pd.Series(draws).groupby(bins1).agg(["count","mean","max","min"]).reset_index()
df13

Unnamed: 0,quartile,count,mean,max,min
0,Q1,250,-1.222518,-0.631485,-2.830704
1,Q2,250,-0.279993,0.016428,-0.63029
2,Q3,250,0.33534,0.722118,0.016697
3,Q4,250,1.314522,3.721098,0.722309


In [168]:
df13["quartile"]

0    Q1
1    Q2
2    Q3
3    Q4
Name: quartile, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [171]:
n = 10_000_000
labels = pd.Series(["foo","bar","qux","gif"]*(n//4))
lab_cate = labels.astype("category")
lab_cate.dtype

CategoricalDtype(categories=['bar', 'foo', 'gif', 'qux'], ordered=False)

In [172]:
labels.memory_usage(deep=True)

600000128

In [173]:
lab_cate.memory_usage(deep=True)

10000540

In [174]:
%timeit labels.value_counts()

313 ms ± 20 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [175]:
%timeit lab_cate.value_counts()

38.1 ms ± 2.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [176]:
cate_s1 = pd.Series(list("abcd")*2).astype("category")
cate_s1

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [177]:
cate_s1.cat.set_categories(list("abcde")).value_counts()

a    2
b    2
c    2
d    2
e    0
dtype: int64

In [178]:
cate_s2 = cate_s1[cate_s1.isin(list("ab"))]
cate_s2

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [179]:
cate_s2.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): ['a', 'b']

In [180]:
cate_s3 = pd.Series(["dev","man","adn","fin"]*2,dtype="category")
cate_s3

0    dev
1    man
2    adn
3    fin
4    dev
5    man
6    adn
7    fin
dtype: category
Categories (4, object): ['adn', 'dev', 'fin', 'man']

In [181]:
pd.get_dummies(cate_s3,prefix="jd").iloc[:, :-1]

Unnamed: 0,jd_adn,jd_dev,jd_fin
0,0,1,0
1,0,0,0
2,1,0,0
3,0,0,1
4,0,1,0
5,0,0,0
6,1,0,0
7,0,0,1
