In [31]:
import pandas as pd
import numpy as np

In [36]:
df = pd.read_csv('eda_label_encoding_dataset.csv')

# Work on a small random sample so the examples stay quick and readable.
# A fixed random_state makes the same 20 rows come back on every run, so the
# displayed outputs stay reproducible after Restart Kernel -> Run All.

df = df.sample(n=20, random_state=42)

In [58]:
# Preview the first five rows of the sample (head() defaults to n=5)
df.head()

Unnamed: 0,date,number_of_strikes,center_point_geom,month_name
2521176,2017-02-16,2,POINT(-80.9 24.5),February
1889969,2017-06-16,7,POINT(-103.8 31.1),June
2698846,2017-08-28,2,POINT(-82.8 40.6),August
739581,2017-01-21,4,POINT(-94.4 31.5),January
2717489,2017-06-05,3,POINT(-80.4 42.2),June


`pd.isna()` is a pandas function that returns a Boolean object of the same shape as its input, indicating whether each value is null (`pd.isnull()` is an alias for it). The same check is also available as a DataFrame method, e.g. `df.isna()`.

In [39]:
# Flag missing entries: the result has the same shape as df,
# with True wherever a value is null and False everywhere else

df.isna()

Unnamed: 0,date,number_of_strikes,center_point_geom
2521176,False,False,False
1889969,False,False,False
2698846,False,False,False
739581,False,False,False
2717489,False,False,False
4541246,False,False,False
343952,False,False,False
7170455,False,False,False
3413949,False,False,False
1533219,False,False,False


In [41]:
# Fill any missing values with a placeholder value (2).
# fillna returns a new DataFrame; the result is not assigned here, so df itself is unchanged.

df.fillna(2)

Unnamed: 0,date,number_of_strikes,center_point_geom
2521176,2017-02-16,2,POINT(-80.9 24.5)
1889969,2017-06-16,7,POINT(-103.8 31.1)
2698846,2017-08-28,2,POINT(-82.8 40.6)
739581,2017-01-21,4,POINT(-94.4 31.5)
2717489,2017-06-05,3,POINT(-80.4 42.2)
4541246,2016-05-02,1,POINT(-102.2 32.2)
343952,2016-03-24,1,POINT(-94.8 32.3)
7170455,2016-07-16,8,POINT(-93 21.6)
3413949,2017-07-01,1,POINT(-101.1 20.7)
1533219,2018-07-08,2,POINT(-93.8 29.6)


In [42]:
# Replace values in the data (template left commented out; substitute the value to find and its replacement)

#df.replace('change', 'replace')

In [44]:
# Drop null values by rows or by columns.
# A cell only renders its last expression, so each result is passed to display()
# (available as a built-in inside Jupyter/IPython kernels) to make both frames visible.

display(df.dropna(axis=0))  # Rows: drop every row that contains a null
display(df.dropna(axis=1))  # Columns: drop every column that contains a null

# To restrict the null check to specific columns, pass subset=[...], e.g.
# df.dropna(axis=0, subset=['number_of_strikes'])

Unnamed: 0,date,number_of_strikes,center_point_geom
2521176,2017-02-16,2,POINT(-80.9 24.5)
1889969,2017-06-16,7,POINT(-103.8 31.1)
2698846,2017-08-28,2,POINT(-82.8 40.6)
739581,2017-01-21,4,POINT(-94.4 31.5)
2717489,2017-06-05,3,POINT(-80.4 42.2)
4541246,2016-05-02,1,POINT(-102.2 32.2)
343952,2016-03-24,1,POINT(-94.8 32.3)
7170455,2016-07-16,8,POINT(-93 21.6)
3413949,2017-07-01,1,POINT(-101.1 20.7)
1533219,2018-07-08,2,POINT(-93.8 29.6)


In [47]:
# Describe the data to check its distribution and see whether there are any outliers
# The standard deviation is a useful figure to look at when checking for outliers

df.describe()

Unnamed: 0,number_of_strikes
count,20.0
mean,2.4
std,2.010499
min,1.0
25%,1.0
50%,2.0
75%,3.0
max,8.0


In [57]:
# Check the data type of each column
# df.info() reports this as well, but df.dtypes is more straightforward

df.dtypes

date                 datetime64[ns]
number_of_strikes             int64
center_point_geom            object
month_name                   object
dtype: object

In [59]:
# Label encoding represents the categories of a column as numbers

# Parse the date column into datetimes so the .dt accessor becomes available
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

# Derive the month name of every date and store it directly as a category dtype
df['month_name'] = parsed_dates.dt.month_name().astype('category')

df.dtypes

date                 datetime64[ns]
number_of_strikes             int64
center_point_geom            object
month_name                 category
dtype: object

In [65]:
# Use the pandas .cat.codes accessor to convert month names to integer codes.
# Each code is the position of the value in the column's category list, which is
# sorted alphabetically here (April=0, August=1, ...), not in calendar order.

df['month_name'].cat.codes

2521176    2
1889969    5
2698846    1
739581     3
2717489    5
4541246    7
343952     6
7170455    4
3413949    4
1533219    4
690960     7
3791689    4
1320516    5
8888760    8
3584529    4
1241604    3
6434797    0
4681496    4
1404707    7
8724062    6
dtype: int8

In [67]:
# One-hot encoding: create one binary indicator column per distinct category

pd.get_dummies(df['month_name'])

Unnamed: 0,April,August,February,January,July,June,March,May,October
2521176,0,0,1,0,0,0,0,0,0
1889969,0,0,0,0,0,1,0,0,0
2698846,0,1,0,0,0,0,0,0,0
739581,0,0,0,1,0,0,0,0,0
2717489,0,0,0,0,0,1,0,0,0
4541246,0,0,0,0,0,0,0,1,0
343952,0,0,0,0,0,0,1,0,0
7170455,0,0,0,0,1,0,0,0,0
3413949,0,0,0,0,1,0,0,0,0
1533219,0,0,0,0,1,0,0,0,0


In [68]:
# scikit-learn's preprocessing module can also encode categories or labels
# as numeric codes

from sklearn.preprocessing import LabelEncoder

data = [1, 2, 2, 6]

# Build the encoder and learn the distinct classes in one step (fit returns the encoder)
encoder = LabelEncoder().fit(data)

# Encode the values, then map the codes back to the original values
transformed = encoder.transform(data)
inverse = encoder.inverse_transform(transformed)

# Report every stage of the round trip, one heading/value pair per line
report = [
    ('Data =', data),
    ('\n Classes: \n', encoder.classes_),
    ('\n Encoded (normalized) classes: \n', transformed),
    ('\n Reverse from encoded classes to original: \n', inverse),
]
for heading, shown in report:
    print(heading, shown)


Data = [1, 2, 2, 6]

 Classes: 
 [1 2 6]

 Encoded (normalized) classes: 
 [0 1 1 2]

 Reverse from encoded classes to original: 
 [1 2 2 6]


In [69]:
# LabelEncoder can also convert categorical (string) labels into numeric codes

from sklearn.preprocessing import LabelEncoder

data = ['paris', 'paris', 'tokyo', 'amsterdam']

# Build the encoder and learn the distinct classes from the data (fit returns the encoder)
encoder = LabelEncoder().fit(data)

# Encode the original labels
transformed = encoder.transform(data)

# A fresh list of codes can be decoded back to class names as well
new_data = [0, 2, 1, 1, 2]
inverse = encoder.inverse_transform(new_data)

# Report every stage, one heading/value pair per line
report = [
    ('Data =', data),
    ('\n Classes: \n', list(encoder.classes_)),
    ('\n Encoded classes: \n', transformed),
    ('\n New data =', new_data),
    ('\n Convert new_data to original classes: \n', list(inverse)),
]
for heading, shown in report:
    print(heading, shown)

Data = ['paris', 'paris', 'tokyo', 'amsterdam']

 Classes: 
 ['amsterdam', 'paris', 'tokyo']

 Encoded classes: 
 [1 1 2 0]

 New data = [0, 2, 1, 1, 2]

 Convert new_data to original classes: 
 ['amsterdam', 'tokyo', 'paris', 'paris', 'tokyo']
