In [23]:
# imports
import pandas as pd
import numpy as np
from numpy import NaN
from sklearn import preprocessing

In [37]:
# load data
dataFrame = pd.read_csv('/content/sample_data/weather.csv')
dataFrame.head()


Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain


# Major Tasks in Data Pre-processing:
1. **Data cleaning:** Fill in missing values, smooth noisy data, identify or remove outliers, and
resolve inconsistencies.

1. **Data integration:** Integration of multiple databases, data cubes, or files.

1. **Data transformation:** Normalization and aggregation.

1. **Data reduction:** Obtains reduced representation in volume but produces the
similar analytical results.

1. **Data discretization:** Part of data reduction but with particular importance, especially for
numerical data.


## 1. Data Cleaning
### Ways to Cleanse Missing Data in Python
To perform a Python data cleansing, you can:

1. drop the missing values
1. replace them
1. replace each NaN with a scalar value
1. fill forward or backward.

In [5]:
# Drop the missing values
print(dataFrame.dropna())

         day  temperature  windspeed   event
0   1/1/2017         32.0        6.0    Rain
7  1/10/2017         34.0        8.0  Cloudy
8  1/11/2017         40.0       12.0   Sunny


### Output:
Here we can see that all the NaN value data rows are droped.


In [6]:
# replace NaN
print(dataFrame.replace({NaN:0}))

         day  temperature  windspeed   event
0   1/1/2017         32.0        6.0    Rain
1   1/4/2017          0.0        9.0   Sunny
2   1/5/2017         28.0        0.0    Snow
3   1/6/2017          0.0        7.0       0
4   1/7/2017         32.0        0.0    Rain
5   1/8/2017          0.0        0.0   Sunny
6   1/9/2017          0.0        0.0       0
7  1/10/2017         34.0        8.0  Cloudy
8  1/11/2017         40.0       12.0   Sunny


In [7]:
# replace with a scalar value
print(dataFrame.fillna(8))

         day  temperature  windspeed   event
0   1/1/2017         32.0        6.0    Rain
1   1/4/2017          8.0        9.0   Sunny
2   1/5/2017         28.0        8.0    Snow
3   1/6/2017          8.0        7.0       8
4   1/7/2017         32.0        8.0    Rain
5   1/8/2017          8.0        8.0   Sunny
6   1/9/2017          8.0        8.0       8
7  1/10/2017         34.0        8.0  Cloudy
8  1/11/2017         40.0       12.0   Sunny


In [8]:
# filling forward
print(dataFrame.fillna(method='pad'))


         day  temperature  windspeed   event
0   1/1/2017         32.0        6.0    Rain
1   1/4/2017         32.0        9.0   Sunny
2   1/5/2017         28.0        9.0    Snow
3   1/6/2017         28.0        7.0    Snow
4   1/7/2017         32.0        7.0    Rain
5   1/8/2017         32.0        7.0   Sunny
6   1/9/2017         32.0        7.0   Sunny
7  1/10/2017         34.0        8.0  Cloudy
8  1/11/2017         40.0       12.0   Sunny


In [10]:
# filling backward
print(dataFrame.fillna(method='bfill'))


         day  temperature  windspeed   event
0   1/1/2017         32.0        6.0    Rain
1   1/4/2017         28.0        9.0   Sunny
2   1/5/2017         28.0        7.0    Snow
3   1/6/2017         32.0        7.0    Rain
4   1/7/2017         32.0        8.0    Rain
5   1/8/2017         34.0        8.0   Sunny
6   1/9/2017         34.0        8.0  Cloudy
7  1/10/2017         34.0        8.0  Cloudy
8  1/11/2017         40.0       12.0   Sunny


# 2) Data Intigration
we'll merge the details of students from two datasets, namely student.csv and marks.csv.
The student dataset contains columns such as Age, Gender, Grade, and Employed.

In [30]:
dataFrame1 =  pd.read_csv('/content/sample_data/student.csv')
dataFrame2 =  pd.read_csv('/content/sample_data/mark.csv')
print(dataFrame2)

     Student_id  Mark     City
0             1    95  Chennai
1             2    70    Delhi
2             3    98   Mumbai
3             4    75     Pune
4             5    89    Kochi
..          ...   ...      ...
227         228    99     Pune
228         229    70  Chennai
229         230    55    Delhi
230         231    97   Mumbai
231         232    59     Pune

[232 rows x 3 columns]


In [31]:
newDataFrame = pd.merge(dataFrame1, dataFrame2, on='Student_id')

In [32]:
print(newDataFrame)

     Student_id  Age  Gender      Grade Employed  Mark     City
0             1   19    Male  1st Class      yes    95  Chennai
1             2   20  Female  2nd Class       no    70    Delhi
2             3   18    Male  1st Class       no    98   Mumbai
3             4   21  Female  2nd Class       no    75     Pune
4             5   19    Male  1st Class       no    89    Kochi
..          ...  ...     ...        ...      ...   ...      ...
227         228   21  Female  1st Class       no    99     Pune
228         229   20    Male  2nd Class       no    70  Chennai
229         230   20    Male  3rd Class      yes    55    Delhi
230         231   19  Female  1st Class      yes    97   Mumbai
231         232   20    Male  3rd Class      yes    59     Pune

[232 rows x 7 columns]


In [33]:
# Normalization
a = np.random.random((1,4))
a *= 20
print('Data',a)
normal = preprocessing.normalize(a)
print('normalized', normal) 

Data [[ 2.47807062 19.2115389   2.67422577 15.23536842]]
normalized [[0.09996675 0.77500417 0.10787976 0.61460324]]


In [35]:
# Descretization

print(dataFrame1)


     Student_id  Age  Gender      Grade Employed
0             1   19    Male  1st Class      yes
1             2   20  Female  2nd Class       no
2             3   18    Male  1st Class       no
3             4   21  Female  2nd Class       no
4             5   19    Male  1st Class       no
..          ...  ...     ...        ...      ...
227         228   21  Female  1st Class       no
228         229   20    Male  2nd Class       no
229         230   20    Male  3rd Class      yes
230         231   19  Female  1st Class      yes
231         232   20    Male  3rd Class      yes

[232 rows x 5 columns]


In [36]:
dataFrame1['bucket']=pd.cut(dataFrame1['Age'],3,labels=['young','old','middleage'])
dataFrame1.head(10)


Unnamed: 0,Student_id,Age,Gender,Grade,Employed,bucket
0,1,19,Male,1st Class,yes,young
1,2,20,Female,2nd Class,no,old
2,3,18,Male,1st Class,no,young
3,4,21,Female,2nd Class,no,middleage
4,5,19,Male,1st Class,no,young
5,6,20,Male,2nd Class,yes,old
6,7,19,Female,3rd Class,yes,young
7,8,21,Male,3rd Class,yes,middleage
8,9,22,Female,3rd Class,yes,middleage
9,10,21,Male,1st Class,no,middleage
