# Feature Engineering Assignments - Solutions

**Today's Exercises:**
- Feature Scaling & Transformation
- Feature Encoding (Categorical Data)
- Feature Cleaning & Imputation

### Importing Libraries & Datasets

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [3]:
# Load the survey data; the relative path is resolved against the current working directory.
data = pd.read_csv('stackoverflow.csv')

In [4]:
# Preview the first rows to get a feel for the columns.
data.head()

Unnamed: 0,SurveyDate,FormalEducation,ConvertedSalary,Hobby,Country,StackOverflowJobsRecommend,VersionControl,Age,Years Experience,Gender,RawSalary
0,2/28/18 20:20,Bachelor's degree (BA. BS. B.Eng.. etc.),,Yes,South Africa,,Git,21,13,Male,
1,6/28/18 13:26,Bachelor's degree (BA. BS. B.Eng.. etc.),70841.0,Yes,Sweeden,7.0,Git;Subversion,38,9,Male,70841.00
2,6/6/18 3:37,Bachelor's degree (BA. BS. B.Eng.. etc.),,No,Sweeden,8.0,Git,45,11,,
3,5/9/18 1:06,Some college/university study without earning ...,21426.0,Yes,Sweeden,,Zip file back-ups,46,12,Male,21426.00
4,4/12/18 22:41,Bachelor's degree (BA. BS. B.Eng.. etc.),41671.0,Yes,UK,8.0,Git,39,7,Male,"£41,671.00"


In [5]:
# Column dtypes and non-null counts; use this to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   SurveyDate                  999 non-null    object 
 1   FormalEducation             999 non-null    object 
 2   ConvertedSalary             665 non-null    float64
 3   Hobby                       999 non-null    object 
 4   Country                     999 non-null    object 
 5   StackOverflowJobsRecommend  487 non-null    float64
 6   VersionControl              999 non-null    object 
 7   Age                         999 non-null    int64  
 8   Years Experience            999 non-null    int64  
 9   Gender                      693 non-null    object 
 10  RawSalary                   665 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 86.0+ KB


==========

## Feature Cleaning & Imputation Exercises

##### Q. Check for any missing data in the 'RawSalary' column and handle it wisely

In [6]:
# Look at the rows again: RawSalary is text and can carry currency symbols and thousands separators.
data.head()

Unnamed: 0,SurveyDate,FormalEducation,ConvertedSalary,Hobby,Country,StackOverflowJobsRecommend,VersionControl,Age,Years Experience,Gender,RawSalary
0,2/28/18 20:20,Bachelor's degree (BA. BS. B.Eng.. etc.),,Yes,South Africa,,Git,21,13,Male,
1,6/28/18 13:26,Bachelor's degree (BA. BS. B.Eng.. etc.),70841.0,Yes,Sweeden,7.0,Git;Subversion,38,9,Male,70841.00
2,6/6/18 3:37,Bachelor's degree (BA. BS. B.Eng.. etc.),,No,Sweeden,8.0,Git,45,11,,
3,5/9/18 1:06,Some college/university study without earning ...,21426.0,Yes,Sweeden,,Zip file back-ups,46,12,Male,21426.00
4,4/12/18 22:41,Bachelor's degree (BA. BS. B.Eng.. etc.),41671.0,Yes,UK,8.0,Git,39,7,Male,"£41,671.00"


In [7]:
# Strip currency symbols and thousands separators so the strings can be parsed as numbers.
# An explicit character class with regex=True is used on purpose: a bare '$' is the
# end-of-string anchor when the pattern is treated as a regex (the default before
# pandas 2.0), so it would match nothing and silently leave dollar signs in place.
# Inside [...] the '$' is a literal character. Missing values stay NaN.
data['RawSalary'] = data['RawSalary'].str.replace(r'[$£,]', '', regex=True)

In [8]:
# Inspect the cleaned column; the values are still strings (object dtype) at this point.
data['RawSalary']

0             NaN
1        70841.00
2             NaN
3        21426.00
4        41671.00
          ...    
994           NaN
995      58746.00
996      55000.00
997           NaN
998    1000000.00
Name: RawSalary, Length: 999, dtype: object

In [None]:
# Convert the cleaned salary strings to 64-bit floats so numeric operations work.
data['RawSalary'] = data['RawSalary'].astype('float64')

In [9]:
# Number of missing salaries before imputation.
data['RawSalary'].isnull().sum()

np.int64(334)

In [10]:
from sklearn.impute import SimpleImputer

In [11]:
# Mean imputation, spelled out explicitly (it is SimpleImputer's default strategy).
# NOTE(review): the mean is pulled up by extreme salaries; consider strategy='median'.
imp = SimpleImputer(strategy='mean')

In [12]:
# Fit the imputer on the single-column frame and write the filled values back.
# fit_transform returns a 2-D (n, 1) array, so flatten it before assigning to the column.
imputed = imp.fit_transform(data[['RawSalary']])
data['RawSalary'] = imputed.ravel()

In [13]:
# Sanity check: no missing salaries should remain after imputation.
data['RawSalary'].isnull().sum()

np.int64(0)

In [14]:
# Inspect the imputed column: rows that were missing now hold the fill value.
data['RawSalary']

0        92565.169925
1        70841.000000
2        92565.169925
3        21426.000000
4        41671.000000
            ...      
994      92565.169925
995      58746.000000
996      55000.000000
997      92565.169925
998    1000000.000000
Name: RawSalary, Length: 999, dtype: float64

##### Q. Check for any outliers in the 'RawSalary' column and handle them wisely

In [15]:
# Preview the frame after imputation.
data.head()

Unnamed: 0,SurveyDate,FormalEducation,ConvertedSalary,Hobby,Country,StackOverflowJobsRecommend,VersionControl,Age,Years Experience,Gender,RawSalary
0,2/28/18 20:20,Bachelor's degree (BA. BS. B.Eng.. etc.),,Yes,South Africa,,Git,21,13,Male,92565.169925
1,6/28/18 13:26,Bachelor's degree (BA. BS. B.Eng.. etc.),70841.0,Yes,Sweeden,7.0,Git;Subversion,38,9,Male,70841.0
2,6/6/18 3:37,Bachelor's degree (BA. BS. B.Eng.. etc.),,No,Sweeden,8.0,Git,45,11,,92565.169925
3,5/9/18 1:06,Some college/university study without earning ...,21426.0,Yes,Sweeden,,Zip file back-ups,46,12,Male,21426.0
4,4/12/18 22:41,Bachelor's degree (BA. BS. B.Eng.. etc.),41671.0,Yes,UK,8.0,Git,39,7,Male,41671.0


In [16]:
# Smallest salary in the column.
data['RawSalary'].min()

0.0

In [17]:
# Largest salary in the column; compare it with the mean to judge how extreme the tail is.
data['RawSalary'].max()

2000000.0

In [19]:
# Mean salary, computed on the already-imputed column.
data['RawSalary'].mean()

np.float64(92565.16992481204)

In [20]:
# Box plot to visualise the spread and any extreme salaries before filtering.
data['RawSalary'].plot.box()


<Axes: >

In [21]:
# Interquartile range of RawSalary: third quartile minus first quartile.
# NOTE(review): this is computed after imputation, so the filled-in rows influence the quartiles.
q1 = data['RawSalary'].quantile(0.25)
q3 = data['RawSalary'].quantile(0.75)
iqr = q3 - q1
iqr

np.float64(50923.66992481203)

In [22]:
# Lower outlier fence: 1.5 IQRs below the first quartile.
first_quartile = data['RawSalary'].quantile(0.25)
lower = first_quartile - 1.5 * iqr
lower

np.float64(-34744.004887218034)

In [23]:
# Upper outlier fence: 1.5 IQRs above the third quartile.
third_quartile = data['RawSalary'].quantile(0.75)
upper = third_quartile + 1.5 * iqr
upper

np.float64(168950.67481203005)

In [24]:
# Keep only rows whose salary lies inside the [lower, upper] fences (both ends inclusive).
# .copy() makes the result an independent frame, so the column assignments in later
# cells write to it directly instead of to a slice of the unfiltered frame
# (this avoids pandas' SettingWithCopyWarning).
# Row labels are preserved, so the index may have gaps after this step.
data = data[data['RawSalary'].between(lower, upper)].copy()

In [None]:
# Histogram of the salaries that remain after filtering.
data['RawSalary'].plot.hist()

In [None]:
# Box plot again, to compare with the one drawn before filtering.
data['RawSalary'].plot.box()

==========

## Feature Scaling & Normalizing Exercises

##### Q. Let's scale / normalize the data in the 'Age' column

In [None]:
# Preview the frame before scaling.
data.head()

In [None]:
# Distribution of Age before scaling.
data['Age'].plot.hist()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with its default target range written out: values are mapped onto [0, 1].
sc = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Fit the scaler on Age and overwrite the column with the scaled values.
# fit_transform returns a 2-D (n, 1) array, so flatten it before assigning to the column.
age_scaled = sc.fit_transform(data[['Age']])
data['Age'] = age_scaled.ravel()

In [None]:
# Inspect the scaled Age values.
data['Age']

In [None]:
# Distribution of Age after scaling.
data['Age'].plot.hist()

In [None]:
# Alternative solution using 'StandardScaler'.
# The code below sits inside a string literal, so it is NOT executed when the cell runs.
# NOTE(review): if enabled, run it instead of the MinMaxScaler cells, not after them,
# otherwise it standardizes values that have already been rescaled.

'''
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
data['Age'] = sc.fit_transform(data[['Age']])
data['Age']
data['Age'].plot.hist()
'''

==========

## Feature Encoding Exercises

##### Q. How about encoding the data in the 'Hobby' / 'Gender' columns?

In [None]:
# Preview the frame before encoding.
data.head()

In [None]:
# Distinct Hobby categories and how often each occurs.
data['Hobby'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder that maps each distinct label to an integer code.
enc = LabelEncoder()

In [None]:
# Replace the Hobby labels with integer codes.
# LabelEncoder works on a 1-D array of labels, so pass the Series itself rather than a
# one-column DataFrame (the 2-D form triggers a DataConversionWarning).
data['Hobby'] = enc.fit_transform(data['Hobby'])

In [None]:
# Hobby now holds integer codes instead of text labels.
data.head()

In [None]:
# Frequencies of the encoded Hobby values; the counts should match those seen before encoding.
data['Hobby'].value_counts()

##### Q. And also, we need to encode the 'Country' column

In [None]:
# Preview the frame before encoding Country.
data.head()

In [None]:
# Distinct countries and how often each occurs.
data['Country'].value_counts()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encoder for Country. Note: this rebinds 'enc', which held the LabelEncoder before.
enc = OneHotEncoder()

In [None]:
# Fit the encoder on Country and display the encoded result.
enc.fit_transform(data[['Country']])

In [None]:
# Names of the generated one-hot columns, prefixed with 'Country'.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
enc.get_feature_names_out(['Country'])

In [None]:
# Build a DataFrame of the one-hot columns that can be joined back onto `data`.
#  - .toarray() turns the encoder's sparse result into a dense array.
#  - columns are labelled with the encoder's feature names instead of 0..k-1.
#  - data's own index is reused: it has gaps after the outlier filter, so a fresh
#    0..n-1 index would pair rows with the wrong records in a column-wise concat.
country_onehot = enc.fit_transform(data[['Country']]).toarray()
data_enc = pd.DataFrame(
    country_onehot,
    index=data.index,
    columns=enc.get_feature_names_out(['Country']),
)

In [None]:
# Join the one-hot Country columns onto the frame and preview the result.
#  - concat's `names` argument labels index levels, not columns, and
#    get_feature_names() no longer exists in scikit-learn >= 1.2; the columns are
#    labelled explicitly with get_feature_names_out() instead.
#  - both frames are moved onto the same positional index so rows pair up one-to-one
#    even though data's index has gaps after the outlier filter.
country_cols = data_enc.set_axis(enc.get_feature_names_out(['Country']), axis=1)
pd.concat(
    [data.reset_index(drop=True), country_cols.reset_index(drop=True)],
    axis=1,
).head()

===========

# THANK YOU!