# Feature Engineering Assignments - Solutions

**Today's Exercises:**
- Feature Scaling & Transformation
- Feature Encoding (Categorical Data)
- Feature Cleaning & Imputation

### Importing Libraries & Datasets

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [None]:
# Load the survey extract that every exercise below works on.
data = pd.read_csv(filepath_or_buffer='data/stackoverflow.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

In [None]:
# Column names, non-null counts and dtypes; verbosity follows the pandas display options.
data.info(verbose=None)

==========

## Feature Cleaning & Imputation Exercises

##### Q. Check for any missing values in the 'RawSalary' column and handle them wisely

In [None]:
# First five rows, to see how 'RawSalary' is formatted before cleaning.
data.iloc[:5]

In [None]:
# Strip currency symbols and thousands separators so 'RawSalary' can be cast
# to a number in the next step.
# - regex=False treats every pattern as a literal; as a regular expression
#   '$' is the end-of-string anchor rather than a dollar sign.
# - The pound sign is removed both as 'Â£' (how it appears when UTF-8 text is
#   decoded as Latin-1) and as a plain '£', so either encoding of the CSV works.
data['RawSalary'] = (
    data['RawSalary']
    .str.replace('$', '', regex=False)
    .str.replace('Â£', '', regex=False)
    .str.replace('£', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [None]:
# Display the cleaned (still string-typed) salary column.
data.loc[:, 'RawSalary']

In [None]:
# Convert the cleaned salary strings to 64-bit floats.
data['RawSalary'] = data['RawSalary'].astype('float64')

In [None]:
# Number of missing salaries before imputation.
data['RawSalary'].isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Mean imputation (SimpleImputer's default strategy, spelled out here).
imp = SimpleImputer(strategy='mean')

In [None]:
# fit_transform returns an (n, 1) array for the single-column input;
# flatten it to 1-D before writing it back into the column.
data['RawSalary'] = imp.fit_transform(data[['RawSalary']]).ravel()

In [None]:
# Re-count missing salaries after imputation.
data['RawSalary'].isnull().sum()

##### Q. Check for any outliers in the 'RawSalary' column and handle them wisely

In [None]:
# First five rows before the outlier analysis.
data.head(n=5)

In [None]:
# Smallest salary in the column.
data['RawSalary'].agg('min')

In [None]:
# Largest salary in the column.
data['RawSalary'].agg('max')

In [None]:
# Average salary, for comparison with the min and max.
data['RawSalary'].agg('mean')

In [None]:
# Box plot of the salaries before any outlier filtering.
data['RawSalary'].plot(kind='box')

In [None]:
# Interquartile range of 'RawSalary': third quartile minus first quartile.
iqr = data['RawSalary'].quantile([0.25, 0.75]).diff().iloc[-1]
iqr

In [None]:
# Lower fence: 1.5 IQRs below the first quartile.
lower = data['RawSalary'].quantile(q=0.25) - iqr * 1.5
lower

In [None]:
# Upper fence: 1.5 IQRs above the third quartile.
upper = data['RawSalary'].quantile(q=0.75) + iqr * 1.5
upper

In [None]:
# Keep only rows whose salary lies within [lower, upper] (both bounds inclusive).
# .copy() makes the result an independent frame, so the column assignments in
# the later scaling/encoding cells do not raise SettingWithCopyWarning.
# The original row labels are kept, so the index is no longer contiguous.
data = data[data['RawSalary'].between(lower, upper)].copy()

In [None]:
# Salary distribution after the outlier rows were removed.
data['RawSalary'].plot(kind='hist')

In [None]:
# Box plot after filtering, for comparison with the earlier one.
data['RawSalary'].plot(kind='box')

==========

## Feature Scaling & Normalizing Exercises

##### Q. Let's scale / normalize the data in the 'Age' column

In [None]:
# First five rows before scaling 'Age'.
data.iloc[:5]

In [None]:
# Distribution of 'Age' on its original scale.
data['Age'].plot(kind='hist')

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with its default target range written out explicitly.
sc = MinMaxScaler(feature_range=(0, 1))

In [None]:
# The scaler returns an (n, 1) array for the single-column input;
# flatten it to 1-D before overwriting 'Age'.
data['Age'] = sc.fit_transform(data[['Age']]).ravel()

In [None]:
# Display the scaled 'Age' column.
data.loc[:, 'Age']

In [None]:
# Distribution of 'Age' after scaling.
data['Age'].plot(kind='hist')

In [None]:
# Alternative solution that swaps in 'StandardScaler' for 'MinMaxScaler'.
# The snippet is wrapped in a string literal, so it is NOT executed when this
# cell runs; the cell only evaluates (and echoes) the string.
# NOTE(review): if enabled after the MinMaxScaler cell above, it would re-scale
# the already-scaled 'Age' column -- run one solution or the other, not both.

'''
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
data['Age'] = sc.fit_transform(data[['Age']])
data['Age']
data['Age'].plot.hist()
'''

==========

## Feature Encoding Exercises

##### Q. How about encoding the data in the 'Hobby' / 'Gender' columns?

In [None]:
# First five rows before encoding 'Hobby'.
data.head(n=5)

In [None]:
# Frequency of each 'Hobby' category (missing values excluded).
data['Hobby'].value_counts(dropna=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used for the 'Hobby' labels in the next cell.
# Note: `enc` is rebound to a OneHotEncoder in the 'Country' exercise further down.
enc = LabelEncoder()

In [None]:
# LabelEncoder works on a 1-D array of labels, so pass the Series itself
# (single brackets). A one-column DataFrame triggers a DataConversionWarning
# about a column vector being passed where a 1-D array is expected.
data['Hobby'] = enc.fit_transform(data['Hobby'])

In [None]:
# First five rows, now with 'Hobby' stored as integer codes.
data.iloc[:5]

In [None]:
# Frequency of each encoded 'Hobby' value (missing values excluded).
data['Hobby'].value_counts(dropna=True)

##### Q. And also, we need to encode the 'Country' column

In [None]:
# First five rows before encoding 'Country'.
data.head(n=5)

In [None]:
# Frequency of each 'Country' category (missing values excluded).
data['Country'].value_counts(dropna=True)

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encoder with its default unknown-category policy written out.
# This rebinds `enc`, which held the LabelEncoder in the previous exercise.
enc = OneHotEncoder(handle_unknown='error')

In [None]:
# Learn the 'Country' categories, then encode the same column;
# the encoded matrix is shown as the cell output.
enc.fit(data[['Country']]).transform(data[['Country']])

In [None]:
# Names of the generated one-hot columns, prefixed with 'Country'.
# get_feature_names_out replaces get_feature_names, which has been removed
# from current scikit-learn releases.
enc.get_feature_names_out(['Country'])

In [None]:
# Dense one-hot frame for 'Country'.
# - index=data.index: rows were dropped during outlier removal, so `data` no
#   longer has a contiguous 0..n-1 index; reusing its index keeps the encoded
#   rows aligned with `data` in a column-wise concat instead of producing
#   NaN-padded, mismatched rows.
# - columns=...: label each indicator column with its category name rather
#   than an anonymous integer.
data_enc = pd.DataFrame(
    enc.fit_transform(data[['Country']]).toarray(),
    index=data.index,
    columns=enc.get_feature_names_out(['Country']),
)

In [None]:
# Append the one-hot 'Country' columns to `data` and preview the result.
# pd.concat's `names=` labels index levels, not columns, so the column names
# are set on the encoded frame itself. The encoded values are re-wrapped with
# `data`'s own index so the two frames align row-for-row regardless of how
# `data_enc` was indexed when it was built.
pd.concat(
    [
        data,
        pd.DataFrame(
            data_enc.to_numpy(),
            index=data.index,
            columns=enc.get_feature_names_out(['Country']),
        ),
    ],
    axis=1,
).head()

===========

# THANK YOU!