# Data Preprocessing Steps:


## Import the libraries

In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

## Import and examine the dataset
- Do we have any missing values?
- Show the dataframe sorted by State
- Show the means of numeric features by State
- Compute the min, max and mean of Salary and Age by state

Use as many cells as needed to show your work.

[Dataset.csv](https://drive.google.com/file/d/1Sq7OQ-jMWFlF6Zamz5_RmIvjUt9d1THb/view?usp=sharing)

In [83]:
# Load the dataset from the local data directory into a DataFrame.
dataset_path = "data/Dataset.csv"
df = pd.read_csv(dataset_path)


In [84]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,State,Salary,Age,Paid_back
0,New York,80000.0,35.0,Yes
1,New Jersey,60000.0,47.0,Yes
2,Pennsylvania,50000.0,61.0,No
3,New Jersey,,29.0,No
4,New Jersey,90000.0,70.0,No


In [85]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Salary,Age
count,9.0,9.0
mean,63888.888889,44.555556
std,18617.494759,13.974183
min,40000.0,29.0
25%,50000.0,35.0
50%,60000.0,42.0
75%,80000.0,50.0
max,90000.0,70.0


In [86]:
# Order the rows alphabetically by State so that rows of the same state sit together.
df_sorted = df.sort_values('State')
df_sorted

Unnamed: 0,State,Salary,Age,Paid_back
1,New Jersey,60000.0,47.0,Yes
3,New Jersey,,29.0,No
4,New Jersey,90000.0,70.0,No
9,New Jersey,72000.0,42.0,Yes
0,New York,80000.0,35.0,Yes
5,New York,40000.0,37.0,Yes
7,New York,85000.0,,No
2,Pennsylvania,50000.0,61.0,No
6,Pennsylvania,56000.0,30.0,Yes
8,Pennsylvania,42000.0,50.0,Yes


In [87]:
# Per-state summary of the numeric features: mean, max and min of Salary and Age.
numeric_by_state = df_sorted.groupby('State')[['Salary', 'Age']]
df_group_by = numeric_by_state.agg(['mean', 'max', 'min'])
df_group_by

Unnamed: 0_level_0,Salary,Salary,Salary,Age,Age,Age
Unnamed: 0_level_1,mean,max,min,mean,max,min
State,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
New Jersey,74000.0,90000.0,60000.0,47.0,70.0,29.0
New York,68333.333333,85000.0,40000.0,36.0,37.0,35.0
Pennsylvania,49333.333333,56000.0,42000.0,47.0,61.0,30.0


In [88]:
# Show the full table (all 10 rows), including the two rows that have a missing value.
df

Unnamed: 0,State,Salary,Age,Paid_back
0,New York,80000.0,35.0,Yes
1,New Jersey,60000.0,47.0,Yes
2,Pennsylvania,50000.0,61.0,No
3,New Jersey,,29.0,No
4,New Jersey,90000.0,70.0,No
5,New York,40000.0,37.0,Yes
6,Pennsylvania,56000.0,30.0,Yes
7,New York,85000.0,,No
8,Pennsylvania,42000.0,50.0,Yes
9,New Jersey,72000.0,42.0,Yes


In [89]:
# Summary statistics of the numeric columns (count, mean, std, min, quartiles, max).
# count is 9 for both Salary and Age because each column has one missing value.
df.describe()

Unnamed: 0,Salary,Age
count,9.0,9.0
mean,63888.888889,44.555556
std,18617.494759,13.974183
min,40000.0,29.0
25%,50000.0,35.0
50%,60000.0,42.0
75%,80000.0,50.0
max,90000.0,70.0


In [90]:
# Column-wise means of the numeric columns, skipping NaN entries (the defaults
# axis=0 and skipna=True apply).
# numeric_only=True restricts the computation to Salary and Age; without it the
# text columns (State, Paid_back) would be included and the call would error.
mean_value = df.mean(numeric_only=True)
print(mean_value)

Salary    63888.888889
Age          44.555556
dtype: float64


## Take Care of missing data
1.  Delete rows with missing data, or
2.  Replace missing data with mean values.

In [91]:
# Names of the columns that contain missing values (NaN), kept as a Series
# so it can be used to select those columns from df:
#   1. Salary - missing for one New Jersey row
#   2. Age    - missing for one New York row
missing_values = pd.Series(data=['Salary', 'Age'])


In [92]:
# Count the NaN entries per column, limited to the columns listed in missing_values.
nan_mask = df[missing_values].isna()
count_NaN_values = nan_mask.sum()
print(count_NaN_values)


Salary    1
Age       1
dtype: int64


In [93]:
# Replace each NaN in Salary and Age with the mean of its own column
# (fillna matches the entries of mean_value to the columns by label).
imputed_columns = df[missing_values].fillna(value=mean_value)
df[missing_values] = imputed_columns
print(df)

          State        Salary        Age Paid_back
0      New York  80000.000000  35.000000       Yes
1    New Jersey  60000.000000  47.000000       Yes
2  Pennsylvania  50000.000000  61.000000        No
3    New Jersey  63888.888889  29.000000        No
4    New Jersey  90000.000000  70.000000        No
5      New York  40000.000000  37.000000       Yes
6  Pennsylvania  56000.000000  30.000000       Yes
7      New York  85000.000000  44.555556        No
8  Pennsylvania  42000.000000  50.000000       Yes
9    New Jersey  72000.000000  42.000000       Yes


## Encode Categorical data

#### Encode and display the categorical Independent Variable

In [94]:
# Column dtypes and non-null counts: no missing values remain after imputation,
# and State and Paid_back are still text (object) columns that need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   State      10 non-null     object 
 1   Salary     10 non-null     float64
 2   Age        10 non-null     float64
 3   Paid_back  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [95]:
# Preview the first five rows after imputation.
df.head(n=5)

Unnamed: 0,State,Salary,Age,Paid_back
0,New York,80000.0,35.0,Yes
1,New Jersey,60000.0,47.0,Yes
2,Pennsylvania,50000.0,61.0,No
3,New Jersey,63888.888889,29.0,No
4,New Jersey,90000.0,70.0,No


In [96]:
# Numeric feature columns, reused below for the feature matrix and for scaling.
numeric_columns = ['Salary', 'Age']

In [97]:
# Distinct values of the target column, in order of first appearance.
pd.unique(df['Paid_back'])

array(['Yes', 'No'], dtype=object)

In [98]:
# One-hot encode the categorical State column with get_dummies.
# drop_first=True drops the dummy column of the first category level
# (New Jersey here), leaving State_New York and State_Pennsylvania; a row
# where both are False is therefore a New Jersey row.
df = pd.get_dummies(data=df, columns=["State"], drop_first=True)



In [99]:
# Show the frame after one-hot encoding: State is replaced by the
# State_New York and State_Pennsylvania indicator columns.
df

Unnamed: 0,Salary,Age,Paid_back,State_New York,State_Pennsylvania
0,80000.0,35.0,Yes,True,False
1,60000.0,47.0,Yes,False,False
2,50000.0,61.0,No,False,True
3,63888.888889,29.0,No,False,False
4,90000.0,70.0,No,False,False
5,40000.0,37.0,Yes,True,False
6,56000.0,30.0,Yes,False,True
7,85000.0,44.555556,No,True,False
8,42000.0,50.0,Yes,False,True
9,72000.0,42.0,Yes,False,False


#### Encode and display the Dependent Variable

In [101]:
# Distinct values of the State_New York indicator column.
pd.unique(df["State_New York"])

array([ True, False])

In [102]:
# Encode the dependent variable as 0/1: label is 1 where Paid_back is exactly
# "Yes" (ignoring surrounding whitespace) and 0 otherwise.
# An exact comparison replaces str.contains("Yes"), which is a substring/regex
# match and would also flag any value that merely contains "Yes". On object
# columns it can also yield NaN for missing entries, which np.where would then
# treat as true and label 1; with .eq() a missing entry becomes 0.
df["label"] = df["Paid_back"].str.strip().eq("Yes").astype(int)
df

Unnamed: 0,Salary,Age,Paid_back,State_New York,State_Pennsylvania,label
0,80000.0,35.0,Yes,True,False,1
1,60000.0,47.0,Yes,False,False,1
2,50000.0,61.0,No,False,True,0
3,63888.888889,29.0,No,False,False,0
4,90000.0,70.0,No,False,False,0
5,40000.0,37.0,Yes,True,False,1
6,56000.0,30.0,Yes,False,True,1
7,85000.0,44.555556,No,True,False,0
8,42000.0,50.0,Yes,False,True,1
9,72000.0,42.0,Yes,False,False,1


## Split the dataset into training and testing sets

In [103]:
# Re-check dtypes before splitting: the State dummies are bool, label is int64,
# and the original Paid_back text column is still present alongside label.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Salary              10 non-null     float64
 1   Age                 10 non-null     float64
 2   Paid_back           10 non-null     object 
 3   State_New York      10 non-null     bool   
 4   State_Pennsylvania  10 non-null     bool   
 5   label               10 non-null     int64  
dtypes: bool(2), float64(2), int64(1), object(1)
memory usage: 468.0+ bytes


In [113]:
# Feature matrix: the numeric columns as a 2-D NumPy array.
# NOTE(review): the State_* dummy columns encoded above are not part of X;
# confirm that leaving them out is intended.
X = df[numeric_columns].to_numpy()

# Target vector: the encoded label as a 1-D NumPy array.
y = df['label'].to_numpy()

In [114]:
# Dimensions of the feature matrix as (rows, columns).
np.shape(X)

(10, 2)

In [115]:
# Dimensions of the target vector (one-dimensional).
np.shape(y)

(10,)

In [116]:
# Confirm that extracting the numeric columns yields a NumPy array.
type(df[numeric_columns].to_numpy())

numpy.ndarray

In [117]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
trainX, testX, trainY, testY = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shapes of the train and test parts for features and target.
for tag, train_part, test_part in (
    ('Split X: ', trainX, testX),
    ('Split Y: ', trainY, testY),
):
    print(tag, train_part.shape, test_part.shape)

Split X:  (8, 2) (2, 2)
Split Y:  (8,) (2,)


## Feature Scaling

#### Standardized Scaling
* Scale the numerical features using standardized scaling
* Show your work

In [118]:
# Standardize the numeric columns with preprocessing.scale, which returns an array.
# NOTE(review): the scaling statistics are computed over all rows of df,
# including rows held out as test data by the split above. For modeling,
# fit the scaler on the training rows only.
original = df[numeric_columns]
standardizedArray = preprocessing.scale(original)

# Wrap the array in a DataFrame named 'standardized' with the original column names.
standardized = pd.DataFrame(data=standardizedArray, columns=numeric_columns)

In [119]:
# Display the standardized Salary and Age columns.
standardized

Unnamed: 0,Salary,Age
0,0.967518,-0.764512
1,-0.233539,0.195573
2,-0.834068,1.315673
3,0.0,-1.244555
4,1.568047,2.035737
5,-1.434596,-0.604498
6,-0.47375,-1.164548
7,1.267783,0.0
8,-1.314491,0.435594
9,0.487095,-0.204463


#### Normalized Scaling
* Scale the numerical features using normalized scaling
* Show your work

In [120]:
# Normalize dataframe and return as an array
normalizedArray = preprocessing.MinMaxScaler().fit_transform(df[numeric_columns])

# Convert normalized array to dataframe 'normalized'
normalized = pd.DataFrame(normalizedArray, columns=numeric_columns)
normalized

Unnamed: 0,Salary,Age
0,0.8,0.146341
1,0.4,0.439024
2,0.2,0.780488
3,0.477778,0.0
4,1.0,1.0
5,0.0,0.195122
6,0.32,0.02439
7,0.9,0.379404
8,0.04,0.512195
9,0.64,0.317073
