In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


In [2]:
# Load the housing training CSV into a DataFrame and print its column summary.
# NOTE(review): the absolute path looks like a Google Colab sample_data location;
# confirm or adjust it when running in another environment.
df = pd.read_csv(
    filepath_or_buffer='/content/sample_data/california_housing_train.csv'
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  float64
 4   total_bedrooms      17000 non-null  float64
 5   population          17000 non-null  float64
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
dtypes: float64(9)
memory usage: 1.2 MB


In [3]:
# Preview the first seven rows of the raw frame.
df.head(n=7)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0
5,-114.58,33.63,29.0,1387.0,236.0,671.0,239.0,3.3438,74000.0
6,-114.58,33.61,25.0,2907.0,680.0,1841.0,633.0,2.6768,82400.0


In [4]:
# Split the data into features (X) and target (y), then split both into train and test sets



In [5]:
# Separate the target column (median_house_value) from the feature columns.
# `drop` returns a new frame, so `df` itself keeps all 9 columns.
print(df.shape)
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])
print(X.shape, y.shape)
# X has 17000 rows and 8 columns; df is unchanged because we did not modify it in place.

(17000, 9)
(17000, 8) (17000,)


In [6]:
# Hold out 20% of the rows as a test set.
# A fixed random_state makes the shuffle reproducible, so the same rows end up
# in the train and test splits on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [7]:
# Show the shape of each of the four split pieces on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(13600, 8) (3400, 8) (13600,) (3400,)


In [8]:
# Preview the first five training rows (still unscaled at this point).
X_train.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
12036,-121.41,38.57,16.0,4429.0,1124.0,1538.0,960.0,3.2443
8210,-118.44,34.19,19.0,3487.0,959.0,2278.0,835.0,2.6709
15153,-122.26,37.77,52.0,1565.0,315.0,637.0,297.0,4.7778
8022,-118.42,34.24,36.0,1181.0,220.0,775.0,218.0,4.7228
133,-116.06,34.15,15.0,10377.0,2331.0,4507.0,1807.0,2.2466


## Standard Scaler()


In [9]:
# Standardisation: z = (x - u) / s, where u is the mean and s the standard deviation.
# The result has zero mean and unit variance; each value says how many standard
# deviations that data point lies from the mean.
# Dummy data: the integers 0 through 9.
x = np.arange(10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [10]:
# Standardise the dummy data by hand: subtract the mean, divide by the
# (population) standard deviation.
print('Mean of the dummy data is : ', x.mean())
print('The sd is : ', x.std())
standardize_x = (x - x.mean()) / x.std()
standardize_x

Mean of the dummy data is :  4.5
The sd is :  2.8722813232690143


array([-1.5666989 , -1.21854359, -0.87038828, -0.52223297, -0.17407766,
        0.17407766,  0.52223297,  0.87038828,  1.21854359,  1.5666989 ])

In [11]:
# Most real-world data is not normally distributed.

## MinMax Scaler()
 

*   Values are scaled to lie between 0 and 1.
*   Generally used when the data is not normally distributed.
*   $x' = \dfrac{x - x_{min}}{x_{max} - x_{min}}$





In [12]:
# Min-max normalisation by hand on dummy data: (v - min) / (max - min) maps the
# smallest value to 0 and the largest value to 1.
# The dummy array has its own name so it does not overwrite `y`, the
# median_house_value target defined earlier in the notebook; rebinding `y` here
# would break a later re-run of the train/test split cell.
dummy_values = np.arange(0, 10, 1)
lowest = np.min(dummy_values)
highest = np.max(dummy_values)
print('The minimum value is : ', lowest)
print('The maximum value is : ', highest)
normalized_y = (dummy_values - lowest) / (highest - lowest)
normalized_y

The minimum value is :  0
The maximum value is :  9


array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])

# scikit-learn implementation of Standardization and Normalization

In [14]:
# Learn each feature's min and max from the training split only, then apply
# that same fitted scaling to both the training and the test split.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
# After MinMaxScaler's fit_transform, X_train is a plain NumPy array rather than
# a DataFrame, so the original column labels are no longer attached.
# NOTE(review): the earlier remark that a NumPy array is faster than a DataFrame
# is a general claim; nothing in this notebook measures it.
type(X_train)

numpy.ndarray

In [16]:
# Wrap the scaled training array in a DataFrame just to get summary statistics;
# every column should have min 0 and max 1 because the scaler was fitted on it.
tmp_X_train = pd.DataFrame(data=X_train)
tmp_X_train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,13600.0,13600.0,13600.0,13600.0,13600.0,13600.0,13600.0,13600.0
mean,0.47407,0.327796,0.540745,0.06972,0.083525,0.040052,0.082281,0.234691
std,0.200924,0.227627,0.247055,0.058209,0.065985,0.032975,0.063916,0.13295
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.251251,0.147715,0.333333,0.038408,0.045934,0.021974,0.04621,0.14306
50%,0.581582,0.180659,0.54902,0.055951,0.066729,0.032456,0.066765,0.210801
75%,0.630631,0.550478,0.705882,0.082938,0.100403,0.048159,0.099161,0.295265
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


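The minimum and maximum values of X_train may not occur in X_test. Because X_test is scaled with the min/max learned from X_train, its scaled values need not span exactly 0 to 1, and can even fall slightly outside that range.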

In [17]:
# Same summary for the scaled test array; it was transformed with the
# training-set min/max, so its columns are not guaranteed to span 0 to 1.
tmp_X_test = pd.DataFrame(data=X_test)
tmp_X_test.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,3400.0,3400.0,3400.0,3400.0,3400.0,3400.0,3400.0,3400.0
mean,0.475038,0.328151,0.541857,0.069304,0.08366,0.039708,0.082173,0.228005
std,0.199917,0.225189,0.245824,0.054398,0.063065,0.028736,0.060433,0.125906
min,-0.005005,0.003188,0.0,0.000343,0.000621,0.000336,0.000164,0.0
25%,0.254254,0.148778,0.333333,0.038724,0.045779,0.022254,0.046045,0.140324
50%,0.57958,0.183847,0.54902,0.056505,0.068746,0.033423,0.068081,0.205532
75%,0.62963,0.549416,0.705882,0.083314,0.101179,0.048067,0.099984,0.29073
max,0.971972,0.98406,1.0,0.693818,0.648976,0.307884,0.603519,1.0


Feature scaling should be done only after splitting the data. If the scaler is fitted on the whole dataset, the minimum and maximum used for scaling are computed from the test rows as well, so information from the test set leaks into the training data.

* Use fit_transform on X_train 
* Use transform on X_test