# Preprocessing with NumPy

In [3]:
import numpy as np 

## Checking for Missing Values

In [4]:
lending_co_data_numeric = np.loadtxt('Lending-company-Numeric.csv', delimiter = ',')
lending_co_data_numeric

np.isnan(lending_co_data_numeric).sum()

0

In [5]:
lending_co_data_numeric_nan = np.genfromtxt('Lending-company-Numeric-NAN.csv', delimiter = ';')
lending_co_data_numeric_nan

np.isnan(lending_co_data_numeric_nan).sum()

260

In [6]:
lending_co_data_numeric_nan = np.genfromtxt('Lending-company-Numeric-NAN.csv', delimiter = ';', filling_values=0)
lending_co_data_numeric_nan

np.average(lending_co_data_numeric_nan), np.max(lending_co_data_numeric_nan)

(4560.5249280920425, 64001.0)

In [7]:
temporary_fill = np.nanmax(lending_co_data_numeric_nan).round(2)+1
temporary_fill

64002.0

In [8]:
lending_co_data_numeric_nan = np.genfromtxt('Lending-company-Numeric-NAN.csv', delimiter = ';', filling_values=temporary_fill)
np.average(lending_co_data_numeric_nan)


7219.604506232023

## Substituting Missing Values

In [9]:
temporary_mean = np.nanmean(lending_co_data_numeric_nan, axis=0).round(2)+1
temporary_mean

array([ 4264.25,  2990.43,  3538.7 ,  7469.93,  7305.54, 17754.79])

In [10]:
lending_co_data_numeric_nan[:, 0] = np.where(lending_co_data_numeric_nan[:, 0] == temporary_fill, 
                                                    temporary_mean[0], 
                                                    lending_co_data_numeric_nan[:,0])

In [11]:
np.mean(lending_co_data_numeric_nan[:, 0]).round(2)

2315.9

In [12]:
for i in range(lending_co_data_numeric_nan.shape[1]):
    lending_co_data_numeric_nan[:, i] = np.where(lending_co_data_numeric_nan[:, i] == temporary_fill, 
                                                    temporary_mean[i], 
                                                    lending_co_data_numeric_nan[:,i])

In [13]:
np.mean(lending_co_data_numeric_nan).round(2)

4828.17

In [14]:
# we can use np.where to replace nbegative values as well with zero for example
for i in range(lending_co_data_numeric_nan.shape[1]):
    lending_co_data_numeric_nan[:, i] = np.where(lending_co_data_numeric_nan[:, i] == temporary_fill, 
                                                    temporary_mean[i], 
                                                    lending_co_data_numeric_nan[:,i])

## Reshaping

In [16]:
lending_co_data_numeric = np.loadtxt('Lending-company-Numeric.csv', delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [17]:
lending_co_data_numeric.shape

(1043, 6)

In [25]:
np.reshape(lending_co_data_numeric, (6,1043))

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [20]:
np.transpose(lending_co_data_numeric)

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

In [23]:
# you need to have the size of your reshape same product as size of the original shape
np.reshape(lending_co_data_numeric, (2,3,1043))

array([[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
        [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
        [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

       [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
        [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
        [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]])

## Removing Values

In [26]:
lending_co_data_numeric = np.loadtxt('Lending-company-Numeric.csv', delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [28]:
# removing first element in the array
np.delete(lending_co_data_numeric, 0)

array([   40.,   365.,  3121., ...,  4601.,  4601., 16600.])

In [31]:
# removing column or row in the array
np.delete(lending_co_data_numeric, 0, axis=0) , np.delete(lending_co_data_numeric, (0,2,4), axis=1)

(array([[ 2000.,    40.,   365.,  3061.,  4171., 15041.],
        [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
        [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
        ...,
        [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
        [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
        [ 2000.,    40.,   365.,  4601.,  4601., 16600.]]),
 array([[   40.,  3121., 13621.],
        [   40.,  3061., 15041.],
        [   40.,  2160., 15340.],
        ...,
        [   40.,  4201., 16600.],
        [   40.,  2080., 15600.],
        [   40.,  4601., 16600.]]))

## Sorting Data

In [32]:
lending_co_data_numeric = np.loadtxt('Lending-company-Numeric.csv', delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [38]:
np.sort(lending_co_data_numeric), np.sort(lending_co_data_numeric, axis=0), np.sort(lending_co_data_numeric, axis=None), 

# sorting by columns, sorting by values in rows, flattening and sorting in sametime

(array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
        [   40.,   365.,  2000.,  3061.,  4171., 15041.],
        [   40.,   365.,  1000.,  2160.,  3280., 15340.],
        ...,
        [   40.,   365.,  2000.,  4201.,  5001., 16600.],
        [   40.,   365.,  1000.,  2080.,  3320., 15600.],
        [   40.,   365.,  2000.,  4601.,  4601., 16600.]]),
 array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
        [ 1000.,    35.,   365., -2550., -2100.,   150.],
        [ 1000.,    35.,   365., -2450., -2000.,  1100.],
        ...,
        [ 9000.,   125.,   365., 16751., 18751., 54625.],
        [ 9000.,   165.,   365., 17650., 20001., 54625.],
        [ 9000.,   165.,   365., 19001., 22001., 64001.]]),
 array([-2870., -2870., -2550., ..., 54625., 54625., 64001.]))

In [39]:
# to make numbers real 
np.set_printoptions(suppress=True)

In [43]:
# to arrange items descending order you have to write two negatives

-np.sort(-lending_co_data_numeric)

array([[13621.,  4241.,  3121.,  2000.,   365.,    40.],
       [15041.,  4171.,  3061.,  2000.,   365.,    40.],
       [15340.,  3280.,  2160.,  1000.,   365.,    40.],
       ...,
       [16600.,  5001.,  4201.,  2000.,   365.,    40.],
       [15600.,  3320.,  2080.,  1000.,   365.,    40.],
       [16600.,  4601.,  4601.,  2000.,   365.,    40.]])

In [44]:
# to sort a certain column only in place
lending_co_data_numeric[:, 3].sort()

## Argument Functions

### np.argsort()

In [46]:
lending_co_data_numeric = np.loadtxt('Lending-company-Numeric.csv', delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [48]:
np.argsort(lending_co_data_numeric), np.sort(lending_co_data_numeric)

(array([[1, 2, 0, 3, 4, 5],
        [1, 2, 0, 3, 4, 5],
        [1, 2, 0, 3, 4, 5],
        ...,
        [1, 2, 0, 3, 4, 5],
        [1, 2, 0, 3, 4, 5],
        [1, 2, 0, 3, 4, 5]], dtype=int64),
 array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
        [   40.,   365.,  2000.,  3061.,  4171., 15041.],
        [   40.,   365.,  1000.,  2160.,  3280., 15340.],
        ...,
        [   40.,   365.,  2000.,  4201.,  5001., 16600.],
        [   40.,   365.,  1000.,  2080.,  3320., 15600.],
        [   40.,   365.,  2000.,  4601.,  4601., 16600.]]))

In [50]:
np.argsort(lending_co_data_numeric, axis=0), np.sort(lending_co_data_numeric, axis=0)

(array([[ 537,  443,    0,   32,   32,  482],
        [ 639,  327,  687,  166,  166,  493],
        [ 849,  432,  688,   85,   85,  166],
        ...,
        [  27,  326,  355,  568, 1019,  568],
        [ 277,   27,  357,  718, 1033,  534],
        [ 420,  408, 1042,  912,  912,   27]], dtype=int64),
 array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
        [ 1000.,    35.,   365., -2550., -2100.,   150.],
        [ 1000.,    35.,   365., -2450., -2000.,  1100.],
        ...,
        [ 9000.,   125.,   365., 16751., 18751., 54625.],
        [ 9000.,   165.,   365., 17650., 20001., 54625.],
        [ 9000.,   165.,   365., 19001., 22001., 64001.]]))

In [51]:
np.argsort(lending_co_data_numeric[:, 0])

array([537, 639, 849, ...,  27, 277, 420], dtype=int64)

In [56]:
# sorting an array by sorting a specified column in it 
lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:, 0])]
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   125.,   365., 13001., 17001., 54625.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

### np.argwhere()

In [57]:
lending_co_data_numeric = np.loadtxt('Lending-company-Numeric.csv', delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [60]:
# argwhere is checking all the values other than zeros by default
np.argwhere(lending_co_data_numeric)

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]], dtype=int64)

In [64]:
# argwhere is checking all the values other than zeros by default
np.argwhere(lending_co_data_numeric == False), np.argwhere(lending_co_data_numeric %2 ==0)

(array([[116,   4],
        [430,   3]], dtype=int64),
 array([[   0,    0],
        [   0,    1],
        [   1,    0],
        ...,
        [1042,    0],
        [1042,    1],
        [1042,    5]], dtype=int64))

In [66]:
np.isnan(lending_co_data_numeric).sum()

0

In [71]:
lending_co_data_numeric_nan = np.genfromtxt('Lending-company-Numeric-NAN.csv', delimiter = ';')
lending_co_data_numeric_nan

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [74]:
np.argwhere(np.isnan(lending_co_data_numeric_nan))

array([[  11,    3],
       [  15,    3],
       [  27,    3],
       [  58,    3],
       [  60,    4],
       [  85,    4],
       [ 117,    5],
       [ 152,    1],
       [ 152,    2],
       [ 152,    4],
       [ 172,    1],
       [ 175,    1],
       [ 175,    2],
       [ 176,    3],
       [ 177,    4],
       [ 178,    5],
       [ 211,    3],
       [ 229,    0],
       [ 230,    1],
       [ 237,    1],
       [ 247,    3],
       [ 251,    5],
       [ 252,    4],
       [ 258,    1],
       [ 260,    3],
       [ 262,    4],
       [ 271,    5],
       [ 272,    4],
       [ 284,    2],
       [ 284,    3],
       [ 297,    1],
       [ 297,    2],
       [ 300,    3],
       [ 315,    3],
       [ 315,    5],
       [ 327,    4],
       [ 336,    4],
       [ 343,    0],
       [ 344,    2],
       [ 346,    2],
       [ 363,    3],
       [ 375,    3],
       [ 377,    2],
       [ 398,    5],
       [ 416,    4],
       [ 428,    0],
       [ 432,    1],
       [ 433,

In [76]:
lending_co_data_numeric_nan[11,3]

nan

In [78]:
# to replace every nan with zero
for array_index in np.argwhere(np.isnan(lending_co_data_numeric_nan)):
    lending_co_data_numeric_nan[array_index[0], array_index[1]] = 0

## Shuffling Data

## Casting

## Stripping Data

## Stacking

## Concatenate

## Unique 