### Preprocessing with Numpy 

In [3]:
import numpy as np

#### Checking the missing values 

In [3]:
lending_co_data_numeric=np.loadtxt("Lending-Company-Numeric.csv",delimiter=',')

In [5]:
np.isnan(lending_co_data_numeric)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [6]:
np.isnan(lending_co_data_numeric).sum() # 0 confirm our dataset has no missing value

0

In [11]:
lending_co_data_numeric_NAN=np.genfromtxt("Lending-Company-Numeric-NAN.csv",delimiter=';')

In [12]:
np.isnan(lending_co_data_numeric_NAN)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [ True, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [15]:
np.isnan(lending_co_data_numeric_NAN).sum() ## 260 missing value present in dataset

260

In [16]:
lending_co_data_numeric_NAN=np.genfromtxt("Lending-Company-Numeric-NAN.csv",delimiter=';', filling_values=0)

In [17]:
np.isnan(lending_co_data_numeric_NAN).sum()

0

In [1]:
# 0 cpuld be used to numerically represent something meaningful in our dataset
#Try to fill missing values that aren't part of our dataset
# by doing so we can later substitute all such values with sometrhing more appropriate
# one approach to use a number greater than the highest value of the dataset : we can use nanmax()
# For this we need to reload the dataset without 0

In [4]:
lending_co_data_numeric_NAN=np.genfromtxt("Lending-Company-Numeric-NAN.csv",delimiter=';')

In [6]:
temporary_fill= np.nanmax(lending_co_data_numeric_NAN).round(2)+1

In [8]:
lending_co_data_numeric_NAN=np.genfromtxt("Lending-Company-Numeric-NAN.csv",delimiter=';', filling_values=temporary_fill)



In [9]:
np.isnan(lending_co_data_numeric_NAN)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

## Substitute Filler values

In [10]:
lending_co_data_numeric_NAN=np.genfromtxt("Lending-Company-Numeric-NAN.csv",delimiter=';')
lending_co_data_numeric_NAN


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [11]:
temporary_mean =np.nanmean(lending_co_data_numeric_NAN, axis=0).round(2)# We want to keep track of the diffrent means beacuse 
# they  can cahnge after filling out the missing elements # mean for each row

In [20]:
temporary_mean [0]

2250.25

In [16]:
temporary_fill= np.nanmax(lending_co_data_numeric_NAN).round(2)+1
lending_co_data_numeric_NAN=np.genfromtxt("Lending-Company-Numeric-NAN.csv",delimiter=';', filling_values=temporary_fill)


In [18]:
temporary_fill
    

64002.0

In [19]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2)

4263.25

In [21]:
temporary_mean [0]

2250.25

In [27]:
## This is the most logistic way to fill missing values

lending_co_data_numeric_NAN[:,0]=np.where(lending_co_data_numeric_NAN[:,0]==temporary_fill,
                                          temporary_mean[0],
                                          lending_co_data_numeric_NAN[:,0])


In [28]:
# Whenever we add the mean of a set to itself, the mean of the new set stays the same
np.mean(lending_co_data_numeric_NAN[:,0]).round(2)

2250.25

In [32]:
## Now we will use an iteration that goes thorugh all the columns of the dataset one by one 
# We write 1 here beacuse 1 gives us the number of columns while 0 gives us the number of rows
for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,i]=np.where(lending_co_data_numeric_NAN[:,i]==temporary_fill,
                                          temporary_mean[i],
                                          lending_co_data_numeric_NAN[:,i])


In [33]:
# np.where() can also be used to replace non missing value as well
# e.g if we want to replace negative value
for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,i]=np.where(lending_co_data_numeric_NAN[:,i]<0,
                                          0,
                                          lending_co_data_numeric_NAN[:,i])



### RESHAPING ARRAYS

In [35]:
lending_co_data_numeric =np.loadtxt("Lending-company-Numeric.csv", delimiter =',')

In [36]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [37]:
lending_co_data_numeric.shape

(1043, 6)

In [38]:
# flip the number of rows and column
np.reshape(lending_co_data_numeric,(6,1043))

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [39]:
#First row: The first 1043 values of the flattened array
#Second row: The second 1043 values of the flattened array
# norally np.transpose is used to flip the array

np.transpose(lending_co_data_numeric)

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

In [40]:
np.reshape(lending_co_data_numeric,(3,2086))  # (3*2086or any multiple)it should be equal to the multiple of (6*1043)

array([[ 2000.,    40.,   365., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  4601.,  4601., 16600.]])

In [41]:
## Adding dimension: Useful when a method or function only takes inputs with a higher number of dimensions than the array 
#we want to plug in
# Reshalping does not immediately alter the dataset 

np.reshape(lending_co_data_numeric,(2,3,1043))

array([[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
        [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
        [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

       [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
        [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
        [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]])

In [42]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [43]:
# If we want to use rashaped array multiple times it's a good idea to store it first_2 =
lending_co_data_numeric_2 =np.reshape(lending_co_data_numeric,(6,1043))
lending_co_data_numeric_2

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

### REMOVING VALUES

In [44]:
lending_co_data_numeric =np.loadtxt("Lending-company-Numeric.csv", delimiter =',')

In [45]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [47]:
np.delete(lending_co_data_numeric,0).shape # it will remove only the first value

(6257,)

In [48]:
lending_co_data_numeric.size

6258

In [49]:
lending_co_data_numeric # if we want to work with new data after deleting we need to save it to a new variable

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [51]:
# If we want to get rid of entire rows or column we can do it by passing value to the axis argument

In [52]:
np.delete(lending_co_data_numeric,0, axis=0) # it will remove the first row of the matrix

array([[ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [53]:
np.delete(lending_co_data_numeric,0, axis=1) # it will remove the first column of the matrix

array([[   40.,   365.,  3121.,  4241., 13621.],
       [   40.,   365.,  3061.,  4171., 15041.],
       [   40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  4201.,  5001., 16600.],
       [   40.,   365.,  2080.,  3320., 15600.],
       [   40.,   365.,  4601.,  4601., 16600.]])

In [54]:
np.delete(lending_co_data_numeric,1, axis=1) # it will remove the second column of the matrix

array([[ 2000.,   365.,  3121.,  4241., 13621.],
       [ 2000.,   365.,  3061.,  4171., 15041.],
       [ 1000.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,   365.,  4201.,  5001., 16600.],
       [ 1000.,   365.,  2080.,  3320., 15600.],
       [ 2000.,   365.,  4601.,  4601., 16600.]])

In [55]:
# To remove multile column we have to pass the value as a tuple/list

In [56]:
np.delete(lending_co_data_numeric, (0,1,4), axis=1) # it will remove the 1st, 3rd and 5th column of the matrix

array([[  365.,  3121., 13621.],
       [  365.,  3061., 15041.],
       [  365.,  2160., 15340.],
       ...,
       [  365.,  4201., 16600.],
       [  365.,  2080., 15600.],
       [  365.,  4601., 16600.]])

In [60]:
# To remove both rows and column simultaneously
np.delete(np.delete(lending_co_data_numeric, [0,2,4], axis=1),[0,2,-1], axis =0)

array([[   40.,  3061., 15041.],
       [   40.,  3041., 15321.],
       [   50.,  3470., 13720.],
       ...,
       [   40.,  4240., 16600.],
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.]])

### Sorting Arrays 

In [64]:
import numpy as np

In [2]:
lending_co_data_numeric =np.loadtxt("Lending-company-Numeric.csv", delimiter =',')

In [3]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [5]:
np.sort(lending_co_data_numeric).shape # without specifying any other arguments, default axis=-1

(1043, 6)

In [6]:
lending_co_data_numeric.shape # both arrays have the same shape

(1043, 6)

In [10]:
np.sort(lending_co_data_numeric, axis=0)

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [9]:
# we can tell numpy to refrain from using scientific notation
np.set_printoptions(suppress=True) # This setting apply to our entire work rather than a single cell

In [11]:
np.sort(lending_co_data_numeric, axis=None) #axis =None informs the function to work with the flatted version of the 2-D input
                                            # The flattened array is 1D then so is its sorted version

array([-2870., -2870., -2550., ..., 54625., 54625., 64001.])

In [12]:
#Numpy sort function doesn't have a parameter that automatically changes the order from increasing to decreasing

In [13]:
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [14]:
np.sort(-lending_co_data_numeric)# "-" literraly changes the sign of every individual element of the input variable
                                # equivalent to muliplying by -1
                                # The matrix we see in the screen isn't the origial array sorted in decreasing order
                                # To do this we have to add another - sign

array([[-13621.,  -4241.,  -3121.,  -2000.,   -365.,    -40.],
       [-15041.,  -4171.,  -3061.,  -2000.,   -365.,    -40.],
       [-15340.,  -3280.,  -2160.,  -1000.,   -365.,    -40.],
       ...,
       [-16600.,  -5001.,  -4201.,  -2000.,   -365.,    -40.],
       [-15600.,  -3320.,  -2080.,  -1000.,   -365.,    -40.],
       [-16600.,  -4601.,  -4601.,  -2000.,   -365.,    -40.]])

In [15]:
-np.sort(-lending_co_data_numeric) # We have successfully sorted eaxh line of the initial matrix in Decending order

array([[13621.,  4241.,  3121.,  2000.,   365.,    40.],
       [15041.,  4171.,  3061.,  2000.,   365.,    40.],
       [15340.,  3280.,  2160.,  1000.,   365.,    40.],
       ...,
       [16600.,  5001.,  4201.,  2000.,   365.,    40.],
       [15600.,  3320.,  2080.,  1000.,   365.,    40.],
       [16600.,  4601.,  4601.,  2000.,   365.,    40.]])

In [16]:
## For individal oclumn / row

In [17]:
lending_co_data_numeric


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [18]:
np.sort(lending_co_data_numeric[:,3]) # sorting only 4th column , this does not overwrite the original array

array([-2870., -2550., -2450., ..., 16751., 17650., 19001.])

In [19]:
lending_co_data_numeric[:,3].sort() # This way overwrite the original array

In [20]:
lending_co_data_numeric

array([[ 2000.,    40.,   365., -2870.,  4241., 13621.],
       [ 2000.,    40.,   365., -2550.,  4171., 15041.],
       [ 1000.,    40.,   365., -2450.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., 16751.,  5001., 16600.],
       [ 1000.,    40.,   365., 17650.,  3320., 15600.],
       [ 2000.,    40.,   365., 19001.,  4601., 16600.]])

In [21]:
lending_co_data_numeric.sort(axis=0) # To sort entire dataset

In [22]:
lending_co_data_numeric

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

## Argument Function

In [23]:
lending_co_data_numeric =np.loadtxt("Lending-company-Numeric.csv", delimiter =',')

In [24]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [25]:
np.argsort(lending_co_data_numeric)

array([[1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       ...,
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5]], dtype=int64)

In [26]:
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [27]:
np.argsort(lending_co_data_numeric, axis=0)

array([[ 537,  443,    0,   32,   32,  482],
       [ 639,  327,  687,  166,  166,  493],
       [ 849,  432,  688,   85,   85,  166],
       ...,
       [  27,  326,  355,  568, 1019,  568],
       [ 277,   27,  357,  718, 1033,  534],
       [ 420,  408, 1042,  912,  912,   27]], dtype=int64)

In [28]:
lending_co_data_numeric[482,5]

-350.0

In [29]:
np.sort(lending_co_data_numeric, axis=0)

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [30]:
np.argsort(lending_co_data_numeric[:,0])

array([537, 639, 849, ...,  27, 277, 420], dtype=int64)

In [32]:
lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:,0])]
lending_co_data_numeric # Successfully rearranged all the values based on the first position

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2240.,  3680., 15600.],
       [ 1000.,    40.,   365.,  2575.,  3635., 15600.],
       ...,
       [ 9000.,   125.,   365., 13001., 16726., 54625.],
       [ 9000.,   125.,   365., 10001., 10501., 24126.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

In [33]:
lending_co_data_numeric.argsort(axis=0)

array([[   0,  216,    0,  106,  106,   18],
       [ 155,  130,  687,  145,  145,    6],
       [ 156,  176,  688,   15,   15,  145],
       ...,
       [1022, 1042,  355, 1038, 1026, 1040],
       [1031, 1024,  357,  871, 1033, 1033],
       [1042, 1023, 1042, 1035, 1035, 1023]], dtype=int64)

In [34]:
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2240.,  3680., 15600.],
       [ 1000.,    40.,   365.,  2575.,  3635., 15600.],
       ...,
       [ 9000.,   125.,   365., 13001., 16726., 54625.],
       [ 9000.,   125.,   365., 10001., 10501., 24126.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

### Argument Function _ WHERE 

In [35]:
lending_co_data_numeric =np.loadtxt("Lending-company-Numeric.csv", delimiter =',')

In [36]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [37]:
np.argwhere(lending_co_data_numeric)

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]], dtype=int64)

In [38]:
np.argwhere(lending_co_data_numeric==False) # This will find the location of zero in the dataset

array([[116,   4],
       [430,   3]], dtype=int64)

In [39]:
lending_co_data_numeric[116]

array([ 1000.,    50.,   365., -1450.,     0., 13850.])

In [40]:
lending_co_data_numeric[430]

array([1000.,   50.,  365.,    0.,  550., 5650.])

In [41]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [42]:
np.argwhere(lending_co_data_numeric>1000)

array([[   0,    0],
       [   0,    3],
       [   0,    4],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]], dtype=int64)

In [43]:
np.argwhere(lending_co_data_numeric<1000)

array([[   0,    1],
       [   0,    2],
       [   1,    1],
       ...,
       [1041,    2],
       [1042,    1],
       [1042,    2]], dtype=int64)

In [44]:
np.argwhere(lending_co_data_numeric%2==0)

array([[   0,    0],
       [   0,    1],
       [   1,    0],
       ...,
       [1042,    0],
       [1042,    1],
       [1042,    5]], dtype=int64)

In [46]:
# We can use this function to separate only the elements that interest us and examine just them
#This function is very similar to the filtering relatd to ocnditional slicing
#Slicing gives us the actual values
#np.argwhere() returns their coordinates within the array

In [47]:
np.isnan(lending_co_data_numeric).sum() 

0

In [50]:
Lending_Company_Numeric_Data_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter=';')
Lending_Company_Numeric_Data_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [51]:
np.isnan(Lending_Company_Numeric_Data_NAN)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [ True, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [56]:
np.argwhere(np.isnan(Lending_Company_Numeric_Data_NAN))

array([[  11,    3],
       [  15,    3],
       [  27,    3],
       [  58,    3],
       [  60,    4],
       [  85,    4],
       [ 117,    5],
       [ 152,    1],
       [ 152,    2],
       [ 152,    4],
       [ 172,    1],
       [ 175,    1],
       [ 175,    2],
       [ 176,    3],
       [ 177,    4],
       [ 178,    5],
       [ 211,    3],
       [ 229,    0],
       [ 230,    1],
       [ 237,    1],
       [ 247,    3],
       [ 251,    5],
       [ 252,    4],
       [ 258,    1],
       [ 260,    3],
       [ 262,    4],
       [ 271,    5],
       [ 272,    4],
       [ 284,    2],
       [ 284,    3],
       [ 297,    1],
       [ 297,    2],
       [ 300,    3],
       [ 315,    3],
       [ 315,    5],
       [ 327,    4],
       [ 336,    4],
       [ 343,    0],
       [ 344,    2],
       [ 346,    2],
       [ 363,    3],
       [ 375,    3],
       [ 377,    2],
       [ 398,    5],
       [ 416,    4],
       [ 428,    0],
       [ 432,    1],
       [ 433,

In [54]:
Lending_Company_Numeric_Data_NAN[152]

array([ 9000.,    nan,    nan, 12251.,    nan, 34514.])

In [58]:
for array_index in np.argwhere(np.isnan(Lending_Company_Numeric_Data_NAN)):
    Lending_Company_Numeric_Data_NAN[array_index[0],array_index[1]]=0
    

In [60]:
Lending_Company_Numeric_Data_NAN[152]

array([ 9000.,     0.,     0., 12251.,     0., 34514.])

In [61]:
# We successfully found another way to take care of missing entries in the data

In [62]:
np.isnan(Lending_Company_Numeric_Data_NAN).sum()

0

In [63]:
# 0 indicates there are no longer missing values in this dataset

### Shuffling Data 

In [65]:
lending_co_data_numeric =np.loadtxt("Lending-company-Numeric.csv", delimiter =',')

In [66]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [67]:
np.random.shuffle(lending_co_data_numeric)

In [68]:
lending_co_data_numeric

array([[ 1000.,    50.,   365.,   350.,  1650., 10305.],
       [ 1000.,    40.,   365.,  2080.,  2800.,  5280.],
       [ 4000.,    50.,   365.,  5450.,  6750., 22250.],
       ...,
       [ 2500.,    50.,   365.,  3250.,  4750., 20750.],
       [ 2000.,    40.,   365.,  3121.,  4501., 16381.],
       [ 1000.,    50.,   365.,   450.,  1660., 12230.]])

In [71]:
np.random.shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3401.,  4901., 16600.],
       [ 1000.,    40.,   365.,  3130.,  4430., 15600.],
       [ 2000.,    40.,   365.,  3600.,  4800., 16400.],
       ...,
       [ 2500.,    50.,   365.,  2250.,  3600., 11400.],
       [ 4000.,    50.,   365.,  5780.,  6900., 22250.],
       [ 1000.,    40.,   365.,  2130.,  3510., 14350.]])

In [72]:
lending_co_data_numeric =np.loadtxt("Lending-company-Numeric.csv", delimiter =',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [73]:
# whenever you are using the same function or method many times in your analysis, it's a good idea to directly import it

In [74]:
from numpy.random import shuffle

In [75]:
shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  3500.,  5000., 15600.],
       [ 4000.,    50.,   365.,  4900.,  5650., 10100.],
       [ 1000.,    50.,   365., -1750.,  -400., 13550.],
       ...,
       [ 2000.,    40.,   365.,  3041.,  4241., 16001.],
       [ 1000.,    40.,   365.,  2130.,  3510., 14350.],
       [ 2000.,    40.,   365.,  5201.,  5201., 16600.]])

In [76]:
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg

In [79]:
array_RG =gen(pcg())
array_RG.shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3321.,  4501., 13921.],
       [ 2000.,    40.,   365.,  2861.,  4361., 16600.],
       [ 1000.,    40.,   365.,  2240.,  3360., 15160.],
       ...,
       [ 1000.,    40.,   365.,  2380.,  3300., 14620.],
       [ 2000.,    40.,   365.,  3681.,  4881., 16600.],
       [ 1000.,    40.,   365.,  2105.,  3260., 13125.]])

## Casting Arrays 

In [80]:
lending_co_data_numeric.astype(dtype=np.int32) # there are no decimal points 

array([[ 2000,    40,   365,  3321,  4501, 13921],
       [ 2000,    40,   365,  2861,  4361, 16600],
       [ 1000,    40,   365,  2240,  3360, 15160],
       ...,
       [ 1000,    40,   365,  2380,  3300, 14620],
       [ 2000,    40,   365,  3681,  4881, 16600],
       [ 1000,    40,   365,  2105,  3260, 13125]])

In [81]:
# Creating an array with string values
lending_co_data_numeric=lending_co_data_numeric.astype(dtype=np.str_)
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3321.0', '4501.0', '13921.0'],
       ['2000.0', '40.0', '365.0', '2861.0', '4361.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2240.0', '3360.0', '15160.0'],
       ...,
       ['1000.0', '40.0', '365.0', '2380.0', '3300.0', '14620.0'],
       ['2000.0', '40.0', '365.0', '3681.0', '4881.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2105.0', '3260.0', '13125.0']],
      dtype='<U32')

In [82]:
type(lending_co_data_numeric) #The dataset retains all of it's functionalities

numpy.ndarray

In [83]:
lending_co_data_numeric=lending_co_data_numeric.astype(dtype=np.int32)

ValueError: invalid literal for int() with base 10: '2000.0'

In [84]:
# . not an element recognized among integer. Numpy does not know how to cast it
# Therefore we can't directly cast a string into integer
# We can transform the string into float first than integer

In [87]:
lending_co_data_numeric =lending_co_data_numeric.astype(dtype=np.float32)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3321.,  4501., 13921.],
       [ 2000.,    40.,   365.,  2861.,  4361., 16600.],
       [ 1000.,    40.,   365.,  2240.,  3360., 15160.],
       ...,
       [ 1000.,    40.,   365.,  2380.,  3300., 14620.],
       [ 2000.,    40.,   365.,  3681.,  4881., 16600.],
       [ 1000.,    40.,   365.,  2105.,  3260., 13125.]], dtype=float32)

In [88]:
lending_co_data_numeric.astype(dtype=np.int32)

array([[ 2000,    40,   365,  3321,  4501, 13921],
       [ 2000,    40,   365,  2861,  4361, 16600],
       [ 1000,    40,   365,  2240,  3360, 15160],
       ...,
       [ 1000,    40,   365,  2380,  3300, 14620],
       [ 2000,    40,   365,  3681,  4881, 16600],
       [ 1000,    40,   365,  2105,  3260, 13125]])

In [89]:
lending_co_data_numeric =np.loadtxt("Lending-company-Numeric.csv", delimiter =',')
lending_co_data_numeric=lending_co_data_numeric.astype(dtype=np.str_)
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [90]:
lending_co_data_numeric.astype(dtype=np.float32).astype(dtype=np.int32) # does not overwrite the original array

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]])

### Stripping Symbol from Arrays

In [99]:
import numpy as np
lending_co_total_price = np.genfromtxt("Lending-Company-Total-Price.csv",
                                       delimiter=',',
                                       dtype= np.str_,
                                       skip_header=1,
                                       usecols=[1,2,4]
                                      )

lending_co_total_price 

array([['id_1', 'Product B', 'Location 2'],
       ['id_2', 'Product B', 'Location 3'],
       ['id_3', 'Product C', 'Location 5'],
       ...,
       ['id_413', 'Product B', 'Location 135'],
       ['id_414', 'Product C', 'Location 200'],
       ['id_415', 'Product A', 'Location 8']], dtype='<U12')

In [101]:
# We only want the columns which contain non numeric data as we can't strip number

In [102]:
np.chararray.strip(lending_co_total_price [:,0],'id_') # from first colum remove id_# does not overwrite the array

chararray(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
           '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
           '23', '24', '25', '26', '27', '28', '29', '30', '31', '32',
           '33', '34', '35', '36', '37', '38', '39', '40', '41', '42',
           '43', '44', '45', '46', '47', '48', '49', '50', '51', '52',
           '53', '54', '55', '56', '57', '58', '59', '60', '61', '62',
           '63', '64', '65', '66', '67', '68', '69', '70', '71', '72',
           '73', '74', '75', '76', '77', '78', '79', '80', '81', '82',
           '83', '84', '85', '86', '87', '88', '89', '90', '91', '92',
           '93', '94', '95', '96', '97', '98', '99', '100', '101', '102',
           '103', '104', '105', '106', '107', '108', '109', '110', '111',
           '112', '113', '114', '115', '116', '117', '118', '119', '120',
           '121', '122', '123', '124', '125', '126', '127', '128', '129',
           '130', '131', '132', '133', '134', '135', '136', '1

In [103]:
np.chararray.strip(lending_co_total_price [:,0],'id_') 
np.chararray.strip(lending_co_total_price [:,1],'Product_') 
np.chararray.strip(lending_co_total_price [:,2],'Location') 
lending_co_total_price

array([['id_1', 'Product B', 'Location 2'],
       ['id_2', 'Product B', 'Location 3'],
       ['id_3', 'Product C', 'Location 5'],
       ...,
       ['id_413', 'Product B', 'Location 135'],
       ['id_414', 'Product C', 'Location 200'],
       ['id_415', 'Product A', 'Location 8']], dtype='<U12')

In [148]:
lending_co_total_price[:,0] =np.chararray.strip(lending_co_total_price [:,0],'id_') 
lending_co_total_price[:,1] =np.chararray.strip(lending_co_total_price [:,1],'Product_')
lending_co_total_price[:,2] =np.chararray.strip(lending_co_total_price [:,2],'Location') 
lending_co_total_price

array([['1', ' B', ' 2'],
       ['2', ' B', ' 3'],
       ['3', ' C', ' 5'],
       ...,
       ['413', ' B', ' 135'],
       ['414', ' C', ' 200'],
       ['415', ' A', ' 8']], dtype='<U12')

In [126]:
# we can also apply np.where() to transform the letters  in the second column into numeric values


In [150]:


# Replace values based on conditions
lending_co_total_price[:, 1] = np.where(lending_co_total_price[:, 1] == 'A', '1', lending_co_total_price[:, 1])
lending_co_total_price[:, 1] = np.where(lending_co_total_price[:, 1] == 'B', '2', lending_co_total_price[:, 1])
lending_co_total_price[:, 1] = np.where(lending_co_total_price[:, 1] == 'C', '3', lending_co_total_price[:, 1])
lending_co_total_price[:, 1] = np.where(lending_co_total_price[:, 1] == 'D', '4', lending_co_total_price[:, 1])
lending_co_total_price[:, 1] = np.where(lending_co_total_price[:, 1] == 'E', '5', lending_co_total_price[:, 1])
lending_co_total_price[:, 1] = np.where(lending_co_total_price[:, 1] == 'F', '6', lending_co_total_price[:, 1])
lending_co_total_price


array([['1', ' B', ' 2'],
       ['2', ' B', ' 3'],
       ['3', ' C', ' 5'],
       ...,
       ['413', ' B', ' 135'],
       ['414', ' C', ' 200'],
       ['415', ' A', ' 8']], dtype='<U12')

In [149]:
#lending_co_total_price = lending_co_total_price.astype(dtype=np.int32)

### STACKING 

In [157]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv",delimiter=',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [158]:
# Recall
lending_co_data_numeric_NAN =np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter=';')
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2)+1
temporary_mean =np.nanmean(lending_co_data_numeric_NAN, axis=0).round(2)
lending_co_data_numeric_NAN =np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter=';',
                                          filling_values=temporary_fill)

for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,1] =np.where(lending_co_data_numeric_NAN[:,i]==temporary_fill,
                                              temporary_mean[i],
                                              lending_co_data_numeric_NAN[:,i])
lending_co_data_numeric_NAN

array([[ 2000., 13621.,   365.,  3121.,  4241., 13621.],
       [ 2000., 15041.,   365.,  3061.,  4171., 15041.],
       [ 1000., 15340.,   365.,  2160.,  3280., 15340.],
       ...,
       [64002., 16600.,   365.,  4201.,  5001., 16600.],
       [ 1000., 15600.,   365.,  2080.,  3320., 15600.],
       [ 2000., 16600.,   365.,  4601.,  4601., 16600.]])

In [161]:
np.stack((lending_co_data_numeric[:,0],lending_co_data_numeric[:,1])) # first and second column of the array
# here we stack them on top of one another

array([[2000., 2000., 1000., ..., 2000., 1000., 2000.],
       [  40.,   40.,   40., ...,   40.,   40.,   40.]])

In [162]:
np.transpose(lending_co_data_numeric[:,:2]) # first two column 

array([[2000., 2000., 1000., ..., 2000., 1000., 2000.],
       [  40.,   40.,   40., ...,   40.,   40.,   40.]])

In [163]:
# we can stack them side by side
# The axis determines how we stack these array
np.stack((lending_co_data_numeric[:,0],lending_co_data_numeric[:,1]), axis=1)

array([[2000.,   40.],
       [2000.,   40.],
       [1000.,   40.],
       ...,
       [2000.,   40.],
       [1000.,   40.],
       [2000.,   40.]])

In [164]:
# We can stack multiple elements rather than 2 at a time
np.stack((lending_co_data_numeric[:,0],lending_co_data_numeric[:,1],lending_co_data_numeric[:,2]), axis=1)

array([[2000.,   40.,  365.],
       [2000.,   40.,  365.],
       [1000.,   40.,  365.],
       ...,
       [2000.,   40.,  365.],
       [1000.,   40.,  365.],
       [2000.,   40.,  365.]])

In [165]:
lending_co_data_numeric.shape

(1043, 6)

In [166]:
lending_co_data_numeric_NAN.shape

(1043, 6)

In [171]:
np.vstack((lending_co_data_numeric,lending_co_data_numeric_NAN))

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [64002., 16600.,   365.,  4201.,  5001., 16600.],
       [ 1000., 15600.,   365.,  2080.,  3320., 15600.],
       [ 2000., 16600.,   365.,  4601.,  4601., 16600.]])

In [170]:
np.vstack((lending_co_data_numeric,lending_co_data_numeric_NAN)).shape

(2086, 6)

In [172]:
np.hstack((lending_co_data_numeric,lending_co_data_numeric_NAN))

array([[ 2000.,    40.,   365., ...,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365., ...,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365., ...,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., ...,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365., ...,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365., ...,  4601.,  4601., 16600.]])

In [173]:
np.hstack((lending_co_data_numeric,lending_co_data_numeric_NAN)).shape

(1043, 12)

In [174]:
np.dstack((lending_co_data_numeric,lending_co_data_numeric_NAN))

array([[[ 2000.,  2000.],
        [   40., 13621.],
        [  365.,   365.],
        [ 3121.,  3121.],
        [ 4241.,  4241.],
        [13621., 13621.]],

       [[ 2000.,  2000.],
        [   40., 15041.],
        [  365.,   365.],
        [ 3061.,  3061.],
        [ 4171.,  4171.],
        [15041., 15041.]],

       [[ 1000.,  1000.],
        [   40., 15340.],
        [  365.,   365.],
        [ 2160.,  2160.],
        [ 3280.,  3280.],
        [15340., 15340.]],

       ...,

       [[ 2000., 64002.],
        [   40., 16600.],
        [  365.,   365.],
        [ 4201.,  4201.],
        [ 5001.,  5001.],
        [16600., 16600.]],

       [[ 1000.,  1000.],
        [   40., 15600.],
        [  365.,   365.],
        [ 2080.,  2080.],
        [ 3320.,  3320.],
        [15600., 15600.]],

       [[ 2000.,  2000.],
        [   40., 16600.],
        [  365.,   365.],
        [ 4601.,  4601.],
        [ 4601.,  4601.],
        [16600., 16600.]]])

In [175]:
np.dstack((lending_co_data_numeric,lending_co_data_numeric_NAN)).shape

(1043, 6, 2)

In [176]:
np.dstack((lending_co_data_numeric,lending_co_data_numeric_NAN))[0]

array([[ 2000.,  2000.],
       [   40., 13621.],
       [  365.,   365.],
       [ 3121.,  3121.],
       [ 4241.,  4241.],
       [13621., 13621.]])

In [177]:
# The first represent the row
# the following code represent we are slicing all the columns with a row index 0 and depth index 0
# The first row of the original dataset variable
# that means the third index represent which array the values were pulled from

np.dstack((lending_co_data_numeric,lending_co_data_numeric_NAN))[0,:,0]

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.])

In [178]:
# We can replicate it's output with the np.stack function
# because the stack function always returns an output that is exactly 1 dimension more than it's input
# since np.dstack() works along the "third" axis, the two function works identically (for 1-D and 2-d arrays)
# The two functions are only ever equivalent for inputs of up to 2 dimensions

In [179]:
np.stack((lending_co_data_numeric,lending_co_data_numeric_NAN), axis=-1)

array([[[ 2000.,  2000.],
        [   40., 13621.],
        [  365.,   365.],
        [ 3121.,  3121.],
        [ 4241.,  4241.],
        [13621., 13621.]],

       [[ 2000.,  2000.],
        [   40., 15041.],
        [  365.,   365.],
        [ 3061.,  3061.],
        [ 4171.,  4171.],
        [15041., 15041.]],

       [[ 1000.,  1000.],
        [   40., 15340.],
        [  365.,   365.],
        [ 2160.,  2160.],
        [ 3280.,  3280.],
        [15340., 15340.]],

       ...,

       [[ 2000., 64002.],
        [   40., 16600.],
        [  365.,   365.],
        [ 4201.,  4201.],
        [ 5001.,  5001.],
        [16600., 16600.]],

       [[ 1000.,  1000.],
        [   40., 15600.],
        [  365.,   365.],
        [ 2080.,  2080.],
        [ 3320.,  3320.],
        [15600., 15600.]],

       [[ 2000.,  2000.],
        [   40., 16600.],
        [  365.,   365.],
        [ 4601.,  4601.],
        [ 4601.,  4601.],
        [16600., 16600.]]])

### Concatenating Arrays

In [180]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv",delimiter=',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [181]:
np.concatenate((lending_co_data_numeric[0,:],lending_co_data_numeric[1,:])) # add first two rows

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.,  2000.,    40.,
         365.,  3061.,  4171., 15041.])

In [182]:
# Recall
lending_co_data_numeric_NAN =np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter=';')
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2)+1
temporary_mean =np.nanmean(lending_co_data_numeric_NAN, axis=0).round(2)
lending_co_data_numeric_NAN =np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter=';',
                                          filling_values=temporary_fill)

for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,1] =np.where(lending_co_data_numeric_NAN[:,i]==temporary_fill,
                                              temporary_mean[i],
                                              lending_co_data_numeric_NAN[:,i])
lending_co_data_numeric_NAN

array([[ 2000., 13621.,   365.,  3121.,  4241., 13621.],
       [ 2000., 15041.,   365.,  3061.,  4171., 15041.],
       [ 1000., 15340.,   365.,  2160.,  3280., 15340.],
       ...,
       [64002., 16600.,   365.,  4201.,  5001., 16600.],
       [ 1000., 15600.,   365.,  2080.,  3320., 15600.],
       [ 2000., 16600.,   365.,  4601.,  4601., 16600.]])

In [183]:
np.concatenate((lending_co_data_numeric,lending_co_data_numeric_NAN)) # we have concanetated the second dataset at the bottom of the first

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [64002., 16600.,   365.,  4201.,  5001., 16600.],
       [ 1000., 15600.,   365.,  2080.,  3320., 15600.],
       [ 2000., 16600.,   365.,  4601.,  4601., 16600.]])

In [185]:
np.concatenate((lending_co_data_numeric[0,:],lending_co_data_numeric[:,0]))

array([2000.,   40.,  365., ..., 2000., 1000., 2000.])

In [187]:
np.concatenate((lending_co_data_numeric,lending_co_data_numeric[:,:1]), axis=1)

array([[ 2000.,    40.,   365., ...,  4241., 13621.,  2000.],
       [ 2000.,    40.,   365., ...,  4171., 15041.,  2000.],
       [ 1000.,    40.,   365., ...,  3280., 15340.,  1000.],
       ...,
       [ 2000.,    40.,   365., ...,  5001., 16600.,  2000.],
       [ 1000.,    40.,   365., ...,  3320., 15600.,  1000.],
       [ 2000.,    40.,   365., ...,  4601., 16600.,  2000.]])

## Find Unique Values in Arrays 

In [188]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv",delimiter=',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [189]:
np.unique(lending_co_data_numeric)

array([-2870., -2550., -2450., ..., 52751., 54625., 64001.])

In [190]:
# Unique value for the second column
np.unique(lending_co_data_numeric[:,1])

array([ 35.,  40.,  50., 125., 165.])

In [191]:
#Count how many times present
# Unique value for the second column
# First array shows unique value
# Second array shows how frequency of the value
np.unique(lending_co_data_numeric[:,1], return_counts=True)

(array([ 35.,  40.,  50., 125., 165.]),
 array([  4, 567, 451,  19,   2], dtype=int64))

In [192]:
# we can also determine where it appears first
#second array
#The array with the 0 is the index array
np.unique(lending_co_data_numeric[:,1], return_counts=True, return_index=True)

(array([ 35.,  40.,  50., 125., 165.]),
 array([327,   0,   4,  19,  27], dtype=int64),
 array([  4, 567, 451,  19,   2], dtype=int64))