### Processing Data With Numpy

### Importing Data With  numpy

In [1]:
import numpy as np

##### np.loadtxt() vs np.genfromtxt()

In [2]:
# Read the complete, purely numeric CSV with the strict loader
lending_company_data_numeric_1 = np.loadtxt(
    "Lending-Company-Numeric-Data.csv",
    delimiter=',',
)
lending_company_data_numeric_1

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [3]:
# Read the same file with np.genfromtxt so the two results can be compared
lending_company_data_numeric_2 = np.genfromtxt(
    "Lending-Company-Numeric-Data.csv",
    delimiter=',',
)
lending_company_data_numeric_2

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:
# Check whether both loaders produced the same array (same shape, same elements)
np.array_equal(
    lending_company_data_numeric_1,
    lending_company_data_numeric_2,
)

True

In [6]:
# Why does it matter which function we use?
# np.loadtxt is faster, but it breaks when we feed it incomplete or ill-formatted datasets,
# whereas np.genfromtxt is slower but can handle missing values.
# NaN = "Not a Number"; it marks missing values within a NumPy array.

In [9]:
# np.loadtxt cannot convert the empty fields of this file to float and raises ValueError.
# The failure is the demonstration, so catch it and display the message; an uncaught
# exception would stop "Restart Kernel -> Run All" at this cell.
try:
    lending_company_data_numeric_1_NAN = np.loadtxt(
        "Lending-Company-Numeric-Data-NAN.csv",
        delimiter=';',
    )
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: could not convert string to float: ''

In [8]:
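# "ValueError: could not convert string to float: ''" means Python encountered text where it expected a number.
# A missing value is read as '' (an empty string), and an empty string cannot be converted to a float.
# Two ways to solve this error:
# 1. use np.genfromtxt, which reads the missing entries as nan, or
# 2. import the whole dataset as strings with dtype=np.str_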

In [13]:
# np.genfromtxt tolerates the gaps: missing entries show up as nan in the result
lending_company_data_numeric_1_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
)
lending_company_data_numeric_1_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [16]:
# Alternative: keep np.loadtxt but read every field as a string.
# Use this only to inspect the stored values when no arithmetic is needed:
# the entries are plain text rather than numbers (note the quotes around each
# value in the output), and the missing entry is kept as ''.
# This is why it is crucial to import data with the most appropriate type.
lending_company_data_numeric_1_NAN = np.loadtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    dtype=np.str_,
)
lending_company_data_numeric_1_NAN


array([['2000', '40', '365', '3121', '4241', '13621'],
       ['2000', '40', '365', '3061', '4171', '15041'],
       ['1000', '40', '365', '2160', '3280', '15340'],
       ...,
       ['', '40', '365', '4201', '5001', '16600'],
       ['1000', '40', '365', '2080', '3320', '15600'],
       ['2000', '40', '365', '4601', '4601', '16600']], dtype='<U5')

### Partial cleaning of data while importing

In [21]:
# Baseline import of the full file, for comparison with the trimmed imports below
lending_company_data_numeric_1_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
)
lending_company_data_numeric_1_NAN


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [22]:
# skip_header=2 drops the first two lines of the file; any number of lines can be skipped
lending_company_data_numeric_1_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    skip_header=2,
)
lending_company_data_numeric_1_NAN

array([[ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [23]:
# skip_footer=2 drops the last two lines of the file; any number of lines can be skipped
lending_company_data_numeric_1_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    skip_footer=2,
)
lending_company_data_numeric_1_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  3401.,    nan, 16600.],
       [ 2000.,    40.,   365.,    nan,  5440., 16600.],
       [   nan,    40.,   365.,  4201.,  5001., 16600.]])

In [25]:
# usecols picks the columns to import by zero-based index
# (usecols=0 on its own would keep only the first column).
# Several columns are passed together in a tuple, and they may be listed in any order.
lending_company_data_numeric_1_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    usecols=(0, 1, 5),
)

lending_company_data_numeric_1_NAN

array([[ 2000.,    40., 13621.],
       [ 2000.,    40., 15041.],
       [ 1000.,    40., 15340.],
       ...,
       [   nan,    40., 16600.],
       [ 1000.,    40., 15600.],
       [ 2000.,    40., 16600.]])

In [26]:
# The order of the indices in usecols sets the column order of the result
lending_company_data_numeric_1_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    usecols=(0, 5, 3),
)

lending_company_data_numeric_1_NAN

array([[ 2000., 13621.,  3121.],
       [ 2000., 15041.,  3061.],
       [ 1000., 15340.,  2160.],
       ...,
       [   nan, 16600.,  4201.],
       [ 1000., 15600.,  2080.],
       [ 2000., 16600.,  4601.]])

In [27]:
# We can use these arguments simultaneously

In [31]:
# Column selection and header/footer trimming combined in a single call
lending_company_data_numeric_1_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    usecols=(0, 5, 3),
    skip_header=2,
    skip_footer=2,
)

lending_company_data_numeric_1_NAN

array([[ 1000., 15340.,  2160.],
       [ 2000., 15321.,  3041.],
       [ 2000., 13720.,  3470.],
       ...,
       [ 2000., 16600.,  3401.],
       [ 2000., 16600.,    nan],
       [   nan, 16600.,  4201.]])

In [32]:
# To split the columns into individual variables (one per column), pass unpack=True

In [37]:
# With unpack=True the call yields one 1-D array per selected column, so each
# column can be bound to its own name. The names follow the order given in
# usecols: column 0, then column 5, then column 3.
(
    lending_company_data_0,
    lending_company_data_5,
    lending_company_data_3,
) = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    usecols=(0, 5, 3),
    skip_header=2,
    skip_footer=2,
    unpack=True,
)

# Fixed: the first print referenced the misspelled name `lending_compay_data_0`,
# which raises NameError on a fresh kernel.
print(lending_company_data_0)
print(lending_company_data_5)
print(lending_company_data_3)
                                                                                       


[1000. 2000. 2000. ... 2000. 2000.   nan]
[15340. 15321. 13720. ... 16600. 16600. 16600.]
[2160. 3041. 3470. ... 3401.   nan 4201.]


In [38]:
# It is important to remember that the output is generated and then unpacked according to the order of the usecols argument

### Importing data with Numpy-String vs Object Vs Numbers

In [39]:
# Import the mixed text/number file with the default dtype:
# every field that cannot be read as a number appears as nan in the output
lending_co_lt = np.genfromtxt(
    "lending-co-LT.csv",
    delimiter=',',
)
lending_co_lt


array([[      nan,       nan,       nan, ...,       nan,       nan,
              nan],
       [1.000e+00,       nan,       nan, ...,       nan,       nan,
        1.660e+04],
       [2.000e+00,       nan,       nan, ...,       nan,       nan,
        1.660e+04],
       ...,
       [1.041e+03,       nan,       nan, ...,       nan,       nan,
        1.660e+04],
       [1.042e+03,       nan,       nan, ...,       nan,       nan,
        1.560e+04],
       [1.043e+03,       nan,       nan, ...,       nan,       nan,
        1.660e+04]])

In [40]:
# print() displays the same array in its plain string form (no array(...) wrapper)
print(lending_co_lt)

[[      nan       nan       nan ...       nan       nan       nan]
 [1.000e+00       nan       nan ...       nan       nan 1.660e+04]
 [2.000e+00       nan       nan ...       nan       nan 1.660e+04]
 ...
 [1.041e+03       nan       nan ...       nan       nan 1.660e+04]
 [1.042e+03       nan       nan ...       nan       nan 1.560e+04]
 [1.043e+03       nan       nan ...       nan       nan 1.660e+04]]


In [41]:
# Re-import the same file, this time forcing every field to a 32-bit integer
lending_co_lt = np.genfromtxt(
    "lending-co-LT.csv",
    delimiter=',',
    dtype=np.int32,
)

In [42]:
## By specifying the input data to be a specific type, the function generates the missing values differently
# (here they are displayed as -1 instead of nan).
# We need to be careful when we ask Python to perform computations after we have imported missing values categorized as integers
print(lending_co_lt)

[[   -1    -1    -1 ...    -1    -1    -1]
 [    1    -1    -1 ...    -1    -1 16600]
 [    2    -1    -1 ...    -1    -1 16600]
 ...
 [ 1041    -1    -1 ...    -1    -1 16600]
 [ 1042    -1    -1 ...    -1    -1 15600]
 [ 1043    -1    -1 ...    -1    -1 16600]]


In [43]:
# Both entries are -1 placeholders for missing data, so the sum is an ordinary
# integer (-2) rather than a missing value -- the pitfall described above
lending_co_lt[0, 0] + lending_co_lt[0, 1]

-2

In [45]:
## Import the entire dataset as text
# Text values cannot be added up or used in other mathematical operations.
# However, the elements of the array can still be sorted, cut and formatted.
lending_co_lt = np.genfromtxt(
    "lending-co-LT.csv",
    delimiter=',',
    dtype=np.str_,
)

In [46]:
# Every field, including the header row and the numeric columns, is now a string
print(lending_co_lt)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [47]:
## Importing the data as object

In [49]:
# Import every field as a generic Python object instead of a fixed NumPy type
lending_co_lt = np.genfromtxt(
    "lending-co-LT.csv",
    delimiter=',',
    dtype=object,
)

In [50]:
# The b prefix in the output marks each element as a Python bytes object rather than a str;
# the array holds ordinary Python objects, not NumPy fixed-width strings.
print(lending_co_lt)

[[b'LoanID' b'StringID' b'Product' ... b'Location' b'Region'
  b'TotalPrice']
 [b'1' b'id_1' b'Product B' ... b'Location 2' b'Region 2' b'16600.0']
 [b'2' b'id_2' b'Product B' ... b'Location 3' b'' b'16600.0']
 ...
 [b'1041' b'id_1041' b'Product B' ... b'Location 23' b'Region 4'
  b'16600.0']
 [b'1042' b'id_1042' b'Product C' ... b'Location 52' b'Region 6'
  b'15600.0']
 [b'1043' b'id_1043' b'Product B' ... b'Location 142' b'Region 6'
  b'16600.0']]


In [51]:
## We can also import the dataset as an array of multiple types
# The number of datatypes is determined by the number of columns (fields) in the dataset
# An array usually consists of a single numeric datatype
# We should avoid specifying various datatypes when working with the NumPy package

In [56]:
# One dtype per column: int32 for the first and last columns, str for the five in between.
# NOTE(review): the printed result shows every string field as '' -- presumably because
# np.str_ without an explicit length gives zero-width fields; confirm, and use a sized
# string type (e.g. 'U20') if the text must be kept.
lending_co_lt = np.genfromtxt("lending-co-LT.csv", delimiter =',', dtype=(np.int32, np.str_,np.str_,np.str_,np.str_,np.str_,np.int32))

In [57]:
# Each row is now displayed as a tuple with one entry per column
print(lending_co_lt)

[(  -1, '', '', '', '', '',    -1) (   1, '', '', '', '', '', 16600)
 (   2, '', '', '', '', '', 16600) ... (1041, '', '', '', '', '', 16600)
 (1042, '', '', '', '', '', 15600) (1043, '', '', '', '', '', 16600)]


## Saving Data With Numpy-NPY


In [59]:
# Load the dataset as text; this is the array that will be saved to disk
lending_co = np.genfromtxt(
    "Lending-Company-Saving.csv",
    delimiter=',',
    dtype=np.str_,
)
print(lending_co)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [60]:
# Save the array in NumPy's binary format; no extension is given here and the
# file is read back below as "Lending-Company-Saving.npy"
np.save("Lending-Company-Saving", lending_co)

In [61]:
# Read the saved array back from the .npy file
lending_data_save =np.load("Lending-Company-Saving.npy")

In [62]:
# Display the reloaded array to compare it with the original by eye
print(lending_data_save)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [63]:
# Check that the reloaded array is identical to the one we saved
np.array_equal(
    lending_data_save,
    lending_co,
)

True

In [64]:
# This means the data is unaltered after we saved it and loaded it back in

### Saving Data With Numpy-NPZ

In [65]:
# Re-import the text dataset that will be stored in the .npz archive
lending_co = np.genfromtxt(
    "Lending-Company-Saving.csv",
    delimiter=',',
    dtype=np.str_,
)
print(lending_co)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [66]:
# Save one array into an .npz archive; lending_data_save could be passed as an input as well
np.savez("Lending-Company-Saving", lending_co)

In [67]:
# Save two arrays into one archive, written to the same file name as the previous cell
np.savez("Lending-Company-Saving", lending_co, lending_data_save)

In [68]:
# Open the archive; the result is an NpzFile container rather than a single array
lending_data_savez =np.load("Lending-Company-Saving.npz")

In [69]:
print(lending_data_savez) # Why does this print only an object reference?
# Because the .npz file contains a collection of arrays, it cannot be displayed this way.
# We must specify the array we want to open.

<numpy.lib.npyio.NpzFile object at 0x000001B4FFEF9940>


In [70]:
print(lending_data_savez["arr_0"]) # first array passed to np.savez (index 0)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [71]:
print(lending_data_savez["arr_1"]) # second array passed to np.savez (index 1)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [73]:
## This is because, by default, an NPZ file stores each dataset as a separate array with a generic name (arr_0, arr_1, ...)
## If we want to set a distinguishable name for each one, we do so by defining a keyword argument for each array

In [74]:
# Keyword arguments become the names under which the arrays are stored
np.savez(
    "Lending-Company-Saving",
    company=lending_co,
    data_save=lending_data_save,
)

In [75]:
# Reopen the archive after it was rewritten with named arrays
lending_data_savez =np.load("Lending-Company-Saving.npz")

In [76]:
# .files lists the names of the arrays stored in the archive
lending_data_savez.files

['company', 'data_save']

In [77]:
# Retrieve an array by the keyword name it was saved under
print(lending_data_savez["company"])

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [78]:
# Second named array from the same archive
print(lending_data_savez["data_save"])

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [80]:
# The data remains the same even after saving and reloading it
np.array_equal(
    lending_data_savez["company"],
    lending_data_savez["data_save"],
)

True

## Saving Data With Numpy - CSV

In [81]:
# Re-import the text dataset that will be written back out as delimited text
lending_co = np.genfromtxt(
    "Lending-Company-Saving.csv",
    delimiter=',',
    dtype=np.str_,
)
print(lending_co)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [82]:
# The file extension must be spelled out when saving as text.
# fmt='%s' writes each element as a string; delimiter=',' separates the fields.
np.savetxt(
    "Lending-Company-Saving.txt",
    lending_co,
    fmt='%s',
    delimiter=',',
)

In [83]:
# A file written with np.savetxt is plain text, so we import it (e.g. with np.genfromtxt) rather than load it with np.load

In [87]:
# Import the text file we just wrote, again keeping every field as a string
lending_data_savetxt = np.genfromtxt(
    "Lending-Company-Saving.txt",
    delimiter=',',
    dtype=np.str_,
)

print(lending_data_savetxt)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [88]:
# Reload the .npy copy so it can be compared with the text-file copy
lending_data_save =np.load("Lending-Company-Saving.npy")

In [89]:
# The text round trip and the .npy round trip should hold the same data
np.array_equal(
    lending_data_savetxt,
    lending_data_save,
)

True