# Numpy and Pandas

## Numpy

<b> Importing the numpy library

In [2]:
import numpy as np

<b> Creating an array

In [5]:
# This is a 1D array

array_1d = np.array([1.2,3.4,5.6,7.8,8.9,9.1,10.11,12.13,14.15])
print(array_1d)

[ 1.2   3.4   5.6   7.8   8.9   9.1  10.11 12.13 14.15]


In [7]:
# This is 2D array

array_2d = np.array([[1,2,3],[4,5,6],[7,8,9]])
print(array_2d)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


To populate a matrix with all zeroes, call `np.zeros`. To populate a matrix with all ones, call `np.ones`.

<b> Populate array with a range of numbers

In [14]:
array_range = np.arange(1,10)
print(array_range)

# The upper bound will not be included

[1 2 3 4 5 6 7 8 9]


### Array with random numbers

In [17]:
random_array = np.random.randint(low = 20, high = 50, size=(10))
print(random_array)
# Here too the upper bound is not included

[37 41 25 25 38 39 35 25 32 28]


In [20]:
# random numbers between 0 and 1

rand_array = np.random.random(size = (6))
print(rand_array)

[0.37022826 0.03267239 0.3670345  0.56677274 0.47153746 0.26612871]


### Some mathematical operations
Numpy has a very useful feature called `broadcasting` that expand smaller operands to the dimension of larger operand to follow linear algebra rules

In [23]:
# Adds a constant to every element in the array

rand_array_bw_2_and_3 = rand_array + 2.0
print(rand_array_bw_2_and_3)

[2.37022826 2.03267239 2.3670345  2.56677274 2.47153746 2.26612871]


In [27]:
# Similarly

print(random_array)
print(random_array * 5)

[37 41 25 25 38 39 35 25 32 28]
[185 205 125 125 190 195 175 125 160 140]


### Creating a linear dataset for example

In [33]:
feature = np.arange(6,21)
print(feature)

[ 6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


In [35]:
label = (feature * 3) + 4
print(label)

[22 25 28 31 34 37 40 43 46 49 52 55 58 61 64]


### Adding some noise to make it realistic

In [44]:
noise = np.random.randint(low = -2,high = +2,size = (15))
print(noise)

# For noise within -2 to +2
noise2 = (np.random.random([15]) * 4) - 2
print(noise2)

[ 0 -1  0  1 -1  1 -1 -2 -1  1 -2  0 -1 -2  1]
[ 0.40260668 -0.65093458 -1.85348166  1.91994143  0.20271628 -0.07171596
  0.23331182  0.15328453  0.33765869 -1.25135821 -0.19290535 -1.88503105
 -0.03921831 -1.86362594 -0.20268356]


In [47]:
label = label + noise2
print(label)

[22.22371289 22.41774707 23.80665232 36.00700911 39.0575707  37.98863029
 39.62783324 42.57115272 48.18941948 47.71898869 53.60523329 55.08869762
 62.8800404  58.00183008 64.14882897]


## Pandas

A DataFrame is similar to an in-memory spreadsheet. Like a spreadsheet:

  * A DataFrame stores data in cells. 
  * A DataFrame has named columns (usually) and numbered rows.

In [49]:
import pandas as pd

### Creting a dataframe

#### Making data for our dataframe. Use list comprehension for clean code

In [90]:
# data = np.random.randint(2,40,10)
# data = np.array([np.random.randint(2,40,10),np.random.randint(30,40,10)])

# Better to use list comprehension to generate the data 
# if you dont want to manually enter it.
data = [[i,j] for i in np.random.randint(2,40,10) for j in np.random.randint(20,40,10)]
print(data)


[[20, 31], [20, 35], [20, 38], [20, 24], [20, 20], [20, 23], [20, 27], [20, 37], [20, 24], [20, 32], [24, 36], [24, 31], [24, 22], [24, 36], [24, 25], [24, 34], [24, 31], [24, 26], [24, 29], [24, 31], [19, 36], [19, 34], [19, 37], [19, 36], [19, 32], [19, 20], [19, 21], [19, 32], [19, 23], [19, 37], [6, 30], [6, 24], [6, 37], [6, 27], [6, 26], [6, 29], [6, 27], [6, 31], [6, 35], [6, 24], [6, 23], [6, 27], [6, 24], [6, 24], [6, 26], [6, 20], [6, 29], [6, 21], [6, 36], [6, 28], [3, 23], [3, 22], [3, 26], [3, 23], [3, 23], [3, 25], [3, 37], [3, 29], [3, 37], [3, 34], [28, 32], [28, 38], [28, 34], [28, 32], [28, 27], [28, 30], [28, 25], [28, 24], [28, 25], [28, 23], [18, 35], [18, 34], [18, 21], [18, 21], [18, 27], [18, 26], [18, 31], [18, 27], [18, 26], [18, 37], [8, 38], [8, 25], [8, 28], [8, 34], [8, 37], [8, 32], [8, 24], [8, 31], [8, 32], [8, 36], [24, 30], [24, 30], [24, 21], [24, 34], [24, 28], [24, 27], [24, 34], [24, 20], [24, 39], [24, 20]]


In [91]:
# This means 3 rows with 4 columns of data
data2 = np.random.randint(20,40,(100,2))
print(data2)

[[36 31]
 [25 33]
 [35 33]
 [31 33]
 [26 25]
 [24 38]
 [37 28]
 [30 24]
 [26 39]
 [24 39]
 [32 39]
 [37 30]
 [36 33]
 [32 23]
 [29 30]
 [29 27]
 [29 26]
 [32 35]
 [28 31]
 [39 31]
 [25 30]
 [34 22]
 [38 25]
 [26 35]
 [37 22]
 [31 39]
 [33 23]
 [32 38]
 [35 28]
 [26 33]
 [20 21]
 [24 25]
 [22 23]
 [24 23]
 [28 28]
 [25 30]
 [37 24]
 [38 32]
 [39 24]
 [33 30]
 [34 29]
 [29 28]
 [30 28]
 [25 30]
 [36 32]
 [21 27]
 [25 32]
 [31 28]
 [30 30]
 [39 33]
 [21 33]
 [22 30]
 [20 29]
 [23 23]
 [29 26]
 [31 32]
 [33 25]
 [39 30]
 [35 37]
 [23 30]
 [20 33]
 [32 31]
 [24 34]
 [32 21]
 [24 35]
 [31 34]
 [21 26]
 [37 26]
 [37 38]
 [32 21]
 [37 39]
 [23 28]
 [28 39]
 [37 39]
 [30 37]
 [21 21]
 [22 39]
 [37 26]
 [20 39]
 [27 24]
 [38 27]
 [37 20]
 [26 37]
 [29 29]
 [25 22]
 [39 27]
 [28 29]
 [29 25]
 [36 35]
 [37 39]
 [36 30]
 [31 24]
 [27 39]
 [23 20]
 [37 38]
 [20 28]
 [27 25]
 [29 22]
 [24 27]
 [30 38]]


#### Making the columns, creating dataframe and displaying it

In [93]:
columns = ['temp','activity']
dataframe = pd.DataFrame(data = data2, columns = columns)
print(dataframe)

    temp  activity
0     36        31
1     25        33
2     35        33
3     31        33
4     26        25
..   ...       ...
95    20        28
96    27        25
97    29        22
98    24        27
99    30        38

[100 rows x 2 columns]


#### Creating a new column

In [94]:
dataframe["adjusted"] = dataframe["activity"] *4
print(dataframe)

    temp  activity  adjusted
0     36        31       124
1     25        33       132
2     35        33       132
3     31        33       132
4     26        25       100
..   ...       ...       ...
95    20        28       112
96    27        25       100
97    29        22        88
98    24        27       108
99    30        38       152

[100 rows x 3 columns]


### Slicing a dataset

In [95]:
# Print the head
print(dataframe.head())

   temp  activity  adjusted
0    36        31       124
1    25        33       132
2    35        33       132
3    31        33       132
4    26        25       100


In [96]:
# Print the 3 head rows
print(dataframe.head(3))

   temp  activity  adjusted
0    36        31       124
1    25        33       132
2    35        33       132


In [97]:
# print only 5th row
print(dataframe.iloc[[5]])

   temp  activity  adjusted
5    24        38       152


In [98]:
# Print rows from 10th to 20th
print(dataframe[10:21])

    temp  activity  adjusted
10    32        39       156
11    37        30       120
12    36        33       132
13    32        23        92
14    29        30       120
15    29        27       108
16    29        26       104
17    32        35       140
18    28        31       124
19    39        31       124
20    25        30       120


In [99]:
# Print a single column
print(dataframe["temp"])

0     36
1     25
2     35
3     31
4     26
      ..
95    20
96    27
97    29
98    24
99    30
Name: temp, Length: 100, dtype: int32


<b> If you assign a dataframe to a new variable the reference will be stored and any changes made will be copied to the original one
<br>
<b> Use `pd.DataFrame.copy()` to make a true independent copy 

In [101]:
#Example 
copy_dataframe = dataframe.copy()

copy_dataframe = copy_dataframe * 100
print(copy_dataframe)

    temp  activity  adjusted
0   3600      3100     12400
1   2500      3300     13200
2   3500      3300     13200
3   3100      3300     13200
4   2600      2500     10000
..   ...       ...       ...
95  2000      2800     11200
96  2700      2500     10000
97  2900      2200      8800
98  2400      2700     10800
99  3000      3800     15200

[100 rows x 3 columns]
