# Introduction to Python, Numpy and Pandas

# Python Basics

Data Structures

 - Lists: used to store a collection of items, possibly of different (heterogeneous) types.
 - Tuples: immutable lists - you cannot delete, add or edit the values inside them.
 - Dictionaries: key-value pairs; the key identifies the item and the value holds the item's data.

### List example

In [4]:
# A list may hold items of one type or of mixed types (even another list).
same_type_list = [1, 2, 3, 7, 10, 23, 56, 8, 67, 44, 31]
diff_type_list = [8, 37, "hello", 4.9, [1,2]]

print(same_type_list[5]) # indices start from 0, so index 5 is the sixth item
print(len(same_type_list)) # number of items in the list

# Lists are mutable: an item can be replaced in place, even by a value of another type.
same_type_list[0] = 'new'
print(same_type_list)

# "+" concatenates two lists into a new list; "*" repeats a list.
new_list = same_type_list + diff_type_list
print(new_list)
new_list_2 = ["hello", "world"] * 2
print(new_list_2)

23
11
['new', 2, 3, 7, 10, 23, 56, 8, 67, 44, 31]
['new', 2, 3, 7, 10, 23, 56, 8, 67, 44, 31, 8, 37, 'hello', 4.9, [1, 2]]
['hello', 'world', 'hello', 'world']


### Slicing 
General format: sliced_list = some_list[start_idx : end_idx+1 : step] (the stop position is exclusive, hence end_idx+1)

In [12]:
print(same_type_list)
print(same_type_list[5])      # single item at index 5
print(same_type_list[5:])     # from index 5 to the end
print(same_type_list[3:9:2])  # indices 3, 5 and 7 (the stop index 9 is excluded)
print(same_type_list[:3])     # first three items (indices 0, 1, 2)

# The same "first three items" result, built with an explicit loop ...
some_list = []
for index, item in enumerate(same_type_list):
    if index < 3:
        some_list.append(item)
print(some_list)

# ... and with the equivalent list comprehension.
some_list2 = [item for index, item in enumerate(same_type_list) if index <3]
print(some_list2)

['new', 2, 3, 7, 10, 23, 56, 8, 67, 44, 31]
23
[23, 56, 8, 67, 44, 31]
[7, 23, 8]
['new', 2, 3]
['new', 2, 3]
['new', 2, 3]


### Tuples example 

In [17]:
same_type_tuple = 1, 2, 3, 4, 5  # parentheses are optional when writing a tuple
diff_type_tuple = ("hello", 5.2 , 2) 

# Indexing works exactly as it does for lists.
print(diff_type_tuple[2])
print(same_type_tuple[1])

#same_type_tuple[0] = 3 # would raise TypeError: values inside a tuple cannot be changed

# "+" builds a new tuple from two tuples; slicing a tuple returns a tuple.
new_tuple = same_type_tuple + diff_type_tuple
print(new_tuple)
print(new_tuple[:3])

2
2
(1, 2, 3, 4, 5, 'hello', 5.2, 2)
(1, 2, 3)


### Dictionary example

In [18]:
students_dict = {"student1": "Shani", "student2": "Moran", "student3": "Hadar"}

# Items are looked up by key, not by position.
print(students_dict["student1"])
# print(students_dict[0]) # raises KeyError: key does not exist
# print(students_dict["student4"]) # raises KeyError: key does not exist

# get() returns the supplied default instead of raising when the key is missing.
print(students_dict.get("student4", "student does not exist"))
print(students_dict.get("student2", "student does not exist"))

print(len(students_dict))  # number of key-value pairs
print(students_dict.keys())
print(students_dict.values())

# Assigning to a new key adds an entry; values do not have to share a type.
students_dict["student4"] = 1
students_dict

Shani
student does not exist
Moran
3
dict_keys(['student1', 'student2', 'student3'])
dict_values(['Shani', 'Moran', 'Hadar'])


{'student1': 'Shani', 'student2': 'Moran', 'student3': 'Hadar', 'student4': 1}

In [25]:
# Walk the dictionary twice; both loops print each key followed by its value.

# 1) items() yields (key, value) pairs directly.
for key, value in students_dict.items():
    print(key, value, sep="\n")

# 2) Iterating the dictionary itself yields its keys; look the value up by key.
for key in students_dict:
    print(key, students_dict[key], sep="\n")


student1
Shani
student2
Moran
student3
Hadar
student4
1
student1
Shani
student2
Moran
student3
Hadar
student4
1


# Numpy

Numpy is a kind of "vector programming".<br>
A numpy object is an N-dimensional array, most commonly with 1 (vector), 2 (matrix) or 3 (cube) dimensions, or even more.<br>
It is used as the input to many ML algorithms in Python packages.

### Import the library

In [26]:
import numpy as np

### Basic Numpy objects

In [28]:
vector = np.array([1,2,3,4,5]) # 1-D array built from a list
matrix = np.array([[1,2,3,4,5], 
                   [6,7,8,9,10],
                   [11,12,13,14,15],
                   [16,17,18,19,20],
                   [21,22,23,24,25]]) # 2-D array built from a list of lists (one inner list per row)

print("Vector:\n", vector, "\n")
print("Matrix:\n", matrix, "\n")

Vector:
 [1 2 3 4 5] 

Matrix:
 [[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]
 [21 22 23 24 25]] 



In [46]:
vector1 = np.arange(10)                          #10 consecutive integers along one dimension, starting from 0
vector2 = np.linspace(1, 10, 19)                 #19 evenly spaced numbers, starting at 1 and ending at 10
#print(vector1)
#print(vector2)

matrix1 = np.arange(100).reshape(10, 10)         #0..99 arranged as a 10X10 (rowXcol) array
matrix2 = np.zeros((10,5))                       #10X5 (rowXcol) array of zeros

cube1= np.random.random(8).reshape(2, 2, 2)      #8 uniform random numbers arranged as a 2X2X2 array
cube2 = np.ones((3, 4, 5)) + 3                   #3X4X5 array of ones; adding the scalar 3 makes every element 4

print("matrix1:\n", matrix1, "\n")
print("matrix2:\n", matrix2, "\n")
print("cube1:\n", cube1, "\n")

print("matrix2 shape:\n", matrix2.shape, "\n")
print("vector2 size:\n", vector2.shape, "\n")  # NOTE: the label says "size", but the value printed is the shape tuple

print("cube2:\n", cube2, "\n")

matrix1:
 [[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]
 [20 21 22 23 24 25 26 27 28 29]
 [30 31 32 33 34 35 36 37 38 39]
 [40 41 42 43 44 45 46 47 48 49]
 [50 51 52 53 54 55 56 57 58 59]
 [60 61 62 63 64 65 66 67 68 69]
 [70 71 72 73 74 75 76 77 78 79]
 [80 81 82 83 84 85 86 87 88 89]
 [90 91 92 93 94 95 96 97 98 99]] 

matrix2:
 [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]] 

cube1:
 [[[0.82880132 0.56154529]
  [0.66400749 0.0989282 ]]

 [[0.59950996 0.56513079]
  [0.37825268 0.08731116]]] 

matrix2 shape:
 (10, 5) 

vector2 size:
 (19,) 

cube2:
 [[[4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4.]]

 [[4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4.]]

 [[4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4.]]] 



In [44]:
# A bare expression on the last line of a cell is displayed using its repr (array(...)).
matrix2

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

### Slicing


Vector slicing general format: sliced = vector[start_idx : end_idx+1 : step]


In [9]:
slice_vector1 = vector[:4]   # first four elements (indices 0-3)
slice_vector2 = vector[-2:]  # last two elements (negative indices count from the end)

print("vector:\n", vector, "\n")
print("slice_vector1:\n", slice_vector1, "\n")
print("slice_vector2:\n", slice_vector2, "\n")


vector:
 [1 2 3 4 5] 

slice_vector1:
 [1 2 3 4] 

slice_vector2:
 [4 5] 



Matrix slicing general format: sliced = matrix[row_start_idx : row_end_idx+1 , col_start_idx : col_end_idx+1]

In [48]:
scalar = matrix[0,0]  # single element: row 0, column 0
slice_matrix1 = matrix[1:4,2:4]  # rows 1-3, columns 2-3
slice_matrix2 = matrix[1:4,:]  # rows 1-3, all columns
slice_matrix3 = matrix[:2,-1:]  # first two rows, last column (result stays 2-D)
slice_matrix4 = matrix[[0,2,4,], :]  # a list of row indices picks rows 0, 2 and 4


print("scalar:\n", scalar, "\n")
print("matrix:\n", matrix, "\n")
print("slice_matrix1:\n", slice_matrix1, "\n")
print("slice_matrix2:\n", slice_matrix2, "\n")
print("slice_matrix3:\n", slice_matrix3, "\n")
print("slice_matrix4:\n", slice_matrix4, "\n")

scalar:
 1 

matrix:
 [[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]
 [21 22 23 24 25]] 

slice_matrix1:
 [[ 8  9]
 [13 14]
 [18 19]] 

slice_matrix2:
 [[ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]] 

slice_matrix3:
 [[ 5]
 [10]] 

slice_matrix4:
 [[ 1  2  3  4  5]
 [11 12 13 14 15]
 [21 22 23 24 25]] 



### Operations 

Vectorized (element-wise) operations can be performed between numpy objects of the same dimensions, or between a numpy object and a scalar.

In [49]:
a = np.array([20, 30, 40, 50])
b = np.arange(4)  # [0 1 2 3]

print("a: ", a)
print("b: ", b)

print("b+1: ", b+1)  # the scalar is added to every element

print("a-b: ", a-b)  # element-wise difference of two arrays of the same length

print("b^2: ", b**2)  # ** is Python's power operator (the "^" in the label is only notation)

print("10*sin(a): ", 10*np.sin(a))  # np.sin is applied to each element (values taken as radians)

a:  [20 30 40 50]
b:  [0 1 2 3]
b+1:  [1 2 3 4]
a-b:  [20 29 38 47]
b^2:  [0 1 4 9]
10*sin(a):  [ 9.12945251 -9.88031624  7.4511316  -2.62374854]


#### Aggregating functions

In [55]:
# Rebind matrix to a small 2X3 array; this replaces the 5X5 matrix used earlier.
matrix = np.array([[1,2,3],[4,5,6]])
matrix

array([[1, 2, 3],
       [4, 5, 6]])

In [56]:
# axis=0 aggregates down the rows (one result per column);
# axis=1 aggregates across the columns (one result per row).
print ("\n mean, axis=0", matrix.mean(axis =0)) #calculate mean for each COLUMN

print ("\n sum, axis=1", matrix.sum(axis =1)) #calculate sum for each ROW 

print ("\n std, axis=1", matrix.std(axis =1)) #calculate standard deviation for each ROW

print ("\n min, axis=0", matrix.min(axis =0)) #calculate min for each COLUMN (method form)
print ("\n min, axis=0", np.min(matrix,axis =0)) #same result using the np.min function form


 mean, axis=0 [2.5 3.5 4.5]

 sum, axis=1 [ 6 15]

 std, axis=1 [0.81649658 0.81649658]

 min, axis=0 [1 2 3]

 min, axis=0 [1 2 3]


#### Mask
We can apply a condition to a numpy array to create a boolean array, and then use that boolean array to filter or manipulate the values of an array based on some criterion. 

In [57]:
print(a)
print("a>35: ", a>35)  # the comparison yields a boolean array (the mask)

new_a = a[a>35]  # indexing with the mask keeps only the elements where it is True
print("new_a: ", new_a)

[20 30 40 50]
a>35:  [False False  True  True]
new_a:  [40 50]


In [15]:
# Boolean masks combined with aggregation.
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8])
print(arr)
print(arr < 4)

# Select the elements below 5 once, then report their sum and how many there are.
below_five = arr[arr < 5]
print("sum of numbers less than 5: ", below_five.sum())
print("count the numbers less than 5 ", len(below_five))

[1 2 3 4 5 6 7 8]
[ True  True  True False False False False False]
sum of numbers less than 5:  10
count the numbers less than 5  4


# Pandas

Pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with “relational” or “labeled” data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, real-world data analysis in Python.

### Pandas objects:
- Series: equivalent to a 1-d numpy array
- DataFrame: table-like object, the most important pandas object, equivalent to a 2-d numpy array

Each column in a DataFrame is a Series


#### Import the library


In [59]:
import pandas as pd

#### Define DataFrame

In [60]:
df = pd.DataFrame(matrix)  # no labels given, so rows and columns get default integer labels

print("matrix:\n", matrix, "\n")
print("DataFrame:")
df

matrix:
 [[1 2 3]
 [4 5 6]] 

DataFrame:


Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


Give names to the columns and rows

In [61]:
# 6X4 frame of random values with explicit column names and explicit row labels (index).
df = pd.DataFrame(np.random.randn(6,4), columns=["A","B","C","D"], index = [2,3,4,5,6,7])
df

Unnamed: 0,A,B,C,D
2,-0.137861,-0.891712,0.143232,-0.858072
3,-1.890487,0.830982,-0.771055,-0.03684
4,-1.311937,-0.389424,0.984346,-1.084197
5,0.101341,-0.092147,-0.27262,0.138633
6,-2.127633,0.8538,3.200366,0.255959
7,-1.051659,0.7979,1.143809,0.477599


Each column can be of a different type

In [64]:
# Building from a dict: each key becomes a column name and each list holds that column's values.
df = pd.DataFrame({"Name": ["Braund, Mr. Owen Harris",
                            "Allen, Mr. William Henry",
                            "Bonnell, Miss. Elizabeth"],
                    "Age": [58, 32, 45],
                    "Sex": ["male", "male", "female"]})
print(df.dtypes)  # one dtype per column
df

Name    object
Age      int64
Sex     object
dtype: object


Unnamed: 0,Name,Age,Sex
0,"Braund, Mr. Owen Harris",58,male
1,"Allen, Mr. William Henry",32,male
2,"Bonnell, Miss. Elizabeth",45,female


Iterate over columns:

In [65]:
# Selecting a single column by name returns it as a Series.
df['Name']

0     Braund, Mr. Owen Harris
1    Allen, Mr. William Henry
2    Bonnell, Miss. Elizabeth
Name: Name, dtype: object

In [21]:
for col in df:  # iterating over a DataFrame yields its column names
    print("col name: " ,col, "\n")
    print( df[col], "\n")  # the column itself, looked up by name

col name:  Name 

0     Braund, Mr. Owen Harris
1    Allen, Mr. William Henry
2    Bonnell, Miss. Elizabeth
Name: Name, dtype: object 

col name:  Age 

0    58
1    32
2    45
Name: Age, dtype: int64 

col name:  Sex 

0      male
1      male
2    female
Name: Sex, dtype: object 



Iterate over rows:

In [66]:
for row in df.index:  # df.index holds the row labels
    print("row number: ", row, "\n")
    print("row data: \n", df.loc[row], "\n")  # .loc selects the row by its label

row number:  0 

row data: 
 Name    Braund, Mr. Owen Harris
Age                          58
Sex                        male
Name: 0, dtype: object 

row number:  1 

row data: 
 Name    Allen, Mr. William Henry
Age                           32
Sex                         male
Name: 1, dtype: object 

row number:  2 

row data: 
 Name    Bonnell, Miss. Elizabeth
Age                           45
Sex                       female
Name: 2, dtype: object 



Add new column

In [67]:
df["new_col"] = range(5,8)  # assigning to a new column name adds the column (values 5, 6, 7)
df

Unnamed: 0,Name,Age,Sex,new_col
0,"Braund, Mr. Owen Harris",58,male,5
1,"Allen, Mr. William Henry",32,male,6
2,"Bonnell, Miss. Elizabeth",45,female,7


In [68]:
df["new_col2"] = df["Age"] + df["new_col"]  # element-wise sum of two existing columns
df

Unnamed: 0,Name,Age,Sex,new_col,new_col2
0,"Braund, Mr. Owen Harris",58,male,5,63
1,"Allen, Mr. William Henry",32,male,6,38
2,"Bonnell, Miss. Elizabeth",45,female,7,52


Access a specific location by column name and index

In [25]:
# Single label-based lookup: row label 2, column "Name".
# Preferred over the chained form df["Name"][2], which performs two separate
# indexing operations and is ambiguous about label vs. position.
df.loc[2, "Name"]

'Bonnell, Miss. Elizabeth'

Sorting by a specific column

In [69]:
df = df.sort_values(by ="Age")  # returns the rows sorted by Age (ascending); original row labels are kept
df

Unnamed: 0,Name,Age,Sex,new_col,new_col2
1,"Allen, Mr. William Henry",32,male,6,38
2,"Bonnell, Miss. Elizabeth",45,female,7,52
0,"Braund, Mr. Owen Harris",58,male,5,63


In [70]:
df = df.sort_values(by =["Sex", "Age"])  # sort by Sex first, then by Age among rows with the same Sex
df

Unnamed: 0,Name,Age,Sex,new_col,new_col2
2,"Bonnell, Miss. Elizabeth",45,female,7,52
1,"Allen, Mr. William Henry",32,male,6,38
0,"Braund, Mr. Owen Harris",58,male,5,63


Delete column or row

In [71]:
df.drop("new_col", axis = 1)  # axis=1 drops a column; the result is only displayed, df itself is unchanged

Unnamed: 0,Name,Age,Sex,new_col2
2,"Bonnell, Miss. Elizabeth",45,female,52
1,"Allen, Mr. William Henry",32,male,38
0,"Braund, Mr. Owen Harris",58,male,63


Note that the result must be assigned back (e.g. `df = df.drop(...)`) in order for the change to be saved

In [72]:
# df still contains new_col: the drop above returned a new frame that was never assigned.
df

Unnamed: 0,Name,Age,Sex,new_col,new_col2
2,"Bonnell, Miss. Elizabeth",45,female,7,52
1,"Allen, Mr. William Henry",32,male,6,38
0,"Braund, Mr. Owen Harris",58,male,5,63


In [73]:
df = df.drop(0)  # with the default axis this drops the row labelled 0; reassigning df keeps the change
df

Unnamed: 0,Name,Age,Sex,new_col,new_col2
2,"Bonnell, Miss. Elizabeth",45,female,7,52
1,"Allen, Mr. William Henry",32,male,6,38


#### Apply aggregation function on columns

In [74]:
df["Age"].max()  # largest value in the Age column

45