# Notebook for Code Exercises and Snippets from Machine Learning with Python Cookbook

Not meant for original work. Just meant for me to write code snippets for muscle memory purposes + notes for understanding

# Chapter 1

In [1]:
import numpy as np 

vector_row = np.array([1, 2, 3])
vector_column = np.array([[1],
                            [2],
                            [3]])

print(vector_row)
print(vector_column)


[1 2 3]
[[1]
 [2]
 [3]]


In [2]:
matrix = np.array([[1, 2],
                    [1, 2],
                    [1, 2]])

print(matrix)

[[1 2]
 [1 2]
 [1 2]]


In [3]:
import scipy
from scipy import sparse

matrix = np.array([[0, 0],
                    [0, 1],
                    [3, 0]])

#creating a compressed sparse row matrix
#compressed sparse row as the matrix only records the indexes of non zero values and just assumes rest of the values are 0's
matrix_sparse = sparse.csr_matrix(matrix)

print(matrix_sparse)

  (1, 1)	1
  (2, 0)	3


In [4]:
vector = np.array([1, 2, 3, 4, 5, 6])

matrix = np.array([[1,2,3],
                    [4,5,6],
                     [7,8,9]])

#selecting the third element of the vector (index 2, because indexing starts at 0)
print(vector[2] )

#get matrix value by i, j index
print(matrix[1, 1])

#numpy arrays are zero indexed as well

3
5


In [5]:
matrix = np.array([[1, 2, 3, 4],
                  [5, 6, 7, 8],
                  [9, 10, 11, 12]])

print(matrix.shape)

print(matrix.size)

#how to get the number of dimensions in a matrix
print(matrix.ndim)

(3, 4)
12
2


In [6]:
matrix = np.array([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])
#creating a lambda expression that add's 100 to value
add_100 = lambda x : x + 100

vectorized_add_100 = np.vectorize(add_100)

vectorized_add_100(matrix)

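#vectorize allows us to convert functions into ones that apply over all elements in an array or a slice of an array
#note: np.vectorize is essentially a python for loop, so it is a convenience rather than a speedup; for simple arithmetic like this, broadcasting (matrix + 100) gives the same result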

array([[101, 102, 103],
       [104, 105, 106],
       [107, 108, 109]])

In [7]:
matrix = np.array([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])

maxi = np.max(matrix)

mini = np.min(matrix)

print("The maximum is {} and the minimum is {}".format(maxi, mini))

The maximum is 9 and the minimum is 1


In [8]:
matrix = np.array([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])

mean = np.mean(matrix)

var = np.var(matrix)

std = np.std(matrix)

print("mean is {} var is {} std is {}".format(mean, var , std))

mean is 5.0 var is 6.666666666666667 std is 2.581988897471611


In [9]:
matrix = np.array([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9],
                  [10, 11, 12]])

reshaped_matrix = matrix.reshape(2, 6)

reshaped_matrix

array([[ 1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12]])

In [10]:
matrix = np.array([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])

transpose = matrix.T

transpose

#transposing is a common operation in linear algebra where the column and row indices of each element are swapped, which is why you see
# that elements with indices i, i (the main diagonal) do not end up changing positions in the transpose.

array([[1, 4, 7],
       [2, 5, 8],
       [3, 6, 9]])

In [11]:
matrix = np.array([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])

flattened_matrix = matrix.flatten()

#transforms a matrix into one dimensional array
#gives the same values as matrix.reshape(-1); note that matrix.reshape(1, -1) would instead give a 2-D array of shape (1, 9)
flattened_matrix

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [12]:
matrix = np.array([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])
                   
rank = np.linalg.matrix_rank(matrix)

#rank of a matrix is the dimensions of the vector space spanned by its columns or rows
                   
print("matrix rank is {}".format(rank))

matrix rank is 2


In [13]:
matrix = np.array([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])

#returns the determinant of the matrix (this matrix is singular, so expect 0 up to floating point error)
det = np.linalg.det(matrix)

In [14]:
matrix = np.array([[1, 2, 3],
                    [2, 4, 6],
                   [3, 8, 9]])

#returns diagonal elements
matrix.diagonal()

array([1, 4, 9])

In [15]:
matrix = np.array([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])
#calculates the trace of a matrix, defined to be the sum of the elements on the main diagonal
#can also calculate it by summing the diagonal of the matrix, i.e. sum(matrix.diagonal())
matrix.trace()

15

In [16]:
matrix = np.array([[1, -1, 3],
                  [1, 1, 6],
                  [3, 8, 9]])

eigenvalues , eigenvectors = np.linalg.eig(matrix)

print(eigenvalues)

print(eigenvectors)

#eigenvectors are vectors that, when the linear transformation defined by the matrix is applied, change only in scale, not in direction


[13.55075847  0.74003145 -3.29078992]
[[-0.17622017 -0.96677403 -0.53373322]
 [-0.435951    0.2053623  -0.64324848]
 [-0.88254925  0.15223105  0.54896288]]


In [17]:
vector_a = np.array([1, 2, 3])

vector_b = np.array([4, 5, 6])

dot = np.dot(vector_a, vector_b)

print(dot)

#alternative way of doing vector multiplication
dot_py3 = vector_a @ vector_b

print(dot_py3)

32
32


In [18]:
matrix_a = np.array([[1, 1, 1],
                    [1, 1, 1],
                    [1, 1, 2]])

matrix_b = np.array([[1, 3, 1],
                    [1, 3, 1],
                    [1, 3, 8]])

add = np.add(matrix_a, matrix_b)

sub = np.subtract(matrix_a, matrix_b)

print("add is {} \n sub is {}".format(add, sub))

#can also do matrix_a + matrix_b

add is [[ 2  4  2]
 [ 2  4  2]
 [ 2  4 10]] 
 sub is [[ 0 -2  0]
 [ 0 -2  0]
 [ 0 -2 -6]]


In [19]:
matrix_a = np.array([[1, 1],
                    [1, 2]])

matrix_b = np.array([[1, 3],
                    [1, 2]])

mult = np.dot(matrix_a, matrix_b)
mult_alt = matrix_a @ matrix_b
mult_element_wise = matrix_a * matrix_b

print ("mult is {} \n mult_alt is {} \n mult_element_wise is {}\n".format(mult, mult_alt, mult_element_wise))

mult is [[2 5]
 [3 7]] 
 mult_alt is [[2 5]
 [3 7]] 
 mult_element_wise is [[1 3]
 [1 4]]



In [20]:
matrix = np.array([[1, 4],
                  [2, 5]])

inv = np.linalg.inv(matrix)

print(inv)

[[-1.66666667  1.33333333]
 [ 0.66666667 -0.33333333]]


In [21]:
np.random.seed(0)

#generates 3 random float values between 0 and 1
np.random.random(3)

#generates 3 random ints between 0 and 10 inclusive (the upper bound 11 is exclusive)
np.random.randint(0, 11, 3)

#draws 3 random numbers from a normal distribution with mean 0 and std deviation 1.0
np.random.normal(0.0, 1.0, 3)

#draws 3 numbers from a logistic distribution with mean 0 and scale 1
np.random.logistic(0.0, 1.0, 3)

#draws 3 numbers greater than or equal to 1.0 and less than 2.0
np.random.uniform(1.0, 2.0, 3)

array([1.47997717, 1.3927848 , 1.83607876])

# Chapter 2
## Loading Data

In [22]:
#Loading preexisting sample dataset
from sklearn import datasets

digits = datasets.load_digits()

features = digits.data

targets = digits.target

features[0]

#other examples of good loadable sets
#load_boston() (removed in scikit-learn 1.2; fetch_california_housing() is an alternative regression dataset)
#load_iris()
#load_digits()


array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [23]:
#Creating simulated data. 3 useful methods

#simulated data for linear regression

from sklearn.datasets import make_regression

#returns a feature float matrix, a target vector of float values and (because coef = True) the model coefficients
features, target, coefficients = make_regression(n_samples = 100,
                                                n_features = 3, #num features
                                                n_informative = 3, #num of informative features, i.e. the features used to build the model
                                                n_targets = 1, #dimension of label / target
                                                noise = 0.0, #standard deviation of gaussian noise applied to target
                                                coef = True, #if coef is set to true the coefficients of the underlying model are returned
                                                random_state = 1)

#simulated data for classification
from sklearn.datasets import make_classification

#returns a feature float matrix but a target vector of integers
features, target = make_classification(n_samples = 100,
                                      n_features = 3,
                                      n_informative = 3, 
                                      n_redundant = 0, #number of redundant features aka features that are generated as linear combinations of the other features
                                      n_classes = 2, #num classes / labels
                                      weights = [.25, .75], #proportion of samples attached to each class
                                      random_state = 1)  #reproducible output

#if n_informative is less than the total number of features then the resulting data set will have uninformative
#(redundant or noise) features that can be identified through feature selection techniques

#weights parameter allows us to simulate datasets with imbalanced classes

from sklearn.datasets import make_blobs

#returns a features float matrix but a target vector of integers
features, target = make_blobs(n_samples = 100,
                              n_features = 2,
                              centers = 3, #number of centroids / fixed center locations
                              cluster_std = 0.5, #std dev of the clusters
                              shuffle = True,
                              random_state = 1)

In [24]:
import pandas as pd

url = 'http://winterolympicsmedals.com/medals.csv'

dataframe = pd.read_csv(url)

dataframe.head(2)

#in this block there are two ways to read a csv from url

#import csv
#import urllib.request

#response = urllib.request.urlopen(url)
#lines = [l.decode('utf-8') for l in response.readlines()]
#cr = csv.reader(lines)

#for row in cr:
    #print(row)
    #break

Unnamed: 0,Year,City,Sport,Discipline,NOC,Event,Event gender,Medal
0,1924,Chamonix,Skating,Figure skating,AUT,individual,M,Silver
1,1924,Chamonix,Skating,Figure skating,AUT,individual,W,Gold


In [25]:
#url = 'https://fake_url.com/simulated_excel'

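#df = pd.read_excel(url, sheet_name=0, header=1)
#sheet_name=0 loads the first sheet (set it to None to load all sheets); header=1 is the zero-based index of the row used for the column names
#note: the keyword is sheet_name; the old spelling sheetname is no longer accepted by pandas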

In [26]:
#url = 'https://tinyurl.com/simulated_json'

#df = pd.read_json(url, orient = 'columns')

#orient = 'columns' tells pandas the json is structured as {column -> {index -> val}}

In [27]:
#from sqlalchemy import create_engine

#create connection to db
#database_connection = create_engine('sqlite:///sample.db')

#an example on how to set up an sql connection and then query the data base via pandas
#dataframe = pd.read_sql_query('SELECT * FROM data', database_connection)

#dataframe.head(2)

# Chapter 3
## Data Wrangling

In [28]:
url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'

df = pd.read_csv(url)
#each row is an observation
#each column is a feature
#each column contains a name to select it with 
#each row contains an index number

df.head(5)

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [29]:
df = pd.DataFrame()

df['Name'] = ['Nikhilesh', 'Kashyap']
df['age'] = [25, 26]
df['Driver'] = [True, False]

df

Unnamed: 0,Name,age,Driver
0,Nikhilesh,25,True
1,Kashyap,26,False


In [30]:
df = pd.read_csv(url)

df.head(2)

print(df.describe())

print(df.shape)

         survived         age  n_siblings_spouses       parch        fare
count  627.000000  627.000000          627.000000  627.000000  627.000000
mean     0.387560   29.631308            0.545455    0.379585   34.385399
std      0.487582   12.511818            1.151090    0.792999   54.597730
min      0.000000    0.750000            0.000000    0.000000    0.000000
25%      0.000000   23.000000            0.000000    0.000000    7.895800
50%      0.000000   28.000000            0.000000    0.000000   15.045800
75%      1.000000   35.000000            1.000000    0.000000   31.387500
max      1.000000   80.000000            8.000000    5.000000  512.329200
(627, 10)


In [36]:
#df = pd.read_csv(url)
print(df.iloc[0]) #selecting the first row by position
print('\n')
print(df.iloc[1: 4]) #selecting the second, third and fourth rows; the slice is start-inclusive : stop-exclusive
df = df.set_index(df['sex']) #use the values of the 'sex' column as the row labels
print('\n')
print(df.loc['female']) #selecting every row whose index label is 'female'

#loc is a useful function when the index of the data frame is a label
#iloc is useful when looking for the position in the dataframe

survived                        0
sex                          male
age                            22
n_siblings_spouses              1
parch                           0
fare                         7.25
class                       Third
deck                      unknown
embark_town           Southampton
alone                           n
Name: 0, dtype: object


   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   

   embark_town alone  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  


        survived     sex   age  n_siblings_spouses  parch     fare   class  \
sex                                                                          
female         1  female  38.0                   1      0  7

In [None]:
df.columns

In [None]:
df = pd.read_csv(url)

df[df['sex'] == 'female'].head(2)

df[(df['sex'] == 'female') & (df['age'] >= 50)]

#how to conditionally filter on a dataframe based on an expression

In [None]:
#Use the pandas Series.replace method to replace values easily in columns

df = pd.read_csv(url)

#replace a single value
df['sex'].replace("female", "Woman").head(2)

#replace several values at once: the i-th entry of the first list is replaced by the
#i-th entry of the second list, so the two lists must be given in the same order
df['sex'].replace(["female", "male"], ["Woman", "Man"]).head(2)

In [None]:
df = pd.read_csv(url)
df.rename(columns={'class' : 'Passenger Class'}).head(2)

df = pd.read_csv(url)
df.rename(columns={'class' : 'Passenger Class', 'sex' : 'Gender'}).head(2)
# can also rename by passing in a dictionary to change multiple column names at once

In [None]:
#descriptive statistics
print("Max is {}".format(df['age'].max()))
print("Min is {}".format(df['age'].min()))
print("Mean is {}".format(df['age'].mean()))
print("Sum is {}".format(df['age'].sum()))
print("Count is {}".format(df['age'].count()))

#other descriptive statistics offered are .var(), .std() (standard deviation), .kurt() (kurtosis), .skew(), .sem() (standard error of the mean),
#.mode(), .median(), and a bunch of others

In [None]:
df['sex'].unique()

In [None]:
df = pd.read_csv(url)
#nunique() returns the number of distinct values; unique() would return the values themselves
print("Unique count is {}".format(df['sex'].nunique()))
#value_counts() returns how many rows hold each distinct value
print("\n Unique value counts are {}".format(df['sex'].value_counts()))

In [None]:
df = pd.read_csv(url)
df[df['age'].isnull()].head(2) #selects for all observations indexed by age where age is null

#to replace values with nan need to do import numpy as np then do
#df['sex'] = df['sex'].replace('male', np.nan)

#can load in a dataset while defining na values already
#df = pd.read_csv(url, na_values=[np.nan, 'NONE', -999])

In [None]:
df = pd.read_csv(url)

#df.drop('age', axis=1).head(2)
#df.drop(['age', 'sex'], axis=1).head(2)
#df.drop(df.columns[1], axis=1).head(2) #used when a column doesn't have a name; we can drop it by its index in the columns array

In [None]:
df = pd.read_csv(url)

df[df['sex'] != 'male'].head(2) #use conditional expression to delete rows
#can also use a conditional to delete a single row by matching a unique value

#df[df['fare'] != 71.2833]

#can also delete by row index
#df[df.index != 0].head(2)

In [44]:
df = pd.read_csv(url)

df.drop_duplicates().head(15)

print("Number of rows in original {}".format(len(df)))
print("Number of rows in dupe dropped {}".format(len(df.drop_duplicates())))

#only drops duplicates by dropping rows that match perfectly across all columns

print("Number of rows in dupe dropped by a more specific condition {}".format(len(df.drop_duplicates(subset=['sex']))))

#drop_duplicates defaults to keeping the first occurrence of a duplicated row and dropping the later ones
#could also add parameter keep and set it = to 'last' in order to keep the last occurrence
df.drop_duplicates(subset=['sex'], keep='last')

#there is also a duplicated method which returns a boolean series denoting if each row is a duplicate or not

Number of rows in original 627
Number of rows in dupe dropped 558
Number of rows in dupe dropped by a more specific condition 2


Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
625,0,female,28.0,1,2,23.45,Third,unknown,Southampton,n
626,0,male,32.0,0,0,7.75,Third,unknown,Queenstown,y


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 627 entries, 0 to 626
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   survived            627 non-null    int64  
 1   sex                 627 non-null    object 
 2   age                 627 non-null    float64
 3   n_siblings_spouses  627 non-null    int64  
 4   parch               627 non-null    int64  
 5   fare                627 non-null    float64
 6   class               627 non-null    object 
 7   deck                627 non-null    object 
 8   embark_town         627 non-null    object 
 9   alone               627 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 49.1+ KB


In [50]:
#group by
#The group by function needs to be paired with some sort of operation that we want to apply to each group, such as calculating an aggregate statistic such as 
#mean, median, sum 

#this will return a DataFrameGroupBy object (no statistic is computed until an operation is applied to it)
print(df.groupby('age'))

#i am grouping by survived and then getting the mean age for each group
df.groupby('survived')['age'].mean()


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001B5EDEBD670>


survived
0    29.960938
1    29.110412
Name: age, dtype: float64

In [51]:
#if my goal is to group individual rows by time periods I need to use the resample function

date_index = pd.date_range('06/16/2017', periods=100000, freq='30S')

df = pd.DataFrame(index = date_index)

df['Sale_Amount'] = np.random.randint(1, 10, 100000)

#What this function does is groupby by the date range chosen (week) and then sums the feature data with it
#grouping the rows by a wide array of time periods (offsets) and then calculated a sum statistic on each time group
df.resample('W').sum()

Unnamed: 0,Sale_Amount
2017-06-18,43312
2017-06-25,100457
2017-07-02,101117
2017-07-09,100315
2017-07-16,100866
2017-07-23,53967


In [54]:
df.resample('2W').mean()

Unnamed: 0,Sale_Amount
2017-06-18,5.012963
2017-07-02,4.999355
2017-07-16,4.989608
2017-07-30,5.034235


In [55]:
df.resample('M').mean()

Unnamed: 0,Sale_Amount
2017-06-30,4.997755
2017-07-31,5.002306


In [56]:
df.resample('M').count() #by default is returning the right edge of the time group aka the last date in the time group

Unnamed: 0,Sale_Amount
2017-06-30,43200
2017-07-31,56800


In [59]:
df.resample('M', label='left').count() #using the label left parameter to return the left edge of the time group aka the first date in the
#time group as the label

Unnamed: 0,Sale_Amount
2017-05-31,43200
2017-06-30,56800


In [63]:
df.columns

Index(['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare',
       'class', 'deck', 'embark_town', 'alone'],
      dtype='object')

In [69]:
#iterating over every element in a column and applying an action. just quick practice for slicing
df = pd.read_csv(url)

#plain for loop over the first two values of the 'fare' column
for x in df['fare'].iloc[0:2]:
    print(x)

print('\n')
    
#same iteration written as a list comprehension; the list it builds only holds print's return values (None)
[print(x) for x in df['fare'].iloc[0:2]]

print('\n')

#as the last expression of the cell, this list of Nones is what the notebook echoes as the cell result;
#a plain for loop is the clearer choice when the only goal is the side effect of printing
[print(x * 100) for x in df['fare'][:3]]

7.25
71.2833


7.25
71.2833


725.0
7128.33
792.5


[None, None, None]

In [72]:
#Applying a function over all elements in a column 

df = pd.read_csv(url)

def mult100(x):
    """Return ``x`` multiplied by 100."""
    scaled = x * 100
    return scaled

df['fare'].apply(mult100)[:100]

#apply is a common way to take a function that performs a useful operation and map it over every element in a column

0      725.00
1     7128.33
2      792.50
3     5310.00
4      845.83
       ...   
95    3007.08
96    1300.00
97     775.00
98     714.17
99     697.50
Name: fare, Length: 100, dtype: float64

In [75]:
#combining group by and apply

df = pd.read_csv(url)

#numeric_only=True restricts the mean to the numeric columns; without it, pandas 2.0+ raises a
#TypeError when it reaches the text columns (class, deck, embark_town, ...)
df.groupby('sex').apply(lambda group: group.mean(numeric_only=True))

Unnamed: 0_level_0,survived,age,n_siblings_spouses,parch,fare
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0.778802,28.652074,0.654378,0.62212,47.118741
male,0.180488,30.149585,0.487805,0.25122,27.646044


In [88]:
#concatenating two dataframes 
#going to use concat with axis = 0 to concatenate along the row axis
#axis = 1 means columns
#axis = 0 usually means row

df_a = {
    'id' : ['1', '2', '3'],
    'first' : ['Alex', 'Amy', 'Allen'],
    'last' : ['Handerson', 'Hackerman', 'Hali']
}

df_a = pd.DataFrame(df_a, columns = ['id', 'first', 'last'])

df_b = {
    'id' : ['4', '5', '6'],
    'first' : ['Billy', 'Brian', 'Bran'],
    'last' : ['Bonder', 'Black', 'Balwner']
}

df_b = pd.DataFrame(df_b, columns = ['id', 'first', 'last'])

concated_df = pd.concat([df_a, df_b], axis = 0)


#concatenating by the columns
#concated_df_columns = pd.concat([df_a, df_b], axis=1)

In [89]:
#appending new row to a dataframe
row = pd.Series([10, 'Chris', 'Chillon'], index=['id', 'first', 'last'])
#DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the replacement.
#to_frame().T turns the Series into a one-row DataFrame whose columns are the Series index
pd.concat([df_a, row.to_frame().T], ignore_index=True)

Unnamed: 0,id,first,last
0,1,Alex,Handerson
1,2,Amy,Hackerman
2,3,Allen,Hali
3,10,Chris,Chillon


In [94]:
#Merging two data frames

#inner join. merge defaults to inner joins.

employee_data = {'employee_id' : ['1', '2', '3', '4'],
                'Name' : ['Amy Jones', 'Allen Keys', 'Alice Bees', 'Tim Horton']}

df_emp = pd.DataFrame(employee_data, columns=['employee_id', 'Name'])

sales_data = {'employee_id' : ['3', '4', '5', '6'],
             'total_sales' : [23456, 2512, 2345, 1455]}

df_sales = pd.DataFrame(sales_data, columns=['employee_id', 'total_sales'])

#defaults to inner join
pd.merge(df_emp, df_sales, on='employee_id')

Unnamed: 0,employee_id,Name,total_sales
0,3,Alice Bees,23456
1,4,Tim Horton,2512


In [95]:
#specifying an outer join with the how parameter: keeps the union of the rows from both frames
pd.merge(df_emp, df_sales, on='employee_id', how='outer')

Unnamed: 0,employee_id,Name,total_sales
0,1,Amy Jones,
1,2,Allen Keys,
2,3,Alice Bees,23456.0
3,4,Tim Horton,2512.0
4,5,,2345.0
5,6,,1455.0


In [96]:
#left join
pd.merge(df_emp, df_sales, on='employee_id', how='left')

Unnamed: 0,employee_id,Name,total_sales
0,1,Amy Jones,
1,2,Allen Keys,
2,3,Alice Bees,23456.0
3,4,Tim Horton,2512.0


In [97]:
#specifying the columns to join on separately for the left and right frames
pd.merge(df_emp, df_sales, left_on='employee_id', right_on='employee_id')

Unnamed: 0,employee_id,Name,total_sales
0,3,Alice Bees,23456
1,4,Tim Horton,2512


In [None]:
#pd.merge(df_emp, df_sales, left_index=True, right_index=True)
#an example of how to do a merge but by specifying the indices to do the join on

In [None]:
#3 aspects to specify any merge operation:
#1.) dataframes to merge
#2.) name of columns to merge on
#3.) type of merge to conduct
    #specified by how parameter
    #four types of main joins
        #Inner ( returns only rows that match in both frames)
        #outer (returns all rows in both frames. if a row exists in one but not in the other fill in nan for missing values. Union)
        #left: returns all rows from the left data frame but only rows from the right that matched with the left data frame. fill in nan for missing values
        #right: returns all rows from the right data frame but only rows from the left that match with the right. fill in nan for missing values

# Chapter 4: 
## Handling Numerical Data