In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [13]:
# Read the Iris CSV into a DataFrame.
# NOTE(review): the path is hard-coded to a folder under one user's home
# directory; consider a configurable data directory so others can run this.
df = pd.read_csv('~/Documents/Live_Coding/GDA_Live_coding_FML23/Iris.csv')

In [14]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [19]:
# List the column names available for choosing features and target.
df.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [22]:
# Encode the species names as integer class labels 0, 1, 2.
# The guard makes the cell idempotent: mapping the already-encoded integers
# through the name -> label dict a second time would turn every value into NaN.
species_to_label = {'Iris-setosa':0,'Iris-versicolor':1,'Iris-virginica':2}
if not pd.api.types.is_numeric_dtype(df['Species']):
    df['Species'] = df['Species'].map(species_to_label)


In [23]:
# Feature matrix: the four sepal/petal measurement columns.
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
# Target vector: the 'Species' column.
y= df['Species']

In [25]:
# Check the (rows, columns) dimensions of the feature matrix.
X.shape

(150, 4)

In [29]:
# Display the feature frame.
X

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [26]:
# Check that the target has one entry per row of X.
y.shape

(150,)

In [27]:
def split_data(x, y, train_size=0.8, random_state=None):
    """Shuffle ``x`` and ``y`` together and split them into train and test parts.

    Rows are always selected by *position*, so a pandas object whose index is
    not ``0..m-1`` (for example after filtering) stays aligned with ``x``.

    Parameters
    ----------
    x : array-like, DataFrame or Series with ``m`` rows
        Features. pandas inputs are returned as pandas objects; anything else
        is converted with ``np.asarray`` and returned as an ndarray.
    y : array-like or Series of length ``m``
        Targets, handled the same way as ``x``.
    train_size : float, default 0.8
        Fraction of rows placed in the training part; the first
        ``int(m * train_size)`` shuffled rows go to train, the rest to test.
    random_state : int or None, default None
        Seed for a dedicated ``np.random.default_rng``. When ``None`` the
        global NumPy random state is used, so ``np.random.seed`` still applies.

    Returns
    -------
    x_train, x_test, y_train, y_test

    Raises
    ------
    ValueError
        If ``x`` and ``y`` have different lengths or ``train_size`` is outside
        ``[0, 1]``.
    """
    m = len(x)
    if len(y) != m:
        raise ValueError(f"x and y must have the same length, got {m} and {len(y)}")
    if not 0.0 <= train_size <= 1.0:
        raise ValueError(f"train_size must be within [0, 1], got {train_size}")

    def _take(data, order):
        # .iloc is positional; plain data[order] on a Series would look up labels.
        if hasattr(data, 'iloc'):
            return data.iloc[order]
        return np.asarray(data)[order]

    # shuffle the data to randomize the train/test split
    if random_state is None:
        indices = np.random.permutation(m)
    else:
        indices = np.random.default_rng(random_state).permutation(m)

    stop = int(m * train_size)
    train_idx, test_idx = indices[:stop], indices[stop:]
    x_train, y_train = _take(x, train_idx), _take(y, train_idx)
    x_test, y_test = _take(x, test_idx), _take(y, test_idx)
    return x_train, x_test, y_train, y_test


In [30]:
# Seed NumPy's global generator so the shuffle inside split_data, and every
# result derived from it, is the same on each Restart & Run All.
np.random.seed(42)

# split your data into x_train, x_test, y_train, y_test
X_train, X_test, y_train, y_test = split_data(X.values, y, train_size=0.8)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(120, 4) (120,) (30, 4) (30,)


In [31]:
def covariance(x, mu):
    """Covariance of the rows of ``x`` around the centre ``mu``.

    Written by hand on purpose; ``np.cov(x, rowvar=0)`` can be used to
    cross-check the result when ``mu`` is the column-wise mean of ``x``.

    Parameters
    ----------
    x : ndarray of shape (m, d)
        One sample per row.
    mu : array broadcastable against ``x``
        Centre subtracted from every row.

    Returns
    -------
    ndarray of shape (d, d)
        Scatter matrix of the centred rows scaled by ``1 / (m - 1)``.
    """
    n_samples, _ = x.shape
    centered = x - mu
    scatter = centered.T @ centered
    return (1/(n_samples - 1)) * scatter

In [32]:
class GDA:
  def __init__(self):
    ## set mu, phi and sigma to None
    self.phi = None
    self.mu = None
    self.sigma = None
    
  def fit(self,x,y):
    k=len(np.unique(y, return_counts=False)) # Number of class.
    d=x.shape[1]  # input dim
    m= x.shape[0] # Number of examples.
    
    ## Initialize mu, phi and sigma
    self.mu= np.zeros((k, d))#: kxd, i.e., each row contains an individual class mu.
    self.sigma= np.zeros((k,d,d))#: kxdxd, i.e., each row contains an individual class sigma.
    self.phi= np.zeros(d)# d-dimension

    ## START THE LEARNING: estimate mu, phi and sigma.

    for i in range(k):
      self.mu[i] = np.mean(x[y == i], axis = 0)
      self.phi[i] = (1/m)*np.sum(y[y == i])
      self.sigma[i] = covariance(x[y == i], self.mu[i]) 


  def predict_proba(self,x):
    '''
      Inputs: x(shape:m,d)
      Output: matrix of probability(shape:m,k)
    '''
    # input dim
    d= x.shape[1] 
    # Number of classes we have in our case it's k = 2
    k_class= self.mu.shape[0] 
    # we define a matrix that will contain our probabilities
    prob = np.zeros((x.shape[0],k_class))
    # Number of examples.
    m = x.shape[0]
    det = []
    inv_sigma = []
 
    for i in range(k_class):
      # we compute the determinant of each class
      det_ = np.linalg.det(self.sigma[i])
      # we compute the inverse of the covariance matrix for each class
      inv_sigma_ = np.linalg.inv(self.sigma[i])
      det.append(det_)
      inv_sigma.append(inv_sigma_)
      const = 1/np.sqrt((2*np.pi)**d*det[i])
      for j in range(m):
        prob[j,i] = const*np.exp(-0.5*(x[j]-self.mu[i]).T@inv_sigma[i]@(x[j] - self.mu[i]))
    return prob

  def predict(self,x):
    prob = self.predict_proba(x)
    y_pred = np.argmax(prob, axis = 1)
    return y_pred

  
  def accuracy(self, y, ypreds):
    acc = (np.mean(y == ypreds))*100
    return acc

In [33]:
# Create the classifier and estimate its parameters on the training split.
model= GDA()
model.fit(X_train,y_train)

In [34]:
# Predict a class label for every held-out sample and display the labels.
ypred = model.predict(X_test)
ypred

array([1, 0, 2, 1, 1, 2, 2, 1, 2, 0, 1, 2, 0, 2, 2, 0, 1, 2, 1, 0, 2, 0,
       1, 2, 2, 2, 1, 1, 0, 0])

In [35]:
# Percentage of test samples classified correctly. Arguments follow the
# accuracy(y, ypreds) signature: true labels first, predictions second.
model.accuracy(y_test, ypred)

93.33333333333333

In [37]:
# Inspect the held-out feature rows.
X_test

array([[5.6, 2.7, 4.2, 1.3],
       [5.1, 3.8, 1.9, 0.4],
       [6. , 2.2, 5. , 1.5],
       [6.7, 3.1, 4.4, 1.4],
       [7. , 3.2, 4.7, 1.4],
       [6.7, 3. , 5.2, 2.3],
       [6. , 2.7, 5.1, 1.6],
       [6.1, 3. , 4.6, 1.4],
       [6.5, 3. , 5.8, 2.2],
       [5.3, 3.7, 1.5, 0.2],
       [5.4, 3. , 4.5, 1.5],
       [6.7, 3.1, 5.6, 2.4],
       [5.2, 3.5, 1.5, 0.2],
       [6.3, 2.5, 4.9, 1.5],
       [7.2, 3.6, 6.1, 2.5],
       [5.8, 4. , 1.2, 0.2],
       [6.2, 2.9, 4.3, 1.3],
       [6.5, 3.2, 5.1, 2. ],
       [5.1, 2.5, 3. , 1.1],
       [4.9, 3.1, 1.5, 0.1],
       [6.3, 3.4, 5.6, 2.4],
       [5.1, 3.5, 1.4, 0.2],
       [6.2, 2.2, 4.5, 1.5],
       [7.7, 2.6, 6.9, 2.3],
       [6.9, 3.1, 5.1, 2.3],
       [7.7, 3.8, 6.7, 2.2],
       [6.1, 2.8, 4. , 1.3],
       [5.6, 2.9, 3.6, 1.3],
       [4.9, 3.1, 1.5, 0.1],
       [5.7, 4.4, 1.5, 0.4]])

In [48]:
# Inspect the held-out labels.
y_test

94     1
44     0
119    2
65     1
50     1
145    2
83     1
91     1
104    2
48     0
84     1
140    2
27     0
72     1
109    2
14     0
97     1
110    2
98     1
37     0
136    2
0      0
68     1
118    2
141    2
117    2
71     1
64     1
34     0
15     0
Name: Species, dtype: int64

In [38]:
# A single flower as four measurements, in the same column order as X.
input_data = (5.6, 2.7, 4.2, 1.3)
input_data_as_array = np.asarray(input_data)

In [39]:
# Reshape to one row with four columns, since predict_proba reads x.shape[1].
input_data_reshaped = input_data_as_array.reshape(1,-1)

In [46]:
# Classify the single sample; the result is a length-1 array of labels.
model.predict(input_data_reshaped)

array([1])

In [36]:
import pickle

In [44]:
# Persist the fitted model to disk. The context manager closes the file
# handle even if pickling raises, so the file is always flushed and released.
file_name = 'my_model.sav'
with open(file_name, 'wb') as model_file:
    pickle.dump(model, model_file)

In [52]:
#Load the model
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load files that were written by this notebook or another trusted source.
with open('my_model.sav', 'rb') as model_file:
    model_load = pickle.load(model_file)

In [53]:
# Run the same single-sample prediction with the model restored from disk.
prediction = model_load.predict(input_data_reshaped)

In [None]:
# Species name -> integer label encoding. Guarded so that running this cell on
# an already-encoded column does nothing: mapping the integers through the
# name dict would replace every label with NaN.
if not pd.api.types.is_numeric_dtype(df['Species']):
    df['Species']=df['Species'].map({'Iris-setosa':0,'Iris-versicolor':1,'Iris-virginica':2})

In [54]:
# Translate the predicted integer label back into its species name.
# Names are spelled as in the dataset ('Iris-setosa', with a hyphen). A label
# outside 0-2 raises KeyError instead of being reported as Iris-virginica.
label_to_species = {0: 'Iris-setosa', 1: 'Iris-versicolor', 2: 'Iris-virginica'}
print(f'The flower is {label_to_species[int(prediction[0])]}')

The flower is Iris-versicolor
