In [24]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [78]:
import numpy as np

class MLClassifier:
    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
        '''
        x - numpy array of shape (n, d); n = #observations; d = #variables
        y - numpy array of shape (n,)
        '''
        # no. of variables / dimension
        self.d = x.shape[1]
        
        # no. of classes; assumes labels to be integers from 0 to nclasses-1
        self.nclasses = len(set(y))
        
        # list of means; mu_list[i] is mean vector for label i
        self.mu_list = []
        
        # list of inverse covariance matrices;
        # sigma_list[i] is inverse covariance matrix for label i
        # for efficiency reasons we store only the inverses
        self.sigma_inv_list = []
        
        # list of scalars in front of e^...
        self.scalars = []
        
        n = x.shape[0]
        for i in range(self.nclasses):
            
            # subset of obesrvations for label i
            cls_x = np.array([x[j] for j in range(n) if y[j] == i])
            
            mu = np.mean(cls_x, axis=0)
            
            # rowvar = False, this is to use columns as variables instead of rows
            sigma = np.cov(cls_x, rowvar=False)
            if np.sum(np.linalg.eigvals(sigma) <= 0) != 0:
                # if at least one eigenvalue is <= 0 show warning
                print(f'Warning! Covariance matrix for label {cls} is not positive definite!\n')
            
            sigma_inv = np.linalg.inv(sigma)
            
            scalar = 1/np.sqrt(((2*np.pi)**self.d)*np.linalg.det(sigma))
            
            self.mu_list.append(mu)
            self.sigma_inv_list.append(sigma_inv)
            self.scalars.append(scalar)
            print(mu)
            print(sigma_inv)
            print(scalar)
    
    def _class_likelihood(self, x: np.ndarray, cls: int) -> float:
        '''
        x - numpy array of shape (d,)
        cls - class label
        
        Returns: likelihood of x under the assumption that class label is cls
        '''
        mu = self.mu_list[cls]
        sigma_inv = self.sigma_inv_list[cls]
        scalar = self.scalars[cls]
        d = self.d
        
        exp = (-1/2)*np.dot(np.matmul(x-mu, sigma_inv), x-mu)
        
        return scalar * (np.e**exp)
    
    def predict(self, x: np.ndarray) -> int:
        '''
        x - numpy array of shape (d,)
        Returns: predicted label
        '''
        likelihoods = [self._class_likelihood(x, i) for i in range(self.nclasses)]
        return np.argmax(likelihoods)
    
    def score(self, x: np.ndarray, y: np.ndarray) -> float:
        '''
        x - numpy array of shape (n, d); n = #observations; d = #variables
        y - numpy array of shape (n,)
        Returns: accuracy of predictions
        '''
        n = x.shape[0]
        predicted_y = np.array([self.predict(x[i]) for i in range(n)])
        n_correct = np.sum(predicted_y == y)
        return n_correct/n

In [7]:
os.chdir('/Users/najah/work/internships/meghna/LT05_L1TP_145044_20100428_20161016_01_T1')

In [39]:
df = random_points = pd.read_csv('./145044_20100428_indices/random_points_indices_data.csv')

In [40]:
df['ag_fire'] = np.where(df['class1'] ==5,1,0)

In [41]:
df.columns

Index(['Unnamed: 0', 'id', 'long', 'lat', 'class_x', 'ag_fire', 'class1',
       'geometry', 'b1', 'b2', 'b3', 'b4', 'b5', 'b7', 'ndvi', 'ndmi', 'nbr',
       'ndwi', 'bai', 'baiml', 'baims', 'mirbi', 'gemi', 'class_y',
       'class_label'],
      dtype='object')

In [53]:
df.iloc[:,8:14]

Unnamed: 0,b1,b2,b3,b4,b5,b7
0,0.123110,0.132611,0.182832,0.234003,0.292531,0.225733
1,0.118408,0.127496,0.180216,0.228100,0.303933,0.228462
2,0.132513,0.153071,0.216845,0.242858,0.283028,0.223003
3,0.118408,0.124939,0.169750,0.216293,0.243118,0.190254
4,0.110180,0.117266,0.161901,0.210389,0.246919,0.190254
...,...,...,...,...,...,...
349,0.100776,0.096806,0.127888,0.189727,0.203209,0.149316
350,0.110180,0.119824,0.164517,0.210389,0.226014,0.184795
351,0.106654,0.109594,0.148819,0.204486,0.226014,0.162962
352,0.097250,0.096806,0.127888,0.219244,0.218412,0.149316


In [63]:
df['class1'].value_counts()

11    180
5      76
1      33
13     23
2      19
4      16
8       7
Name: class1, dtype: int64

In [94]:
(x_train, x_test, y_train, y_test) = train_test_split(df.iloc[:, 8:14].values, df.iloc[:, 23].values, train_size=.8, stratify = df.iloc[:, 23])

In [95]:
y_train

array([ 5, 11,  2, 13, 13, 11, 11, 11,  8, 13, 11,  2, 11, 11,  1, 11, 11,
       11,  1, 13,  1,  2, 11,  5, 11, 11, 13,  5,  4,  5, 11, 11,  2,  5,
        5, 11, 11,  5,  5, 11,  5,  5,  5,  1,  5,  5, 11, 11, 13,  1,  5,
        1,  1,  1, 13, 11, 11, 11,  5, 11, 11, 11,  1, 11, 11,  8, 11, 11,
       11, 11,  5, 11, 11,  8, 11,  5, 11, 11,  1, 11,  5, 11, 13,  5, 11,
       11, 11, 11, 11,  1, 11,  2,  5,  5, 11,  5, 11, 11, 11, 11, 11,  5,
       11, 11, 11,  5, 11,  1,  5, 11, 11,  2, 13, 11,  5,  4, 11,  1,  5,
       11,  5, 11,  5, 13,  5,  2,  1, 11, 13,  5, 11, 11,  5,  1, 11,  2,
       11, 11, 11, 11, 13,  5,  4,  4,  1,  4, 11, 11, 11, 11,  5, 13,  1,
       13,  1, 11, 13, 11,  5,  1,  4, 11, 13,  1,  2, 11, 11, 11, 11,  5,
        1, 11, 11,  5, 11, 11, 11,  5, 11,  4, 11, 11, 11,  5, 11, 11, 11,
       11,  2, 11, 13, 11,  4, 11,  5, 11, 11,  5, 11,  5, 11, 11,  5, 11,
        4, 11,  5,  2, 11, 11,  5, 11, 11,  5,  4,  5, 11,  4,  2,  2, 11,
       11, 11, 11, 11, 11

In [83]:

mlc =MLClassifier()

In [96]:
mlc.fit(x_train, y_train)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = um.true_divide(
  sigma = np.cov(cls_x, rowvar=False)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


LinAlgError: 0-dimensional array given. Array must be at least two-dimensional

In [52]:
mlc.score(x_test, y_test)

0.9014084507042254

In [None]:
mlc.