In [None]:
#Importing all the necessary libraries
import pandas as pd #Pandas library to access and change our csv file
import numpy as np #numpy to perform necessary calculations
import math #math to use pi
from sklearn.model_selection import train_test_split #sklearn train test split to split our data in test and train data
from sklearn.metrics import accuracy_score#to find the accuracy of our model.

#Importing sklearn's naive bayes to check our function by comparing
from sklearn.naive_bayes import GaussianNB

In [None]:
#importing our csv file
df = pd.read_csv('/content/Weather Data.csv')

In [None]:
#Creating our own Gaussian Naive Bayes class. This class is designed to work exactly like sklearn library. We will later compare the results as well.
class GaussianNaiveBayes():
  def __init__(self): #The first function which is automatically run when a new object is created. This will create necessary variables for the class
    print("Gaussian Naive Baye's Object is created")
    self.mean = []
    self.std = []
    self.y = []
    self.predicted = []
  
  def fit(self, x_train, y_train): #fit command to train our model using train data. This will find mean and standard deviation of our data which will later be used in gaussian.
    self.find_mean_and_std(x_train)
    self.y = y_train #saving the y train data to find priori probability later.


  def find_mean_and_std(self,data): #this funcion calculates the mean and std of every column and saves it into our class's vairables.
    mean = []
    std = []
    for col in data.columns:
      mean.append(data[col].mean())
      std.append(data[col].std())
    self.mean = mean
    self.std = std
  
  def predict(self, x_test): #This function will predict the y using given test data. 
    weather_p = []
    self.predicted = []
    i = 0
    while i < int(len(x_test)):
      for weather in self.y.unique():
        inputvalues = x_test.iloc[i] # we will be sending every input value for each weather cell and see which weather has the highest probability. 
        weather_p.append(self.naive_bayes(weather, inputvalues)) #Find posterior probabilities for each weather.
      i = i + 1
      prediction = self.y.loc[weather_p.index(max(weather_p))]
      self.predicted.append(prediction) # The weather with highest probability is stored in predicted array. This array is then returned. 
      continue
    return self.predicted
      
  def naive_bayes(self, y, input):
    #Finding P(y) priori probability
    priori = int(self.y.value_counts()[y])/int(len(self.y))
    #Finding P(x|y)
    likelihood = []
    #Find likelihood probability of each column of features and storing them in likelihood list
    for i in range(len(self.mean)):
      likelihood.append(self.find_likelihood_probability(self.mean[i], self.std[i], input[i]))
    #Finding the posterior probability by implementing the equation as derived in theoretical background.
    likelihood_product = math.prod(likelihood)
    numerator = priori*likelihood_product
    denominator = 0
    for y1 in self.y:
      priori1 = int(self.y.value_counts()[y1])/int(len(self.y))
      denominator = denominator + priori1*likelihood_product
    return (numerator/denominator)

  def find_likelihood_probability(self, m, std, v): #This function will find the likelihood probabilities using gaussian equation.
    expo = np.exp(-((float(v)-float(m))**2)/(2*math.pi*float(std)**2))
    return (expo/(np.sqrt(2*np.pi*float(std)*float(std))))


Separating x and y train

In [None]:
y = df['Weather']#Separating the dependent and independent variables.
x = df.drop(['Weather', 'Date/Time'], axis=1) #Removing unnecessary columns

In [None]:
#In this block we are taking a sample from our data to check the model faster.
x = x.head(300)
y = y.head(300)

In [None]:
#using the train test split command to split our data in xtrain, xtest, ytrain and ytest in a ratio of 80/20 
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.20, random_state = 0)

In [None]:
#In this block we will create a model, train it using training data and get a predicted y by inputing test data.
model = GaussianNaiveBayes()
model.fit(x_train, y_train)
predicted_y = model.predict(x_test)

Gaussian Naive Baye's Object is created


In [None]:
#Finding accurary of our prediction
accuracy = accuracy_score(y_test, predicted_y)
accuracy = accuracy*100 #converting to percentage

In [None]:
print(f"{accuracy}%")

13.333333333333334%


In [None]:
#Implementing naive bayes from sklearn library
model2 = GaussianNB()
model2.fit(x_train, y_train)
predictedy2 = model2.predict(x_test)

In [None]:
#Finding accuracy of sklearn's prediction
accuracy2 = accuracy_score(y_test, predictedy2)
accuracy2 = accuracy2*100
print(f"The accuracy of our model is {accuracy} and the accuracy of sklearn's model is {accuracy2}")
#We can see that the accuracy of sklearn's model is higher than ours and it takes less time than our class.

The accuracy of our model is 13.333333333333334 and the accuracy of sklearn's model is 45.0
