# Intro to Pandas

In [1]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install sklearn

Collecting pandas
  Using cached pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
Collecting pytz>=2020.1
  Using cached pytz-2022.7.1-py2.py3-none-any.whl (499 kB)
Collecting numpy>=1.20.3
  Downloading numpy-1.24.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0mm00:01[0m
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.24.2 pandas-1.5.3 pytz-2022.7.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [3]:
# Enable inline plotting
%matplotlib inline

In [4]:
# Toy "play tennis" data: one list per attribute, 14 observations each.
weather = [
    'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast',
    'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy',
]
# Temperature and humidity are stored as strings, not numbers.
temp = '85 80 83 70 68 65 64 72 69 75 75 72 81 87'.split()
humidity = '85 90 86 96 80 70 65 95 70 80 70 90 75 91'.split()
windy = [
    False, True, False, False, False, True, True,
    False, False, False, True, True, False, True,
]
play = [
    'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
    'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
]

In [5]:
# Combine the parallel attribute lists into one tuple per observation.
weather_dataset = [
    (outlook, temperature, hum, is_windy, outcome)
    for outlook, temperature, hum, is_windy, outcome
    in zip(weather, temp, humidity, windy, play)
]
weather_dataset

[('Sunny', '85', '85', False, 'No'),
 ('Sunny', '80', '90', True, 'No'),
 ('Overcast', '83', '86', False, 'Yes'),
 ('Rainy', '70', '96', False, 'Yes'),
 ('Rainy', '68', '80', False, 'Yes'),
 ('Rainy', '65', '70', True, 'No'),
 ('Overcast', '64', '65', True, 'Yes'),
 ('Sunny', '72', '95', False, 'No'),
 ('Sunny', '69', '70', False, 'Yes'),
 ('Rainy', '75', '80', False, 'Yes'),
 ('Sunny', '75', '70', True, 'Yes'),
 ('Overcast', '72', '90', True, 'Yes'),
 ('Overcast', '81', '75', False, 'Yes'),
 ('Rainy', '87', '91', True, 'No')]

In [6]:
# Build a pandas DataFrame from the row tuples, naming each column explicitly.
column_names = ['Weather', 'Temperature', 'Humidity', 'Windy', 'Play']
df = pd.DataFrame(weather_dataset, columns=column_names)
df

Unnamed: 0,Weather,Temperature,Humidity,Windy,Play
0,Sunny,85,85,False,No
1,Sunny,80,90,True,No
2,Overcast,83,86,False,Yes
3,Rainy,70,96,False,Yes
4,Rainy,68,80,False,Yes
5,Rainy,65,70,True,No
6,Overcast,64,65,True,Yes
7,Sunny,72,95,False,No
8,Sunny,69,70,False,Yes
9,Rainy,75,80,False,Yes


In [None]:
# saving pandas DataFrame
df.to_csv('weather.csv',index=False)

In [None]:
# loading pandas DataFrame
df = pd.read_csv('weather.csv')

# First exploratory data analysis - wine dataset

In [None]:
import sklearn

In [None]:
from sklearn.datasets import load_wine
data = load_wine(as_frame=True)

In [None]:
# The two primary components of pandas are the Series and the DataFrame.
# `data.data` is the feature table as a pandas DataFrame. Work on a copy so that
# columns assigned to `df` afterwards do not also end up in the frame held by `data`.
df = data.data.copy()

In [None]:
# pandas series
# A Series is essentially a column, and a DataFrame is a multi-dimensional table made up of a collection of Series.
data.target

In [None]:
df['score'] = data.target

In [None]:
list(df.columns)

## Basic operations on DataFrames

In [None]:
# Getting column values from a DataFrame
# This will result in a Pandas Series 
alcohol = df['alcohol']
alcohol

In [None]:
# The same operation can result in a Pandas DataFrame
alcohol = df[['alcohol']]
alcohol

In [None]:
# For rows, we have two options:
#   .loc - locates by name
#   .iloc- locates by numerical index
# with iloc we give it the numerical index of DataFrame
row = df.iloc[0]
row

In [None]:
# slicing by rows
df.iloc[1:4]

In [None]:
# Conditional selection: keep only the rows where the boolean mask is True.
df.loc[df['alcohol'] <= 14]

In [None]:
# More advanced conditional selection: combine boolean masks.
# Rows are kept when alcohol lies in [13, 14] (both ends included) OR score equals 2.
alcohol_in_range = df['alcohol'].between(13, 14)
score_is_two = df['score'] == 2
df[alcohol_in_range | score_is_two]

In [None]:
# Applying functions
def is_strong(val):
    """Return "strong" when `val` is greater than 12, otherwise "weak"."""
    return "strong" if val > 12 else "weak"

df["strong_or_not"] = df["alcohol"].apply(is_strong)
df.head()

### Getting info about the DataFrame

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
# Quick histogram of the alcohol column
df['alcohol'].plot.hist(title='Alcohol')

In [None]:
# Scatter plot of alcohol against score; the trailing `;` hides the Axes repr.
df.plot.scatter(x='alcohol', y='score', title='Alcohol to Score');


Additional sources:
https://cloudxlab.com/blog/numpy-pandas-introduction/

https://www.learndatasci.com/tutorials/python-pandas-tutorial-complete-introduction-for-beginners/

https://www.labri.fr/perso/nrougier/from-python-to-numpy/

# Naive Bayes
Naive Bayes is a very simple, yet powerful, algorithm for classification. It is based on Bayes' Theorem with an assumption of independence among predictors: it assumes that the presence of a feature in a class is unrelated to the presence of any other feature.

## Bayes Theorem
Bayes theorem describes the probability of an event, based on prior knowledge of conditions that might be related to the event.
Given a Hypothesis (H) and evidence (E), Bayes’ Theorem states that the relationship between the probability of the hypothesis before getting the evidence, P(H), and the probability of the hypothesis after getting the evidence, P(H|E), is:

  P(H|E) = P(E|H)P(H) / P(E)

P(H) is called the prior probability,

P(H|E) is called the posterior probability,

P(E|H)/P(E) is called the likelihood ratio.

P(H) and P(E) are the marginal probabilities of H and of E, i.e. the probability of each occurring without regard to the other.

## Example 1
We’re testing for a rare disease that affects 1% of the population, using a highly sensitive and specific test: 99% for both. Given that a patient tests positive, what is the probability that the patient is actually sick?

A naive answer would be 99%. To see why that is wrong, imagine that we have 100 × 100 = 10 000 representative people. Thus 100 would be sick and 9900 healthy. After running our test on them, 99 sick people would
get a positive test result, and 99 healthy people would test positive as well. So when the test comes back positive, we are equally likely to be healthy or sick.

Given events x and y, there’s a relationship between the probabilities of either event (denoted P(x) and P(y) ), the joint probabilities (both happen, which is denoted P(x, y) ), and conditional probabilities (event x happens given y happens, denoted P(x|y) )

P(y|x)P(x) = P(x,y) = P(x|y)P(y)

We solve for P(y|x):

P (y|x) = P (x|y)P(y) / P(x) 

P (sick|+) = P (+|sick)P(sick) / P(+) = 0.99 ∗ 0.01 / (0.99 ∗ 0.01 + 0.01 ∗ 0.99 ) = 0.5 = 50%
## Example 2

P(A|B) = P(B|A)P(A) / P(B)

or in other words

P(class|features) = P(features|class)P(class) / P(features)

For example, we need to classify a person's sex based on their height and weight. Here the class={male,female} and features={height,weight}, and the formula can be rewritten as

P(sex|height,weight)=P(height,weight|sex)P(sex) / P(height,weight)

Or given the hypothesis : "Age":'<=30', "Income":"medium", "Student":'yes' , "Credit_Rating":'fair'

Predict the probability that he will buy or will not buy a computer.

      Age  Income Student Credit_Rating Buys_Computer
      
      <=30   high    no          fair            no
      <=30   high    no      excellent           no
      31-40  high    no          fair           yes
      >40    medium  no          fair           yes
      >40    low     yes         fair           yes
      >40    low     yes     excellent           no
      31-40  low     yes     excellent          yes
      <=30   medium  no          fair            no
      <=30   low     yes         fair           yes
      >40    medium  yes         fair           yes
      <=30   medium  yes     excellent          yes
      31-40  medium  no      excellent          yes
      31-40  high    yes         fair           yes
      >40    medium  no      excellent           no


So:

P(Buyscomputer|Age,Income,Student,Creditrating) = P(Age,Income,Student,Creditrating|Buyscomputer)P(Buyscomputer) / P(Age,Income,Student,Creditrating)

P(Age,Income,Student,Creditrating|Buyscomputer)=P(Age|Buyscomputer)∗P(Income|Buyscomputer)∗P(Student|Buyscomputer)∗P(Creditrating|Buyscomputer)

prior = P(Buys computer)

P(Buys computer ) = How many times (yes/no) appears / Total observations

P(Buys computer = Yes) = 9 / 14 = 0.642857

P(Buys computer = No) = 5 / 14 = 0.357143

Likelihood is generated for each of the features of the dataset. It is a probability of finding each feature given class label.

P(Age='<=30'|Buyscomputer= no) = 3 / 5 = 0.6

P(Age='>40 '|Buyscomputer= no) = 2 / 5 = 0.4

P(Age='<=30'|Buyscomputer= yes) = 2 / 9 = 0.22

P(Age='>40 '|Buyscomputer= yes) = 3 / 9 = 0.33

P(Age='31-40  '|Buyscomputer= yes) = 4 / 9 = 0.44


P(Income=high|Buyscomputer=yes) = 2 / 9 = 0.22

P(Income=medium|Buyscomputer=yes) = 4 / 9 = 0.44

P(Income=low|Buyscomputer=yes) = 3 / 9 = 0.33


P(Income=high|Buyscomputer=no) = 2 / 5 = 0.4

P(Income=medium|Buyscomputer=no) = 2 / 5 = 0.4

P(Income=low|Buyscomputer=no) = 1 / 5 = 0.2



P(Student=yes|Buyscomputer=yes) = 6 / 9 = 0.67

P(Student=no|Buyscomputer=yes) = 3 / 9 = 0.33

P(Student=yes|Buyscomputer=no) = 1 / 5 = 0.2

P(Student=no|Buyscomputer=no) = 4 / 5 = 0.8


P(Creditrating=excellent|Buyscomputer=yes) = 3 / 9 = 0.33

P(Creditrating=fair|Buyscomputer=yes) = 6 / 9 = 0.67

P(Creditrating=excellent|Buyscomputer=no) = 3 / 5 = 0.6

P(Creditrating=fair|Buyscomputer=no) = 2 / 5 = 0.4

So, to calculate the probability of a new person buying a computer given the features "Age":'<=30', "Income":"medium", "Student":'yes' , "Credit_Rating":'fair',

P(Buyscomputer = yes | Age,Income,Student,Creditrating) = P(Age='<=30'|Buyscomputer=yes) ∗ P(Income=medium|Buyscomputer=yes) ∗ P(Student=yes|Buyscomputer=yes) ∗ P(Creditrating=fair|Buyscomputer=yes) ∗ P(Buyscomputer=yes) / ( P(Age='<=30') ∗ P(Income=medium) ∗ P(Student=yes) ∗ P(Creditrating=fair) )

P(Buyscomputer = no | Age,Income,Student,Creditrating) = P(Age='<=30'|Buyscomputer=no) ∗ P(Income=medium|Buyscomputer=no) ∗ P(Student=yes|Buyscomputer=no) ∗ P(Creditrating=fair|Buyscomputer=no) ∗ P(Buyscomputer=no) / ( P(Age='<=30') ∗ P(Income=medium) ∗ P(Student=yes) ∗ P(Creditrating=fair) )

Since we will only be comparing the two probabilities, and both share the same denominator, we can skip the denominator

P(Buyscomputer = yes | Age,Income,Student,Creditrating) ∝ (2/9) ∗ (4/9) ∗ (6/9) ∗ (6/9) ∗ (9/14) = 0.22 ∗ 0.44 ∗ 0.67 ∗ 0.67 ∗ 0.642857 ≈ 0.028

P(Buyscomputer = no | Age,Income,Student,Creditrating) ∝ (3/5) ∗ (2/5) ∗ (1/5) ∗ (2/5) ∗ (5/14) = 0.6 ∗ 0.4 ∗ 0.2 ∗ 0.4 ∗ 0.357143 ≈ 0.0069


So the result is that it is more probable that the person will buy the computer.

## Example 3

In [None]:
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import datasets

In [None]:
# Load new data
data = datasets.load_iris(as_frame=True)

In [None]:
# Copy the feature table before adding the label column, so the frame held
# by `data` stays features-only.
df = data.data.copy()
df['target'] = data.target

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
# Every column except the last is a feature; the last column is the label.
all_columns = df.columns.tolist()
feature_columns, target_column = all_columns[:-1], all_columns[-1]
feature_columns, target_column

In [None]:
# divide our set into a test and train sets
train_df, test_df = train_test_split(df, test_size=0.3, random_state=4)

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
 # initialize a model
 model = GaussianNB()

In [None]:
# Train the model on the training features and their labels.
X_train = train_df[feature_columns]
y_train = train_df[target_column]
model.fit(X_train, y_train)

In [None]:
expected = test_df[target_column]

In [None]:
predicted = model.predict(test_df[feature_columns])

In [None]:
predicted

In [None]:
 print(metrics.classification_report(expected , predicted)) 

In [None]:
print(metrics.confusion_matrix(expected , predicted))

# Task 1
Given the training data in the table below (Tennis data with some numerical attributes), without using sklearn library, predict the class of the following new example using Naïve Bayes classification (write a script to calculate it):
`outlook=overcast, temperature=60, humidity=62, windy=false`

In [None]:
weather_df = pd.read_csv('weather.csv')

In [None]:
weather_df

Additional resources:
https://www.machinelearningplus.com/predictive-modeling/how-naive-bayes-algorithm-works-with-example-and-full-code/

