In [2]:
# importing necessary libraries
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Lists

In [3]:
# Creating lists
list_a = [1, 2, 3, 4, 5]
list_b = [6, 7, 8, 9, 10]

In [4]:
# Operations on lists
# Adding lists
list_sum = [a + b for a, b in zip(list_a, list_b)]
print("List Sum:", list_sum)

# Vector product using lists    
vector_product = [a * b for a, b in zip(list_a, list_b)]
print("Vector Product:", vector_product)

List Sum: [7, 9, 11, 13, 15]
Vector Product: [6, 14, 24, 36, 50]


# Numpy Array

In [5]:
# Creating numpy arrays
numpy_array_a = np.array(list_a)
numpy_array_b = np.array(list_b)

In [6]:
# Operations on numpy arrays
# Adding numpy arrays
numpy_sum = numpy_array_a + numpy_array_b
print("Numpy Sum:", numpy_sum)

# Vector product using numpy arrays
numpy_vector_product = np.multiply(numpy_array_a, numpy_array_b)
print("Numpy Vector Product:", numpy_vector_product)

Numpy Sum: [ 7  9 11 13 15]
Numpy Vector Product: [ 6 14 24 36 50]


In [7]:
np.allclose(list_sum, numpy_sum), np.allclose(vector_product, numpy_vector_product)

(True, True)

# Time comparison between list and numpy array

In [8]:
# Creating large arrays and lists for time comparison
numpy_array_a = np.random.randint(0, 100, size=10000)
numpy_array_b = np.random.randint(0, 100, size=10000)

list_a = list(numpy_array_a)
list_b = list(numpy_array_b)

In [9]:
timeit_add_list = %timeit -o [a + b for a, b in zip(list_a, list_b)]

537 µs ± 672 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
timeit_add_numpy = %timeit -o numpy_array_a + numpy_array_b

3.01 µs ± 3.07 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [11]:
# Time for list addition
start_time = time.time()
for _ in range(1000):
    list_sum = [a + b for a, b in zip(list_a, list_b)]
end_time = time.time()
print("Time taken for lists addition:", end_time - start_time)

# Time for numpy addition
start_time = time.time()
for _ in range(1000):
    numpy_sum = numpy_array_a + numpy_array_b
end_time = time.time()
print("Time taken for numpy addition:", end_time - start_time)

Time taken for lists addition: 0.5457706451416016
Time taken for numpy addition: 0.0037784576416015625


In [12]:
# Time for list vector product
start_time = time.time()
for _ in range(10000):
    list_product = [a * b for a, b in zip(list_a, list_b)]

end_time = time.time()
print("Time taken for list vector product:", end_time - start_time)

# Time for numpy vector product 
start_time = time.time()
for _ in range(10000):
    numpy_product = np.multiply(numpy_array_a, numpy_array_b)

end_time = time.time()
print("Time taken for numpy vector product:", end_time - start_time)

Time taken for list vector product: 5.352280139923096
Time taken for numpy vector product: 0.04065513610839844


# Code clarity

In [13]:
# Numpy code is often more concise and readable than list comprehensions
# Example: Calculate the element-wise product of two lists
list_product = [a * b for a, b in zip(list_a, list_b)]
numpy_product = np.multiply(numpy_array_a, numpy_array_b)

# Reading CSV file using Numpy


In [14]:
!head ../datasets/tennis-discrete-output.csv

Day,Outlook,Temp,Humidity,Windy,Play
D1,Sunny,Hot,High,Weak,No
D2,Sunny,Hot,High,Strong,No
D3,Overcast,Hot,High,Weak,Yes
D4,Rain,Mild,High,Weak,Yes
D5,Rain,Cool,Normal,Weak,Yes
D6,Rain,Cool,Normal,Strong,No
D7,Overcast,Cool,Normal,Strong,Yes
D8,Sunny,Mild,High,Weak,No
D9,Sunny,Cool,Normal,Weak,Yes


In [15]:
data = np.genfromtxt('../datasets/tennis-discrete-output.csv', delimiter=',')
data

array([[nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan]])

Wait! What happened?

In [16]:
data = np.genfromtxt('../datasets/tennis-discrete-output.csv', delimiter=',', dtype=str)
data

array([['Day', 'Outlook', 'Temp', 'Humidity', 'Windy', 'Play'],
       ['D1', 'Sunny', 'Hot', 'High', 'Weak', 'No'],
       ['D2', 'Sunny', 'Hot', 'High', 'Strong', 'No'],
       ['D3', 'Overcast', 'Hot', 'High', 'Weak', 'Yes'],
       ['D4', 'Rain', 'Mild', 'High', 'Weak', 'Yes'],
       ['D5', 'Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['D6', 'Rain', 'Cool', 'Normal', 'Strong', 'No'],
       ['D7', 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
       ['D8', 'Sunny', 'Mild', 'High', 'Weak', 'No'],
       ['D9', 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['D10', 'Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
       ['D11', 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
       ['D12', 'Overcast', 'Mild', 'High', 'Strong', 'Yes'],
       ['D13', 'Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
       ['D14', 'Rain', 'Mild', 'High', 'Strong', 'No']], dtype='<U8')

Question: Find the outlook on D11


In [17]:
idx = np.argwhere(data[:, 0] == 'D11')[0, 0]
idx

11

In [18]:
data[idx]

array(['D11', 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'], dtype='<U8')

In [19]:
data[idx][1]

'Sunny'

# Reading CSV file using Pandas

In [20]:
df = pd.read_csv('../datasets/tennis-discrete-output.csv')

In [21]:
df

Unnamed: 0,Day,Outlook,Temp,Humidity,Windy,Play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Sunny,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


In [22]:
df.query('Day == "D11"')['Outlook']

10    Sunny
Name: Outlook, dtype: object

In [23]:
df.shape

(14, 6)

Question. How many times do we play v/s not play tennis

In [24]:
df['Play'].value_counts()

Play
Yes    9
No     5
Name: count, dtype: int64

In [25]:
df.groupby('Play').size()

Play
No     5
Yes    9
dtype: int64

In [26]:
pd.crosstab(index=df['Play'], columns='count')

col_0,count
Play,Unnamed: 1_level_1
No,5
Yes,9


What is the distribution of any given attribute?

In [27]:
def distribution(df, attribute):
    return df[attribute].value_counts()

In [28]:
distribution(df, 'Outlook')

Outlook
Sunny       5
Rain        5
Overcast    4
Name: count, dtype: int64

In [29]:
distribution(df, 'Temp')

Temp
Mild    6
Hot     4
Cool    4
Name: count, dtype: int64

Finding entropy for target variable

In [30]:
target_attribute = 'Play'
dist_target = distribution(df, target_attribute)

In [31]:
dist_target

Play
Yes    9
No     5
Name: count, dtype: int64

Normalize distribution

In [32]:
dist_target/dist_target.sum()

Play
Yes    0.642857
No     0.357143
Name: count, dtype: float64

In [33]:
df['Play'].value_counts(normalize=True)

Play
Yes    0.642857
No     0.357143
Name: proportion, dtype: float64

In [34]:
normalized_dist_target = dist_target/dist_target.sum()

For loop way of calculating entropy

In [35]:
e = 0.0
for value, p in normalized_dist_target.items():
    e = e - p * np.log2(p + 1e-6) # 1e-6 is added to avoid log(0)
print(e)

0.9402830732836911


In [36]:
normalized_dist_target.apply(lambda x: -x * np.log2(x + 1e-6)).sum()

0.9402830732836911

More on crosstab

In [37]:
pd.crosstab(index=df['Outlook'], columns=df['Play'])

Play,No,Yes
Outlook,Unnamed: 1_level_1,Unnamed: 2_level_1
Overcast,0,4
Rain,2,3
Sunny,3,2


In [38]:
pd.crosstab(index=df['Outlook'], columns=df['Play']).T

Outlook,Overcast,Rain,Sunny
Play,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,0,2,3
Yes,4,3,2


In [39]:
df_attr = pd.crosstab(index=df['Play'], columns=df['Outlook'], normalize='columns')
df_attr

Outlook,Overcast,Rain,Sunny
Play,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,0.0,0.4,0.6
Yes,1.0,0.6,0.4


Using groupby

In [47]:
df.groupby(['Play', 'Outlook']).size()

Play  Outlook 
No    Rain        2
      Sunny       3
Yes   Overcast    4
      Rain        3
      Sunny       2
dtype: int64

In [46]:
df.groupby(['Play', 'Outlook']).size().index

MultiIndex([( 'No',     'Rain'),
            ( 'No',    'Sunny'),
            ('Yes', 'Overcast'),
            ('Yes',     'Rain'),
            ('Yes',    'Sunny')],
           names=['Play', 'Outlook'])

In [49]:
df.groupby(['Play', 'Outlook']).size().unstack('Outlook')

Outlook,Overcast,Rain,Sunny
Play,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,,2.0,3.0
Yes,4.0,3.0,2.0


In [50]:
df_attr_groupby = df.groupby(['Play', 'Outlook']).size().unstack('Outlook').fillna(0)
df_attr_groupby

Outlook,Overcast,Rain,Sunny
Play,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,0.0,2.0,3.0
Yes,4.0,3.0,2.0


Apply

In [109]:
neg_plogp = df_attr.apply(lambda x: -x * np.log2(x + 1e-6), axis=0)
neg_plogp

Outlook,Overcast,Rain,Sunny
Play,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,0.0,0.52877,0.442178
Yes,-1e-06,0.442178,0.52877


In [111]:
neg_plogp.sum(axis=0).sort_index()

Outlook
Overcast   -0.000001
Rain        0.970948
Sunny       0.970948
dtype: float64

In [113]:
df_attr_dist = distribution(df, 'Outlook')
norm_attr_dist = df_attr_dist/df_attr_dist.sum()
norm_attr_dist

Outlook
Sunny       0.357143
Rain        0.357143
Overcast    0.285714
Name: count, dtype: float64

In [115]:
(norm_attr_dist*neg_plogp.sum(axis=0).sort_index()).sum()

0.6935336657070463