In [14]:
# importing necessary libraries
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Lists

In [15]:
# Creating lists
list_a = [1, 2, 3, 4, 5]
list_b = [6, 7, 8, 9, 10]

In [16]:
# Operations on lists
# Adding lists
list_sum = [a + b for a, b in zip(list_a, list_b)]
print("List Sum:", list_sum)

# Vector product using lists    
vector_product = [a * b for a, b in zip(list_a, list_b)]
print("Vector Product:", vector_product)

List Sum: [7, 9, 11, 13, 15]
Vector Product: [6, 14, 24, 36, 50]


# Numpy Array

In [17]:
# Creating numpy arrays
numpy_array_a = np.array(list_a)
numpy_array_b = np.array(list_b)

In [18]:
# Operations on numpy arrays
# Adding numpy arrays
numpy_sum = numpy_array_a + numpy_array_b
print("Numpy Sum:", numpy_sum)

# Vector product using numpy arrays
numpy_vector_product = np.multiply(numpy_array_a, numpy_array_b)
print("Numpy Vector Product:", numpy_vector_product)

Numpy Sum: [ 7  9 11 13 15]
Numpy Vector Product: [ 6 14 24 36 50]


In [19]:
np.allclose(list_sum, numpy_sum), np.allclose(vector_product, numpy_vector_product)

(True, True)

# Time comparison between list and numpy array

In [20]:
# Creating large arrays and lists for time comparison
numpy_array_a = np.random.randint(0, 100, size=10000)
numpy_array_b = np.random.randint(0, 100, size=10000)

list_a = list(numpy_array_a)
list_b = list(numpy_array_b)

In [21]:
timeit_add_list = %timeit -o [a + b for a, b in zip(list_a, list_b)]

605 µs ± 2.46 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [22]:
timeit_add_numpy = %timeit -o numpy_array_a + numpy_array_b

3.14 µs ± 3.31 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [7]:
# Time for list addition
start_time = time.time()
for _ in range(1000):
    list_sum = [a + b for a, b in zip(list_a, list_b)]
end_time = time.time()
print("Time taken for lists addition:", end_time - start_time)

# Time for numpy addition
start_time = time.time()
for _ in range(1000):
    numpy_sum = numpy_array_a + numpy_array_b
end_time = time.time()
print("Time taken for numpy addition:", end_time - start_time)

Time taken for lists addition: 0.6483831405639648
Time taken for numpy addition: 0.0


In [8]:
# Time for list vector product
start_time = time.time()
for _ in range(10000):
    list_product = [a * b for a, b in zip(list_a, list_b)]

end_time = time.time()
print("Time taken for list vector product:", end_time - start_time)

# Time for numpy vector product 
start_time = time.time()
for _ in range(10000):
    numpy_product = np.multiply(numpy_array_a, numpy_array_b)

end_time = time.time()
print("Time taken for numpy vector product:", end_time - start_time)

Time taken for list vector product: 6.146199464797974
Time taken for numpy vector product: 0.08333706855773926


# Code clarity

In [9]:
# Numpy code is often more concise and readable than list comprehensions
# Example: Calculate the element-wise product of two lists
list_product = [a * b for a, b in zip(list_a, list_b)]
numpy_product = np.multiply(numpy_array_a, numpy_array_b)

# Reading CSV file using Numpy


In [34]:
!head ../datasets/tennis-discrete-output.csv

Day,Outlook,Temp,Humidity,Windy,Play
D1,Sunny,Hot,High,Weak,No
D2,Sunny,Hot,High,Strong,No
D3,Overcast,Hot,High,Weak,Yes
D4,Rain,Mild,High,Weak,Yes
D5,Rain,Cool,Normal,Weak,Yes
D6,Rain,Cool,Normal,Strong,No
D7,Overcast,Cool,Normal,Strong,Yes
D8,Sunny,Mild,High,Weak,No
D9,Sunny,Cool,Normal,Weak,Yes


In [35]:
data = np.genfromtxt('../datasets/tennis-discrete-output.csv', delimiter=',')
data

array([[nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan]])

Wait! What happened?

In [38]:
data = np.genfromtxt('../datasets/tennis-discrete-output.csv', delimiter=',', dtype=str)
data

array([['Day', 'Outlook', 'Temp', 'Humidity', 'Windy', 'Play'],
       ['D1', 'Sunny', 'Hot', 'High', 'Weak', 'No'],
       ['D2', 'Sunny', 'Hot', 'High', 'Strong', 'No'],
       ['D3', 'Overcast', 'Hot', 'High', 'Weak', 'Yes'],
       ['D4', 'Rain', 'Mild', 'High', 'Weak', 'Yes'],
       ['D5', 'Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['D6', 'Rain', 'Cool', 'Normal', 'Strong', 'No'],
       ['D7', 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
       ['D8', 'Sunny', 'Mild', 'High', 'Weak', 'No'],
       ['D9', 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['D10', 'Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
       ['D11', 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
       ['D12', 'Overcast', 'Mild', 'High', 'Strong', 'Yes'],
       ['D13', 'Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
       ['D14', 'Rain', 'Mild', 'High', 'Strong', 'No']], dtype='<U8')

Question: Find the outlook on D11


In [46]:
idx = np.argwhere(data[:, 0] == 'D11')[0, 0]
idx

11

In [47]:
data[idx]

array(['D11', 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'], dtype='<U8')

In [48]:
data[idx][1]

'Sunny'

# Reading CSV file using Pandas

In [50]:
df = pd.read_csv('../datasets/tennis-discrete-output.csv')

In [51]:
df

Unnamed: 0,Day,Outlook,Temp,Humidity,Windy,Play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Sunny,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


In [52]:
df.query('Day == "D11"')['Outlook']

10    Sunny
Name: Outlook, dtype: object

In [53]:
df.shape

(14, 6)

Question. How many times do we play v/s not play tennis

In [56]:
df['Play'].value_counts()

Play
Yes    9
No     5
Name: count, dtype: int64

In [57]:
df.groupby('Play').size()

Play
No     5
Yes    9
dtype: int64

In [61]:
pd.crosstab(index=df['Play'], columns='count')

col_0,count
Play,Unnamed: 1_level_1
No,5
Yes,9


What is the distribution of any given attribute?

In [64]:
def distribution(df, attribute):
    return df[attribute].value_counts()

In [65]:
distribution(df, 'Outlook')

Outlook
Sunny       5
Rain        5
Overcast    4
Name: count, dtype: int64

In [67]:
distribution(df, 'Temp')

Temp
Mild    6
Hot     4
Cool    4
Name: count, dtype: int64

More on crosstab

In [69]:
pd.crosstab?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mcrosstab[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvalues[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrownames[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolnames[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maggfunc[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmargins[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmargins_name[0m[0;34m:[0m [0;34m'Hashable'[0m [0;34m=[0m [0;34m'All'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdropna[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnormalize[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)

In [76]:
pd.crosstab(index=df['Outlook'], columns=df['Play'])

Play,No,Yes
Outlook,Unnamed: 1_level_1,Unnamed: 2_level_1
Overcast,0,4
Rain,2,3
Sunny,3,2


In [78]:
pd.crosstab(index=df['Outlook'], columns=df['Play']).T

Outlook,Overcast,Rain,Sunny
Play,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,0,2,3
Yes,4,3,2


In [84]:
df_attr = pd.crosstab(index=df['Play'], columns=df['Outlook'], normalize='columns')
df_attr

Outlook,Overcast,Rain,Sunny
Play,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,0.0,0.4,0.6
Yes,1.0,0.6,0.4


In [89]:
df_attr["Entropy_Overcast"] = df_attr['Overcast'].apply(lambda x: -x * np.log2(x+1e-16))
df_attr

Outlook,Overcast,Rain,Sunny,Entropy_Overcast
Play,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.0,0.4,0.6,0.0
Yes,1.0,0.6,0.4,-0.0


In [85]:
df_attr_dist = distribution(df, 'Outlook')
df_attr_dist

Outlook
Sunny       5
Rain        5
Overcast    4
Name: count, dtype: int64

In [10]:
# Creating a sample CSV file
csv_data = """
Name,Age,Salary
John, 25, 50000
Alice, 30, 60000
Bob, 28, 55000
Vizz, 35, 65000
Kate, 35, 65000
Alex, 35, 25000
Uma, 25, 90000
"""

In [11]:
with open("../data/sample.csv", "w") as file:
    file.write(csv_data)

In [12]:
# Reading CSV into a Pandas DataFrame
df = pd.read_csv("../data/sample.csv")
df

Unnamed: 0,Name,Age,Salary
0,John,25,50000
1,Alice,30,60000
2,Bob,28,55000
3,Vizz,35,65000
4,Kate,35,65000
5,Alex,35,25000
6,Uma,25,90000


In [13]:
df['Salary']

0    50000
1    60000
2    55000
3    65000
4    65000
5    25000
6    90000
Name: Salary, dtype: int64

# Common stats using Pandas

In [14]:
# Mean
mean_salary = df['Salary'].mean()
print("Mean Salary:", mean_salary)

# Mode
mode_age = df['Age'].mode().values[0]
print("Mode Age:", mode_age)

Mean Salary: 58571.42857142857
Mode Age: 35


# Indexing including binary masks [akin to filter]

In [15]:
# Filtering data based on a condition
filtered_data = df[df['Age'] > 30]
print("Filtered Data:")
print(filtered_data)

Filtered Data:
   Name  Age  Salary
3  Vizz   35   65000
4  Kate   35   65000
5  Alex   35   25000


# Named axis operations

In [16]:
# Adding a new column to the DataFrame
df['Bonus'] = df['Salary'] * 0.1  # Adding a bonus column (10% of salary)
print("DataFrame with Bonus:")
print(df)

DataFrame with Bonus:
    Name  Age  Salary   Bonus
0   John   25   50000  5000.0
1  Alice   30   60000  6000.0
2    Bob   28   55000  5500.0
3   Vizz   35   65000  6500.0
4   Kate   35   65000  6500.0
5   Alex   35   25000  2500.0
6    Uma   25   90000  9000.0


# GroupBy

In [17]:
# Grouping data by Age and calculating mean salary for each group
grouped_data = df.groupby('Age')['Salary'].mean().reset_index()
print("Grouped Data:")
print(grouped_data)

Grouped Data:
   Age        Salary
0   25  70000.000000
1   28  55000.000000
2   30  60000.000000
3   35  51666.666667


# Value Counts

In [18]:
# Counting the occurrences of each age in the DataFrame
age_counts = df['Age'].value_counts()
print("Age Counts:")
print(age_counts)

Age Counts:
Age
35    3
25    2
30    1
28    1
Name: count, dtype: int64
