In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd
import time

# Lists

In [2]:
# Creating lists
list_a = [1, 2, 3, 4, 5]
list_b = [6, 7, 8, 9, 10]

In [3]:
# Operations on lists
# Adding lists
list_sum = [a + b for a, b in zip(list_a, list_b)]
print("List Sum:", list_sum)

# Vector product using lists    
vector_product = [a * b for a, b in zip(list_a, list_b)]
print("Vector Product:", vector_product)

List Sum: [7, 9, 11, 13, 15]
Vector Product: [6, 14, 24, 36, 50]


# Numpy Array

In [4]:
# Creating numpy arrays
numpy_array_a = np.array(list_a)
numpy_array_b = np.array(list_b)

In [5]:
# Operations on numpy arrays
# Adding numpy arrays
numpy_sum = numpy_array_a + numpy_array_b
print("Numpy Sum:", numpy_sum)

# Vector product using numpy arrays
numpy_vector_product = np.multiply(numpy_array_a, numpy_array_b)
print("Numpy Vector Product:", numpy_vector_product)

Numpy Sum: [ 7  9 11 13 15]
Numpy Vector Product: [ 6 14 24 36 50]


# Time comparison between list and numpy array

In [6]:
# Creating large arrays and lists for time comparison
numpy_array_a = np.random.randint(0, 100, size=10000)
numpy_array_b = np.random.randint(0, 100, size=10000)

list_a = list(numpy_array_a)
list_b = list(numpy_array_b)

In [7]:
# Time for list addition
start_time = time.time()
for _ in range(1000):
    list_sum = [a + b for a, b in zip(list_a, list_b)]
end_time = time.time()
print("Time taken for lists addition:", end_time - start_time)

# Time for numpy addition
start_time = time.time()
for _ in range(1000):
    numpy_sum = numpy_array_a + numpy_array_b
end_time = time.time()
print("Time taken for numpy addition:", end_time - start_time)

Time taken for lists addition: 0.6483831405639648
Time taken for numpy addition: 0.0


In [8]:
# Time for list vector product
start_time = time.time()
for _ in range(10000):
    list_product = [a * b for a, b in zip(list_a, list_b)]

end_time = time.time()
print("Time taken for list vector product:", end_time - start_time)

# Time for numpy vector product 
start_time = time.time()
for _ in range(10000):
    numpy_product = np.multiply(numpy_array_a, numpy_array_b)

end_time = time.time()
print("Time taken for numpy vector product:", end_time - start_time)

Time taken for list vector product: 6.146199464797974
Time taken for numpy vector product: 0.08333706855773926


# Code clarity

In [9]:
# Numpy code is often more concise and readable than list comprehensions
# Example: Calculate the element-wise product of two lists
list_product = [a * b for a, b in zip(list_a, list_b)]
numpy_product = np.multiply(numpy_array_a, numpy_array_b)

# Reading CSV file using Pandas

In [10]:
# Creating a sample CSV file
csv_data = """
Name,Age,Salary
John, 25, 50000
Alice, 30, 60000
Bob, 28, 55000
Vizz, 35, 65000
Kate, 35, 65000
Alex, 35, 25000
Uma, 25, 90000
"""

In [11]:
with open("../data/sample.csv", "w") as file:
    file.write(csv_data)

In [12]:
# Reading CSV into a Pandas DataFrame
df = pd.read_csv("../data/sample.csv")
df

Unnamed: 0,Name,Age,Salary
0,John,25,50000
1,Alice,30,60000
2,Bob,28,55000
3,Vizz,35,65000
4,Kate,35,65000
5,Alex,35,25000
6,Uma,25,90000


In [13]:
df['Salary']

0    50000
1    60000
2    55000
3    65000
4    65000
5    25000
6    90000
Name: Salary, dtype: int64

# Common stats using Pandas

In [14]:
# Mean
mean_salary = df['Salary'].mean()
print("Mean Salary:", mean_salary)

# Mode
mode_age = df['Age'].mode().values[0]
print("Mode Age:", mode_age)

Mean Salary: 58571.42857142857
Mode Age: 35


# Indexing including binary masks [akin to filter]

In [15]:
# Filtering data based on a condition
filtered_data = df[df['Age'] > 30]
print("Filtered Data:")
print(filtered_data)

Filtered Data:
   Name  Age  Salary
3  Vizz   35   65000
4  Kate   35   65000
5  Alex   35   25000


# Named axis operations

In [16]:
# Adding a new column to the DataFrame
df['Bonus'] = df['Salary'] * 0.1  # Adding a bonus column (10% of salary)
print("DataFrame with Bonus:")
print(df)

DataFrame with Bonus:
    Name  Age  Salary   Bonus
0   John   25   50000  5000.0
1  Alice   30   60000  6000.0
2    Bob   28   55000  5500.0
3   Vizz   35   65000  6500.0
4   Kate   35   65000  6500.0
5   Alex   35   25000  2500.0
6    Uma   25   90000  9000.0


# GroupBy

In [17]:
# Grouping data by Age and calculating mean salary for each group
grouped_data = df.groupby('Age')['Salary'].mean().reset_index()
print("Grouped Data:")
print(grouped_data)

Grouped Data:
   Age        Salary
0   25  70000.000000
1   28  55000.000000
2   30  60000.000000
3   35  51666.666667


# Value Counts

In [18]:
# Counting the occurrences of each age in the DataFrame
age_counts = df['Age'].value_counts()
print("Age Counts:")
print(age_counts)

Age Counts:
Age
35    3
25    2
30    1
28    1
Name: count, dtype: int64
