Some typical NumPy 'random' functions:

In [None]:
import numpy as np

# To generate random float values between 0 (inclusive) and 1 (exclusive)
print(np.random.rand(3, 2))  # Creates a 3 x 2 matrix of random floats in [0, 1)

# To generate random float values from the standard normal distribution (mean = 0 and std = 1)
print(np.random.randn(5))  # Generates 5 random float values from the standard normal distribution

# To generate random integer values within a range of values
print(np.random.randint(1, 100, 10))  # Generates 10 random integers from 1 (inclusive) to 100 (exclusive)

# To randomly select elements from a given list of elements
print(np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9], size=3))  # Three random numbers are chosen from the given list

# To generate a random sample from a normal distribution with a specified mean and standard deviation
print(np.random.normal(loc=0, scale=1, size=10))  # 10 values drawn with mean 0 (loc) and standard deviation 1 (scale)

# To set a seed so that the random numbers generated AFTER this call are the same on every run.
# seed() returns None, so it is called on its own instead of being wrapped in print().
# The seed value 42 is arbitrary; any other integer works.
np.random.seed(42)

AIM #1: Generate a very large dataset
1. Generate a dataset of 1 million random data items between 1 and 100 using only pandas
2. Generate a dataset of 1 million random data items between 1 and 100 using only NumPy
3. Calculate the time it takes for both the above operations. 
    3.1. Import the 'time' module, and use the time() function to calculate current time
    3.2. Which one is faster and why?

In [None]:
1.
import pandas as pd

# Number of random items to generate
n_items = 1_000_000

# Draw n_items values, with replacement, from the integers 1..100 (both inclusive).
# Series.sample does the whole draw in one pandas call, so no per-element
# callback and no direct NumPy call is needed.
data = pd.Series(range(1, 101)).sample(n=n_items, replace=True)

# sample() keeps the index labels of the rows it picked; renumber them 0..n_items-1
data = data.reset_index(drop=True)

# Create a pandas DataFrame with a single named column
df = data.to_frame(name='Random Data')

# Display the first few rows of the DataFrame
print(df.head())

2.
import numpy as np

# Optional: pin the global NumPy seed so every run produces the same draws
np.random.seed(0)

# A single call yields the whole array: 1,000,000 integers,
# lower bound 1 included, upper bound 101 excluded (i.e. values 1..100)
data = np.random.randint(low=1, high=101, size=1_000_000)

3.
import time

import numpy as np
import pandas as pd

# Number of random items generated by each library
N_ITEMS = 1_000_000

# ---- pandas ----
# Start time
start_time = time.time()

# Generate 1 million random integers in 1..100 using only pandas:
# sample with replacement from the 100 candidate values in one call
data_pandas = pd.Series(range(1, 101)).sample(n=N_ITEMS, replace=True).reset_index(drop=True)

# End time
end_time = time.time()

# Calculate elapsed time
pandas_time = end_time - start_time
print(f"Time taken by Pandas: {pandas_time} seconds")

# ---- NumPy ----
# Start time
start_time = time.time()

# Generate 1 million random integers in 1..100 using NumPy (upper bound 101 is exclusive)
data_numpy = np.random.randint(1, 101, size=N_ITEMS)

# End time
end_time = time.time()

# Calculate elapsed time
numpy_time = end_time - start_time
print(f"Time taken by NumPy: {numpy_time} seconds")




AIM #2: Basic statistics
For the given dataset on sleep health and lifestyle, do the following
1. Using only pandas, load the dataset, calculate mean 'Sleep Duration', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Heart Rate' and 'Daily Steps'.
2. Do the same as in Step 1 using only NumPy
3. Using only pandas, first calculate correlation (across only the numerical variables), and then separate correlation between...
    Sleep duration and Age
    Sleep duration and Heart rate
    Sleep duration and Daily steps
4. Using only NumPy, do the same as Step 3
5. Using pandas only, calculate standard deviation for 'Sleep Duration'. 
6. Using NumPy only, calculate standard deviation for 'Sleep Duration'. 
7. Calculate the time difference between using pandas and NumPy, right from the step of loading the dataset to the final standard deviation step. 
    7.1. Which one is faster and why?

In [None]:
1.
import pandas as pd
from io import StringIO

# Inline CSV text standing in for the dataset file.
# The "..." line marks the rows left out of this excerpt.
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
...
373,Female,59,Nurse,8.1,9,75,3,Overweight,140,95,68,7000,Sleep Apnea
374,Female,59,Nurse,8.1,9,75,3,Overweight,140,95,68,7000,Sleep Apnea
"""

# Wrap the text in a file-like buffer and let pandas parse it
csv_buffer = StringIO(data)
df = pd.read_csv(csv_buffer)

# Per-column means, looked up one column at a time
column_means = {
    column: df[column].mean()
    for column in (
        'Sleep Duration',
        'Systolic blood pressure',
        'Diastolic blood pressure',
        'Heart Rate',
        'Daily Steps',
    )
}
mean_sleep_duration = column_means['Sleep Duration']
mean_systolic_bp = column_means['Systolic blood pressure']
mean_diastolic_bp = column_means['Diastolic blood pressure']
mean_heart_rate = column_means['Heart Rate']
mean_daily_steps = column_means['Daily Steps']

# Report the results
print(f"Mean Sleep Duration: {mean_sleep_duration}")
print(f"Mean Systolic Blood Pressure: {mean_systolic_bp}")
print(f"Mean Diastolic Blood Pressure: {mean_diastolic_bp}")
print(f"Mean Heart Rate: {mean_heart_rate}")
print(f"Mean Daily Steps: {mean_daily_steps}")

2.
import numpy as np

# Loading the CSV file as a structured array in NumPy.
# With names=True the field names are taken from the first line that is NOT
# skipped, so the header row must be kept (no skip_header); skipping it would
# turn the first data row into the field names.
data = np.genfromtxt('sleep_health.csv', delimiter=',', dtype=None, encoding='utf-8', names=True)

# Create separate arrays for the individual columns that you want to operate on.
# genfromtxt sanitises field names: spaces in the CSV header become underscores,
# e.g. the 'Sleep Duration' column is accessed as 'Sleep_Duration'.
sleep_duration = data['Sleep_Duration'].astype(float)
systolic_blood_pressure = data['Systolic_blood_pressure'].astype(float)
diastolic_blood_pressure = data['Diastolic_blood_pressure'].astype(float)
heart_rate = data['Heart_Rate'].astype(float)
daily_steps = data['Daily_Steps'].astype(float)

# Calculate the mean for each column
mean_sleep_duration = np.mean(sleep_duration)
mean_systolic_blood_pressure = np.mean(systolic_blood_pressure)
mean_diastolic_blood_pressure = np.mean(diastolic_blood_pressure)
mean_heart_rate = np.mean(heart_rate)
mean_daily_steps = np.mean(daily_steps)

# Print the means
print(f"Mean Sleep Duration: {mean_sleep_duration:.2f}")
print(f"Mean Systolic Blood Pressure: {mean_systolic_blood_pressure:.2f}")
print(f"Mean Diastolic Blood Pressure: {mean_diastolic_blood_pressure:.2f}")
print(f"Mean Heart Rate: {mean_heart_rate:.2f}")
print(f"Mean Daily Steps: {mean_daily_steps:.2f}")

3.
import pandas as pd
from io import StringIO

# Since the dataset is provided as a string, we'll use StringIO to simulate a file.
# Only the first five rows are shown here; append the remaining rows (or read the
# real CSV file) for the full analysis. No placeholder text is kept inside the
# string, because read_csv would parse it as a data row.
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Load the data into a pandas DataFrame
df = pd.read_csv(StringIO(data))

# Calculate the correlation matrix across the numerical variables only.
# Text columns (Gender, Occupation, ...) are dropped first, because recent
# pandas versions raise when DataFrame.corr() meets non-numeric data.
corr_matrix = df.select_dtypes(include='number').corr()

# Print the correlation matrix
print("Correlation matrix:\n", corr_matrix)

# Calculate the specific correlations
corr_sleep_duration_age = df['Sleep Duration'].corr(df['Age'])
corr_sleep_duration_hr = df['Sleep Duration'].corr(df['Heart Rate'])
corr_sleep_duration_steps = df['Sleep Duration'].corr(df['Daily Steps'])

# Print the specific correlations
print(f"Correlation between Sleep Duration and Age: {corr_sleep_duration_age:.2f}")
print(f"Correlation between Sleep Duration and Heart Rate: {corr_sleep_duration_hr:.2f}")
print(f"Correlation between Sleep Duration and Daily Steps: {corr_sleep_duration_steps:.2f}")

4.
import numpy as np

# Sample data as a string (replace this with the actual data file).
# Row 5 uses Sleep Duration 5.9 so that it matches the same row in the other
# copies of this sample in the notebook.
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Use StringIO to simulate a file object from the string.
# strip() removes the blank first line of the literal, so that skip_header=1
# below drops the real header row instead of that blank line.
from io import StringIO
data = StringIO(data.strip())

# Load the data into a 2-D float array; text columns cannot be converted
# and come out as NaN, which is fine because they are not used below.
data = np.genfromtxt(data, delimiter=',', skip_header=1)

# Select only the numerical columns that are needed.
# Zero-based column positions: Age (2), Sleep Duration (4), Heart Rate (11), Daily Steps (12)
age = data[:, 2].astype(float)
sleep_duration = data[:, 4].astype(float)
heart_rate = data[:, 11].astype(float)
daily_steps = data[:, 12].astype(float)

# Calculate the correlation matrix for the selected numerical columns.
# np.corrcoef takes ONE 2-D input whose rows are the variables; extra positional
# arguments would be read as its other parameters, so the arrays are stacked first.
# Row/column order of the result: 0 = Age, 1 = Sleep Duration, 2 = Heart Rate, 3 = Daily Steps
corr_matrix = np.corrcoef(np.vstack([age, sleep_duration, heart_rate, daily_steps]))

# Print the correlation matrix
print("Correlation matrix:\n", corr_matrix)

# Calculate the specific correlations
# Correlation between Sleep Duration and Age
corr_sleep_duration_age = corr_matrix[0, 1]

# Correlation between Sleep Duration and Heart Rate
corr_sleep_duration_hr = corr_matrix[1, 2]

# Correlation between Sleep Duration and Daily Steps
corr_sleep_duration_steps = corr_matrix[1, 3]

# Print the specific correlations
print(f"Correlation between Sleep Duration and Age: {corr_sleep_duration_age:.2f}")
print(f"Correlation between Sleep Duration and Heart Rate: {corr_sleep_duration_hr:.2f}")
print(f"Correlation between Sleep Duration and Daily Steps: {corr_sleep_duration_steps:.2f}")

5.
import pandas as pd
from io import StringIO

# Inline excerpt of the dataset (swap in the real data file when available)
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Parse the text through a file-like buffer
csv_buffer = StringIO(data)
df = pd.read_csv(csv_buffer)

# Standard deviation of the 'Sleep Duration' column (pandas default settings)
sleep_column = df['Sleep Duration']
std_sleep_duration = sleep_column.std()

# Show the result rounded to two decimals
print(f"Standard Deviation of Sleep Duration: {std_sleep_duration:.2f}")

6.
import numpy as np
from io import StringIO

# Sample data as a string (replace this with the actual data file)
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Use StringIO to simulate a file object from the string.
# strip() removes the blank first line of the literal, so that skip_header=1
# below drops the real header row; otherwise the header is parsed as a row of
# NaN values and the standard deviation becomes NaN.
data = StringIO(data.strip())

# Load the data into a NumPy array (text columns come out as NaN and are not used)
np_data = np.genfromtxt(data, delimiter=',', skip_header=1)

# Select the 'Sleep Duration' column, which is the 5th column (index 4)
sleep_duration = np_data[:, 4].astype(float)

# Calculate the standard deviation for 'Sleep Duration'.
# ddof=1 gives the sample standard deviation, which is what pandas' .std()
# computes by default, so the NumPy and pandas answers are comparable.
std_sleep_duration = np.std(sleep_duration, ddof=1)

# Print the standard deviation
print(f"Standard Deviation of Sleep Duration: {std_sleep_duration:.2f}")

7.
import pandas as pd
import numpy as np
from io import StringIO
import time

# Sample data as a string (replace this with the actual data file)
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Use StringIO to simulate a file object from the string.
# strip() removes the blank first line so the header is the first line for both loaders.
data = StringIO(data.strip())

# Time pandas operation (perf_counter is a high-resolution clock meant for short intervals)
start_time_pandas = time.perf_counter()

# Load the data into a pandas DataFrame
df = pd.read_csv(data)

# Calculate the standard deviation for 'Sleep Duration' using pandas
std_sleep_duration_pandas = df['Sleep Duration'].std()

# Calculate the time taken by pandas
time_pandas = time.perf_counter() - start_time_pandas
# Six decimals: on a tiny sample the elapsed time is far below 0.01 s
print(f"Pandas operation took {time_pandas:.6f} seconds")
print(f"Standard Deviation of Sleep Duration (Pandas): {std_sleep_duration_pandas:.2f}")

# Reset the StringIO object to the beginning for reuse
data.seek(0)

# Time NumPy operation
start_time_numpy = time.perf_counter()

# Load the data into a NumPy array; skip_header=1 drops the header row
np_data = np.genfromtxt(data, delimiter=',', skip_header=1)

# Select the 'Sleep Duration' column, which is the 5th column (index 4)
sleep_duration = np_data[:, 4].astype(float)

# Calculate the standard deviation for 'Sleep Duration' using NumPy.
# ddof=1 matches the sample standard deviation that pandas' .std() returns by default.
std_sleep_duration_numpy = np.std(sleep_duration, ddof=1)

# Calculate the time taken by NumPy
time_numpy = time.perf_counter() - start_time_numpy
print(f"NumPy operation took {time_numpy:.6f} seconds")
print(f"Standard Deviation of Sleep Duration (NumPy): {std_sleep_duration_numpy:.2f}")
# Write your code for AIM #2 here

# Loading the CSV file as an array in NumPy.
# names=True is passed together with skip_header=0, i.e. no line is skipped
# before the field names are read.
data = np.genfromtxt('sleep_health.csv', delimiter=',', skip_header=0, dtype=None, encoding='utf-8', names=True)


# Create separate arrays for the individual columns that you want to operate on
# For example, to create a separate array for the 'Age' column
age = data['Age'].astype(int)






# Calculate the correlation in NumPy using np.corrcoef()

# The correlation coefficient is located in the off-diagonal elements [0,1] or [1,0]


# Use np.std() to calculate the standard deviation



AIM #3: Use suitable plots to visualize the data

1. Using only pandas (and matplotlib/seaborn if necessary) plot the distribution for
    1.1. Age
    1.2. Sleep Duration
    1.3. Quality of Sleep
    1.4. Physical Activity Level
    1.5. Stress Level
    1.6. Heart Rate
2. Using only NumPy, do the same as Step 1. You will need matplotlib for this
3. Using only pandas, use the appropriate plot to
    3.1. See the distribution of 'Sleep Duration' based on 'Quality of Sleep'
    3.2. See the distribution of 'Sleep Duration' based on 'Stress Level'
    3.3. See the distribution of 'Sleep Duration' based on 'Physical Activity Level'
    3.4. See the distribution of 'Sleep Duration' based on 'Occupation'
    3.5. See the distribution of 'Sleep Duration' based on 'BMI'
4. Using only NumPy, do the same as Step 3. You will need matplotlib for this
5. Using only pandas, use a suitable plot to see the relation between
    5.1. Age and Sleep Duration
    5.2. Sleep Duration and Heart Rate
    5.3. Heart Rate and Daily Steps
    5.4. Sleep Duration and Daily Steps
6. Using only NumPy, do the same as Step 5. You will need matplotlib for this 
7. Find the time difference between plotting using only pandas, and plotting using NumPy

In [None]:
1.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# pd.compat.StringIO is not available in current pandas; use the standard library instead
from io import StringIO

# Sample data as a string (replace this with the actual data file)
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Load the data into a pandas DataFrame
df = pd.read_csv(StringIO(data))

# Set the style for seaborn
sns.set_style('whitegrid')

# (column to plot, x-axis label) for each requested distribution, in task order 1.1 - 1.6
distribution_columns = [
    ('Age', 'Age'),
    ('Sleep Duration', 'Sleep Duration (hours)'),
    ('Quality of Sleep', 'Quality of Sleep'),
    ('Physical Activity Level', 'Physical Activity Level'),
    ('Stress Level', 'Stress Level'),
    ('Heart Rate', 'Heart Rate (bpm)'),
]

# One histogram (with a KDE curve) per column, each in its own figure
for column, x_label in distribution_columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.show()

2.
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO

# Sample data as a string (replace this with the actual data file)
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Use StringIO to simulate a file object from the string.
# strip() removes the blank first line of the literal, so that skip_header=1
# below drops the real header row instead of parsing it as a row of NaN values.
data = StringIO(data.strip())

# Load the data into a NumPy float array (text columns come out as NaN and are not used)
np_data = np.genfromtxt(data, delimiter=',', skip_header=1)

# Extract the necessary columns by zero-based position in the header
age = np_data[:, 2].astype(float)
sleep_duration = np_data[:, 4].astype(float)
quality_of_sleep = np_data[:, 5].astype(float)
physical_activity_level = np_data[:, 6].astype(float)
stress_level = np_data[:, 7].astype(float)
heart_rate = np_data[:, 11].astype(float)

# Function to plot the distribution using histograms
def plot_distribution(data, title, bins=20, xlabel='Value'):
    """Show a histogram of ``data`` in a new 10 x 6 figure.

    Args:
        data: Values to bin; passed straight to ``plt.hist``.
        title: Text placed above the plot.
        bins: Number of histogram bins (default 20).
        xlabel: Label for the x-axis. Defaults to ``'Value'``, the label this
            function always used before the parameter existed.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(data, bins=bins, edgecolor='black')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.show()

# Draw one histogram per variable, in the order requested by the task
# (Age, Sleep Duration, Quality of Sleep, Physical Activity Level,
# Stress Level, Heart Rate).
histogram_jobs = (
    (age, 'Distribution of Age'),
    (sleep_duration, 'Distribution of Sleep Duration'),
    (quality_of_sleep, 'Distribution of Quality of Sleep'),
    (physical_activity_level, 'Distribution of Physical Activity Level'),
    (stress_level, 'Distribution of Stress Level'),
    (heart_rate, 'Distribution of Heart Rate'),
)
for values, heading in histogram_jobs:
    plot_distribution(values, heading)

3.
import pandas as pd
import matplotlib.pyplot as plt

# pd.compat.StringIO is not available in current pandas; use the standard library instead
from io import StringIO

# Sample data as a string (replace this with the actual data file)
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Load the data into a pandas DataFrame
df = pd.read_csv(StringIO(data))

# Grouping columns for tasks 3.1 - 3.5: 'Sleep Duration' is box-plotted once per column
grouping_columns = [
    'Quality of Sleep',
    'Stress Level',
    'Physical Activity Level',
    'Occupation',
    'BMI Category',
]

for group_column in grouping_columns:
    # df.boxplot(by=...) opens its own figure, so the size is passed to it directly;
    # a separate plt.figure() call beforehand would only leave an empty window.
    df.boxplot(column='Sleep Duration', by=group_column, figsize=(10, 6))
    plt.title(f'Distribution of Sleep Duration by {group_column}')
    plt.suptitle('')  # Suppress the automatic title
    plt.xlabel(group_column)
    plt.ylabel('Sleep Duration (hours)')
    plt.show()

4.
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO

# Sample data as a string (replace this with the actual data file)
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Use StringIO to simulate a file object from the string.
# strip() removes the blank first line so the header is the first line read.
data = StringIO(data.strip())

# Load the data into a structured NumPy array.
# dtype=None lets each column keep its own type, which is needed here because
# Occupation and BMI Category are text: a plain float load would turn them into NaN.
# names=True reads the field names from the header row.
np_data = np.genfromtxt(data, delimiter=',', dtype=None, encoding='utf-8', names=True)

# Extract the necessary columns.
# genfromtxt replaces the spaces in header names with underscores.
sleep_duration = np_data['Sleep_Duration'].astype(float)
quality_of_sleep = np_data['Quality_of_Sleep'].astype(int)
stress_level = np_data['Stress_Level'].astype(int)
physical_activity_level = np_data['Physical_Activity_Level'].astype(int)
occupation = np_data['Occupation'].astype(str)
bmi_category = np_data['BMI_Category'].astype(str)

# Create a function to plot boxplots and violinplots
def plot_categorical_distribution(x, groups, labels, title, plot_type='box'):
    """Plot the distribution of ``x`` separately for every distinct value in ``groups``.

    Args:
        x: Numeric values to plot (one per observation).
        groups: Group value of each observation; same length as ``x``.
        labels: Tick labels, one per distinct group value, given in the order
            ``np.unique(groups)`` returns them.
        title: Text placed above the plot.
        plot_type: ``'box'`` for box plots or ``'violin'`` for violin plots.
            Any other value draws an empty, labelled figure.
    """
    x = np.asarray(x)
    groups = np.asarray(groups)

    # One array of x-values per distinct group value, in np.unique order
    grouped = [x[groups == value] for value in np.unique(groups)]

    plt.figure(figsize=(10, 6))
    if plot_type == 'box':
        plt.boxplot(grouped, notch=True)
    elif plot_type == 'violin':
        plt.violinplot(grouped, showmeans=True, showmedians=True)

    # Both plot kinds place group i at x-position i + 1. Labelling the ticks here
    # works for both, since violinplot does not accept a labels argument.
    plt.xticks(range(1, len(grouped) + 1), labels)
    plt.title(title)
    plt.ylabel('Sleep Duration (hours)')
    plt.show()

# Distinct values of every grouping variable (also used to build the tick labels)
unique_quality = np.unique(quality_of_sleep)
unique_stress = np.unique(stress_level)
unique_activity = np.unique(physical_activity_level)
unique_occupation = np.unique(occupation)
unique_bmi = np.unique(bmi_category)

# Tasks 3.1 - 3.5 as (grouping values, tick labels, title, plot kind)
categorical_jobs = (
    # 3.1. 'Sleep Duration' based on 'Quality of Sleep'
    (quality_of_sleep,
     ['QoS ' + str(q) for q in unique_quality],
     'Distribution of Sleep Duration by Quality of Sleep',
     'violin'),
    # 3.2. 'Sleep Duration' based on 'Stress Level'
    (stress_level,
     ['Stress ' + str(s) for s in unique_stress],
     'Distribution of Sleep Duration by Stress Level',
     'violin'),
    # 3.3. 'Sleep Duration' based on 'Physical Activity Level'
    (physical_activity_level,
     ['Activity ' + str(a) for a in unique_activity],
     'Distribution of Sleep Duration by Physical Activity Level',
     'violin'),
    # 3.4. 'Sleep Duration' based on 'Occupation'
    (occupation,
     unique_occupation,
     'Distribution of Sleep Duration by Occupation',
     'box'),
    # 3.5. 'Sleep Duration' based on 'BMI'
    (bmi_category,
     unique_bmi,
     'Distribution of Sleep Duration by BMI Category',
     'box'),
)

for grouping, tick_labels, heading, kind in categorical_jobs:
    plot_categorical_distribution(sleep_duration, grouping, tick_labels, heading, kind)

5.
import pandas as pd
import matplotlib.pyplot as plt

# pd.compat.StringIO is not available in current pandas; use the standard library instead
from io import StringIO

# Sample data as a string (replace this with the actual data file)
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Load the data into a pandas DataFrame
df = pd.read_csv(StringIO(data))

# Axis labels (with units) for the columns that appear in the scatter plots
axis_labels = {
    'Age': 'Age',
    'Sleep Duration': 'Sleep Duration (hours)',
    'Heart Rate': 'Heart Rate (bpm)',
    'Daily Steps': 'Daily Steps',
}

# (x column, y column) for tasks 5.1 - 5.4
scatter_pairs = [
    ('Age', 'Sleep Duration'),
    ('Sleep Duration', 'Heart Rate'),
    ('Heart Rate', 'Daily Steps'),
    ('Sleep Duration', 'Daily Steps'),
]

# One scatter plot per pair, each in its own figure
for x_column, y_column in scatter_pairs:
    plt.figure(figsize=(10, 6))
    plt.scatter(df[x_column], df[y_column])
    plt.title(f'Relation between {x_column} and {y_column}')
    plt.xlabel(axis_labels[x_column])
    plt.ylabel(axis_labels[y_column])
    plt.show()

6.
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO

# Sample data as a string (replace this with the actual data file)
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Use StringIO to simulate a file object from the string.
# strip() removes the blank first line of the literal, so that skip_header=1
# below drops the real header row instead of parsing it as a row of NaN values.
data = StringIO(data.strip())

# Load the data into a NumPy float array (text columns come out as NaN and are not used)
np_data = np.genfromtxt(data, delimiter=',', skip_header=1)

# Extract the necessary columns by zero-based position in the header
age = np_data[:, 2].astype(float)
sleep_duration = np_data[:, 4].astype(float)
heart_rate = np_data[:, 11].astype(float)
daily_steps = np_data[:, 12].astype(float)

# Helper that draws a single labelled scatter plot
def plot_scatter(x, y, title, xlabel, ylabel):
    """Scatter ``y`` against ``x`` in a new 10 x 6 figure and display it.

    Points are drawn half-transparent (alpha 0.5); ``title``, ``xlabel`` and
    ``ylabel`` are applied to the plot before it is shown.
    """
    plt.figure(figsize=(10, 6))
    plt.scatter(x, y, alpha=0.5)
    # Apply the three text decorations through their matching pyplot setters
    for apply_text, text in ((plt.title, title), (plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        apply_text(text)
    plt.show()

# Tasks 5.1 - 5.4 as (x values, y values, title, x-axis label, y-axis label)
scatter_jobs = (
    # 5.1. 'Age' and 'Sleep Duration'
    (age, sleep_duration, 'Relation between Age and Sleep Duration', 'Age', 'Sleep Duration (hours)'),
    # 5.2. 'Sleep Duration' and 'Heart Rate'
    (sleep_duration, heart_rate, 'Relation between Sleep Duration and Heart Rate', 'Sleep Duration (hours)', 'Heart Rate (bpm)'),
    # 5.3. 'Heart Rate' and 'Daily Steps'
    (heart_rate, daily_steps, 'Relation between Heart Rate and Daily Steps', 'Heart Rate (bpm)', 'Daily Steps'),
    # 5.4. 'Sleep Duration' and 'Daily Steps'
    (sleep_duration, daily_steps, 'Relation between Sleep Duration and Daily Steps', 'Sleep Duration (hours)', 'Daily Steps'),
)
for job in scatter_jobs:
    plot_scatter(*job)

7.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from io import StringIO

# Sample data as a string (replace this with the actual data file)
data = """
Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic blood pressure,Diastolic blood pressure,Heart Rate,Daily Steps,Sleep Disorder
1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,77,4200,None
2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,None
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,85,3000,Sleep Apnea
"""

# Load the data into a pandas DataFrame
df = pd.read_csv(StringIO(data))

# Time pandas plotting (perf_counter is a high-resolution clock meant for short intervals).
# NOTE: the measured interval includes plt.show(); with an interactive backend
# that also counts the time the window stays open.
start_time_pandas = time.perf_counter()

# Pandas plot for 'Age' and 'Sleep Duration'
df.plot(kind='scatter', x='Age', y='Sleep Duration', figsize=(10, 6))
plt.show()

time_pandas = time.perf_counter() - start_time_pandas
print(f"Pandas plotting took {time_pandas:.4f} seconds")

# `data` is a plain string, not a file object, so there is nothing to rewind:
# a fresh StringIO is created for NumPy instead. strip() removes the blank first
# line so that skip_header=1 drops the real header row.
np_data = np.genfromtxt(StringIO(data.strip()), delimiter=',', skip_header=1)

# Extract the necessary columns
age = np_data[:, 2].astype(float)
sleep_duration = np_data[:, 4].astype(float)

# Time NumPy plotting
start_time_numpy = time.perf_counter()

# NumPy plot for 'Age' and 'Sleep Duration' (same figure size as the pandas plot)
plt.figure(figsize=(10, 6))
plt.scatter(age, sleep_duration)
plt.title('Relation between Age and Sleep Duration')
plt.xlabel('Age')
plt.ylabel('Sleep Duration (hours)')
plt.show()

time_numpy = time.perf_counter() - start_time_numpy
print(f"NumPy plotting took {time_numpy:.4f} seconds")



# Write your code for AIM #3 here



#To count unique values and their total count in a NumPy array, use 
# np.unique(array_name, return_counts=True)--- This will return two values, one is the unique values and the other is the total count of each unique value
# You will need this for plotting bar plots




# For box plots in NumPy you will need to 
# create a list of variable1, grouped by the unique value of some variable2 (nominal or ordinal), 
# Syntax: variable1_list = [variable1[variable2 == i] for i in np.unique(variable2)]


AIM #4: Other possible plotting

1. Think of other possible plots to show some interesting distribution and relations. Do this using both pandas and NumPy

