<a href="https://colab.research.google.com/github/nagesh0024/statistics/blob/main/Statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Q1. Generate a list of 100 integers containing values between 90 to 130 and store it in the variable int_list. After generating the list, find the following:

###(i) Write a Python function to calculate the mean of a given list of numbers. Create a function to find the median of a list of numbers.

In [45]:
# Draw 100 random integers in [90, 130) -- np.random.randint excludes the upper
# bound -- and store them as a plain Python list, as the exercise asks.
# .tolist() yields arbitrary-precision Python ints, so later products (e.g. for
# the geometric mean) cannot wrap around the way fixed-width NumPy ints do.
int_list = np.random.randint(90, 130, 100).tolist()

def calculate_mean(numbers):
  """Return the arithmetic mean of ``numbers``, or None when it is empty.

  Args:
    numbers: A sized iterable of numeric values (list, tuple, NumPy array).

  Returns:
    ``sum(numbers) / len(numbers)``, or None for an empty input.
  """
  count = len(numbers)
  # len() is used instead of truthiness so NumPy arrays are handled too.
  if count == 0:
    return None
  total = sum(numbers)
  return total / count

def calculate_median(numbers):
  """Return the median of ``numbers``, or None when it is empty.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    The middle element of the sorted values for an odd count, or the average
    of the two middle elements for an even count.
  """
  if len(numbers) == 0:
    return None

  ordered = sorted(numbers)
  half, is_odd = divmod(len(ordered), 2)

  if is_odd:
    return ordered[half]
  # Even count: average the two values straddling the midpoint.
  return (ordered[half - 1] + ordered[half]) / 2

# Mean and median of int_list, kept as notebook-level variables.
mean = calculate_mean(int_list)
median = calculate_median(int_list)

###(ii) Develop a program to compute the mode of a list of integers.

In [46]:
from collections import Counter

def calculate_mode(numbers):
  """Return every value that occurs most often in ``numbers``.

  Args:
    numbers: A sized iterable of hashable values.

  Returns:
    A list of all values tied for the highest frequency, in order of first
    appearance, or None when ``numbers`` is empty.
  """
  if len(numbers) == 0:
    return None

  tally = Counter(numbers)
  highest = max(tally.values())

  # Iterating the Counter yields keys in first-seen order.
  return [value for value in tally if tally[value] == highest]

###(iii) Implement a function to calculate the weighted mean of a list of values and their corresponding weights.

In [47]:
def calculate_weighted_mean(values, weights):
  """Return the weighted mean of ``values`` using ``weights``.

  Args:
    values: A sized iterable of numeric values.
    weights: A sized iterable of weights, one per value.

  Returns:
    ``sum(v * w) / sum(w)``, or None when the inputs are empty, differ in
    length, or the weights sum to zero (the mean is undefined in that case).
  """
  if len(values) != len(weights) or len(values) == 0:
    return None

  total_weight = sum(weights)
  # A zero total weight would otherwise raise ZeroDivisionError.
  if total_weight == 0:
    return None

  weighted_sum = sum(value * weight for value, weight in zip(values, weights))
  return weighted_sum / total_weight

###(iv) Write a Python function to find the geometric mean of a list of positive numbers.

In [48]:
import math

def calculate_geometric_mean(numbers):
  """Return the geometric mean of a collection of non-negative numbers.

  The mean is computed as ``exp(mean(log(x)))`` rather than as the n-th root
  of the raw product, so long inputs do not overflow.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    The geometric mean as a float; 0.0 when any value is zero; None when the
    input is empty or contains a negative value (the geometric mean is not
    defined for negative data).
  """
  if len(numbers) == 0:
    return None
  if any(x < 0 for x in numbers):
    return None
  # A single zero makes the product, and therefore the mean, zero.
  if any(x == 0 for x in numbers):
    return 0.0

  mean_log = math.fsum(math.log(x) for x in numbers) / len(numbers)
  return math.exp(mean_log)

###(v) Create a program to calculate the harmonic mean of a list of values.

In [49]:
def calculate_harmonic_mean(numbers):
  """Return the harmonic mean of ``numbers``.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    ``n / sum(1 / x)``. Returns 0.0 when any value is zero (the same
    convention as ``statistics.harmonic_mean``), and None when the input is
    empty or the reciprocals sum to zero, which leaves the mean undefined.
  """
  if len(numbers) == 0:
    return None
  # 1 / 0 would raise; the limiting value of the harmonic mean is zero.
  if any(x == 0 for x in numbers):
    return 0.0

  reciprocal_sum = sum(1 / x for x in numbers)
  if reciprocal_sum == 0:
    return None
  return len(numbers) / reciprocal_sum

###(vi) Build a function to determine the midrange of a list of numbers (average of the minimum and maximum).

In [50]:
def calculate_midrange(numbers):
  """Return the midrange (average of minimum and maximum) of ``numbers``.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    ``(min + max) / 2``, or None when ``numbers`` is empty.
  """
  if len(numbers) == 0:
    return None
  lowest = min(numbers)
  highest = max(numbers)
  return (lowest + highest) / 2

###(vii) Implement a Python program to find the trimmed mean of a list, excluding a certain percentage of outliers.

In [51]:
def calculate_trimmed_mean(numbers, percentage):
  """Return the mean after dropping a percentage of values from each end.

  Args:
    numbers: A sized iterable of numeric values.
    percentage: Percentage (0-100) of the values to remove from EACH end of
      the sorted data; the count removed per end is rounded down.

  Returns:
    The mean of the remaining values, or None when the input is empty, the
    percentage is outside 0-100, or trimming leaves no values.
  """
  if len(numbers) == 0 or percentage < 0 or percentage > 100:
    return None

  sorted_numbers = sorted(numbers)
  size = len(sorted_numbers)
  trim_count = int(size * (percentage / 100))

  # Use an explicit end index: a slice ending at -trim_count is empty when
  # trim_count is 0, which would wrongly discard every value.
  trimmed_numbers = sorted_numbers[trim_count:size - trim_count]
  if not trimmed_numbers:
    return None
  return sum(trimmed_numbers) / len(trimmed_numbers)

#Q2. Generate a list of 500 integers containing values between 200 to 300 and store it in the variable int_list2. After generating the list, find the following:

###(i) Compare the following visualizations for the given data:

    

###1. Frequency & Gaussian distribution

###2. Frequency smoothened KDE plot

###3. Gaussian distribution & smoothened KDE plot

In [None]:
# Draw 500 random integers in [200, 300); np.random.randint excludes the
# upper bound.
int_list2 = np.random.randint(200, 300, 500)

from scipy.stats import norm

data = np.array(int_list2)

# Plot 1: Frequency Histogram & Gaussian Distribution
plt.figure(figsize=(14, 6))

# Histogram (density-normalised so it shares a scale with the fitted curve)
plt.subplot(1, 2, 1)
sns.histplot(data, bins=20, kde=False, color='skyblue', stat='density')

# Gaussian Distribution: fit mean/std to the data and evaluate the normal
# density across the current x-axis range of the histogram.
mu, std = norm.fit(data)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
# The colour is given once, via the keyword; also passing a 'k' format string
# makes matplotlib warn about a redundantly defined colour.
plt.plot(x, p, linewidth=2, color='red')
plt.title('Frequency Histogram & Gaussian Distribution')
plt.xlabel('Value')
plt.ylabel('Density')

# Plot 2: Frequency Smoothened KDE plot
plt.subplot(1, 2, 2)
sns.histplot(data, bins=20, kde=True, color='skyblue', stat='density')
plt.title('Frequency Histogram with KDE')
plt.xlabel('Value')
plt.ylabel('Density')

plt.tight_layout()
plt.show()

# Plot 3: Gaussian Distribution & Smoothened KDE plot
# Reuses x and p computed for Plot 1.
plt.figure(figsize=(7, 6))
sns.kdeplot(data, color='blue', linewidth=2, label='KDE')
plt.plot(x, p, linewidth=2, color='red', label='Gaussian')
plt.title('Gaussian Distribution & KDE Plot')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()

plt.show()

###(ii) Write a Python function to calculate the range of a given list of numbers.

In [None]:
def calculate_range(numbers):
  """Return the range (maximum minus minimum) of ``numbers``.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    ``max(numbers) - min(numbers)``, or None when ``numbers`` is empty, in
    line with the other statistics helpers in this notebook.
  """
  if len(numbers) == 0:
    return None
  return max(numbers) - min(numbers)

# Apply calculate_range to int_list2 and print the result.
range_of_int_list2 = calculate_range(int_list2)
print(f"The range of int_list2 is: {range_of_int_list2}")

###(iii) Create a program to find the variance and standard deviation of a list of numbers.

In [54]:
def calculate_variance(numbers):
  """Return the population variance of ``numbers``.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    The mean squared deviation from the mean (divisor ``n``, not ``n - 1``),
    or None when ``numbers`` is empty.
  """
  count = len(numbers)
  # Guard the division below; callers check the result for None.
  if count == 0:
    return None

  mean = sum(numbers) / count
  squared_diffs = [(x - mean) ** 2 for x in numbers]
  return sum(squared_diffs) / count

def calculate_standard_deviation(numbers):
  """Return the square root of ``calculate_variance(numbers)``.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    The standard deviation, or None when the variance helper returns None.
  """
  spread = calculate_variance(numbers)
  return None if spread is None else math.sqrt(spread)
# Variance and standard deviation of int_list2, kept as notebook-level variables.
variance_of_int_list2 = calculate_variance(int_list2)
standard_deviation_of_int_list2 = calculate_standard_deviation(int_list2)

###(iv) Implement a function to compute the interquartile range (IQR) of a list of values

In [55]:
def calculate_iqr(numbers):
  """Return the interquartile range (Q3 - Q1) of ``numbers``.

  Quartiles are computed with ``np.percentile`` using its default
  interpolation; the input does not need to be sorted beforehand.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    The difference between the 75th and 25th percentiles, or None when
    ``numbers`` is empty.
  """
  if len(numbers) == 0:
    return None

  q1, q3 = np.percentile(numbers, [25, 75])
  return q3 - q1

# Interquartile range of int_list2, kept as a notebook-level variable.
iqr_of_int_list2 = calculate_iqr(int_list2)

###(v) Build a program to calculate the coefficient of variation for a dataset.

In [56]:
def calculate_coefficient_of_variation(numbers):
  """Return the coefficient of variation of ``numbers`` as a percentage.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    ``(standard deviation / mean) * 100``, or None when the mean is
    unavailable or zero, or when the standard deviation is unavailable.
  """
  average = calculate_mean(numbers)
  # A zero mean would make the ratio undefined.
  if average is None or average == 0:
    return None

  spread = calculate_standard_deviation(numbers)
  if spread is None:
    return None
  return (spread / average) * 100

# Coefficient of variation of int_list2, kept as a notebook-level variable.
cv_of_int_list2 = calculate_coefficient_of_variation(int_list2)

###(vi) Write a Python function to find the mean absolute deviation (MAD) of a list of numbers.

In [57]:
def calculate_mean_absolute_deviation(numbers):
  """Return the mean absolute deviation of ``numbers`` about their mean.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    The average of ``|x - mean|`` over all values, or None when the mean
    helper returns None.
  """
  center = calculate_mean(numbers)
  if center is None:
    return None

  return np.mean([abs(value - center) for value in numbers])

# Mean absolute deviation of int_list2, kept as a notebook-level variable.
mad_of_int_list2 = calculate_mean_absolute_deviation(int_list2)

###(vii) Create a program to calculate the quartile deviation of a list of values.

In [58]:
def calculate_quartile_deviation(numbers):
  """Return the quartile deviation (half the interquartile range).

  Quartiles are computed with ``np.percentile`` using its default
  interpolation; the input does not need to be sorted beforehand.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    ``(Q3 - Q1) / 2``, or None when ``numbers`` is empty.
  """
  if len(numbers) == 0:
    return None

  q1, q3 = np.percentile(numbers, [25, 75])
  return (q3 - q1) / 2

# Quartile deviation of int_list2, kept as a notebook-level variable.
quartile_deviation_of_int_list2 = calculate_quartile_deviation(int_list2)

###(viii) Implement a function to find the range-based coefficient of dispersion for a dataset.

In [59]:
def calculate_range(numbers):
  """Return the range (maximum minus minimum) of ``numbers`` using NumPy.

  NOTE: this redefines the earlier ``calculate_range`` from Q2 (ii).

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    ``np.max(numbers) - np.min(numbers)``, or None when ``numbers`` is empty,
    so callers that test the result for None behave as intended.
  """
  if len(numbers) == 0:
    return None
  return np.max(numbers) - np.min(numbers)

def calculate_coefficient_of_dispersion(numbers):
  """Return the range-based coefficient of dispersion as a percentage.

  Args:
    numbers: A sized iterable of numeric values.

  Returns:
    ``(range / mean) * 100``, or None when the mean is unavailable or zero,
    or when the range helper returns None.
  """
  average = calculate_mean(numbers)
  # A zero mean would make the ratio undefined.
  if average is None or average == 0:
    return None

  spread = calculate_range(numbers)
  return None if spread is None else (spread / average) * 100

# Range-based coefficient of dispersion of int_list2, kept as a notebook-level variable.
coefficient_of_dispersion_of_int_list2 = calculate_coefficient_of_dispersion(int_list2)

#Q3. Write a Python class representing a discrete random variable with methods to calculate its expected value and variance.

In [60]:
class DiscreteRandomVariable:
  """A discrete random variable given by outcome values and probabilities.

  ``probabilities[i]`` is the probability of ``values[i]``. The probabilities
  must be non-negative and their total must be greater than 0 and no more
  than 1 (a total below 1 is accepted, as before).
  """

  # Slack allowed above 1 when checking the probability total, so that totals
  # exceeding 1 only through floating-point rounding are not rejected.
  _SUM_TOLERANCE = 1e-9

  def __init__(self, probabilities, values):
    """Validate and store the distribution.

    Args:
      probabilities: Sized iterable of probabilities, one per value.
      values: Sized iterable of numeric outcomes.

    Raises:
      ValueError: If the lengths differ, any probability is negative, or the
        probabilities do not sum to a value in (0, 1].
    """
    if len(probabilities) != len(values):
      raise ValueError("Length of probabilities and values must be the same.")
    # Check signs before the total so a negative entry gets the specific message.
    if any(p < 0 for p in probabilities):
      raise ValueError("Probabilities must be non-negative.")
    total = sum(probabilities)
    if not (0 < total <= 1 + self._SUM_TOLERANCE):
      raise ValueError("Sum of probabilities must be between 0 and 1.")

    self.probabilities = probabilities
    self.values = values

  def expected_value(self):
    """Return the expected value, ``sum(p * v)`` over all outcomes."""
    return sum(p * v for p, v in zip(self.probabilities, self.values))

  def variance(self):
    """Return the variance, ``sum(p * (v - E[X]) ** 2)`` over all outcomes."""
    mean = self.expected_value()
    return sum(p * (v - mean) ** 2 for p, v in zip(self.probabilities, self.values))

#Q4. Implement a program to simulate the rolling of a fair six-sided die and calculate the expected value and variance of the outcomes.

In [61]:
import random
def simulate_die_rolls(num_rolls):
  """Simulate rolling a fair six-sided die ``num_rolls`` times.

  Args:
    num_rolls: Number of rolls to simulate.

  Returns:
    A list of ``num_rolls`` integers, each drawn with ``random.randint(1, 6)``.
  """
  rolls = []
  for _ in range(num_rolls):
    rolls.append(random.randint(1, 6))
  return rolls

# Calculate the probabilities of each outcome
def calculate_probabilities(outcomes):
  """Return the empirical probability of each die face in ``outcomes``.

  Args:
    outcomes: A sized iterable of observed rolls (list, tuple, NumPy array).
      Values outside 1-6 are not counted towards any face.

  Returns:
    A list of six relative frequencies for faces 1 through 6, or None when
    ``outcomes`` is empty (the frequencies are undefined).
  """
  total_rolls = len(outcomes)
  if total_rolls == 0:
    return None

  # Count all faces in a single pass over the data.
  frequency = dict.fromkeys(range(1, 7), 0)
  for outcome in outcomes:
    if outcome in frequency:
      frequency[outcome] += 1

  return [frequency[face] / total_rolls for face in range(1, 7)]

# Main program
# Simulate the rolls, estimate each face's probability from the observed
# frequencies, and feed those estimates into DiscreteRandomVariable. Because
# the probabilities are empirical, the printed figures are estimates that vary
# from run to run (no random seed is set here).
num_rolls = 10000  # Number of rolls to simulate
outcomes = simulate_die_rolls(num_rolls)

# Values of a six-sided die
values = [1, 2, 3, 4, 5, 6]
probabilities = calculate_probabilities(outcomes)

# Create DiscreteRandomVariable instance
die = DiscreteRandomVariable(probabilities, values)

# Calculate expected value and variance
expected_value = die.expected_value()
variance = die.variance()

print("Expected Value:", expected_value)
print("Variance:", variance)

Expected Value: 3.4922
Variance: 2.92233916


#Q5.  Create a Python function to generate random samples from a given probability distribution (e.g., binomial, Poisson) and calculate their mean and variance.

In [62]:
def generate_random_samples(distribution, params, size=1000):
  """Draw random samples from a named distribution and summarise them.

  Args:
    distribution: One of ``'binomial'``, ``'poisson'`` or ``'normal'``.
    params: Mapping of distribution parameters. Keys read (with defaults):
      binomial ``n`` (1) and ``p`` (0.5); poisson ``lam`` (1.0);
      normal ``mu`` (0.0) and ``sigma`` (1.0).
    size: Number of samples to draw.

  Returns:
    Tuple ``(mean, variance)`` of the samples, computed with ``np.mean`` and
    ``np.var`` (population variance).

  Raises:
    ValueError: If ``distribution`` is not one of the supported names.
  """
  if distribution == 'binomial':
    draws = np.random.binomial(params.get('n', 1), params.get('p', 0.5), size)
  elif distribution == 'poisson':
    draws = np.random.poisson(params.get('lam', 1.0), size)
  elif distribution == 'normal':
    draws = np.random.normal(params.get('mu', 0.0), params.get('sigma', 1.0), size)
  else:
    raise ValueError("Unsupported distribution type. Supported types: 'binomial', 'poisson', 'normal'.")

  return np.mean(draws), np.var(draws)

#Q6. Write a Python script to generate random numbers from a Gaussian (normal) distribution and compute the mean, variance, and standard deviation of the samples.


In [63]:
def generate_normal_samples(mu, sigma, size=1000):
  """Draw samples from a normal distribution and summarise them.

  Args:
    mu: Mean of the distribution to sample from.
    sigma: Standard deviation of the distribution to sample from.
    size: Number of samples to draw.

  Returns:
    Tuple ``(mean, variance, std_deviation)`` of the drawn samples, computed
    with ``np.mean``, ``np.var`` and ``np.std`` (population formulas).
  """
  draws = np.random.normal(mu, sigma, size)
  return np.mean(draws), np.var(draws), np.std(draws)

# Parameters of the standard normal distribution and the sample size.
mu = 0
sigma = 1
sample_size = 1000

# NOTE: this rebinds the notebook-level names `mean` and `variance`.
mean, variance, std_deviation = generate_normal_samples(mu, sigma, sample_size)

#Q7. Use seaborn library to load tips dataset. Find the following from the dataset for the columns total_bill and tip:

###(i) Write a Python function that calculates their skewness.

In [None]:
import scipy.stats as stats

def calculate_skewness(data, column):
  """Return the skewness of one column, as computed by ``scipy.stats.skew``.

  Args:
    data: Object indexable by column name (e.g. a DataFrame or a dict).
    column: Name of the column to analyse.

  Returns:
    The skewness of ``data[column]``.
  """
  column_values = data[column]
  return stats.skew(column_values)

# Load seaborn's "tips" example dataset; later cells reuse `tips`.
tips = sns.load_dataset('tips')

# Skewness of the two columns the exercise asks about.
total_bill_skewness = calculate_skewness(tips, 'total_bill')
tip_skewness = calculate_skewness(tips, 'tip')

print(f"Skewness of 'total_bill': {total_bill_skewness}")
print(f"Skewness of 'tip': {tip_skewness}")

###(ii) Create a program that determines whether the columns exhibit positive skewness, negative skewness, or is approximately symmetric.

In [65]:
def determine_skewness_type(skewness, threshold=0.5):
  """Classify a skewness value as positive, negative or roughly symmetric.

  Real data almost never has a skewness of exactly zero, so values within
  ``threshold`` of zero are treated as approximately symmetric.

  Args:
    skewness: The skewness coefficient to classify.
    threshold: Non-negative cut-off; ``|skewness| <= threshold`` counts as
      approximately symmetric. The default of 0.5 is a common rule of thumb;
      pass 0 to classify by sign alone.

  Returns:
    ``"Positive skewness"``, ``"Negative skewness"`` or
    ``"Approximately symmetric"``.
  """
  if skewness > threshold:
    return "Positive skewness"
  if skewness < -threshold:
    return "Negative skewness"
  return "Approximately symmetric"

# Recompute the skewness values so this cell does not depend on the previous
# cell's variables, then classify each one.
total_bill_skewness = calculate_skewness(tips, 'total_bill')
tip_skewness = calculate_skewness(tips, 'tip')

total_bill_skewness_type = determine_skewness_type(total_bill_skewness)
tip_skewness_type = determine_skewness_type(tip_skewness)

###(iii) Write a function that calculates the covariance between two columns.

In [None]:
def calculate_covariance(data, column1, column2):
  """Return the population covariance between two columns.

  Args:
    data: Object indexable by column name (e.g. a DataFrame or a dict).
    column1: Name of the first column.
    column2: Name of the second column.

  Returns:
    The off-diagonal entry of ``np.cov(..., bias=True)``, i.e. the covariance
    normalised by ``n`` rather than ``n - 1``.
  """
  first = data[column1]
  second = data[column2]
  cov_matrix = np.cov(first, second, bias=True)
  return cov_matrix[0][1]

# Covariance between the total_bill and tip columns of the tips dataset.
covariance_total_bill_tip = calculate_covariance(tips, 'total_bill', 'tip')

print(f"Covariance between 'total_bill' and 'tip': {covariance_total_bill_tip}")

###(iv) Implement a Python program that calculates the Pearson correlation coefficient between two columns.

In [None]:
def calculate_pearson_correlation(data, column1, column2):
  """Return the Pearson correlation between two columns and its p-value.

  Args:
    data: Object indexable by column name (e.g. a DataFrame or a dict).
    column1: Name of the first column.
    column2: Name of the second column.

  Returns:
    Tuple ``(correlation, p_value)`` as produced by ``scipy.stats.pearsonr``.
  """
  first = data[column1]
  second = data[column2]
  r_value, significance = stats.pearsonr(first, second)
  return r_value, significance

# Pearson correlation (and its p-value) between total_bill and tip; only the
# correlation coefficient is printed.
correlation_total_bill_tip, p_value_total_bill_tip = calculate_pearson_correlation(tips, 'total_bill', 'tip')

print(f"Pearson correlation between 'total_bill' and 'tip': {correlation_total_bill_tip}")

###(v) Write a script to visualize the correlation between two specific columns in a Pandas DataFrame using scatter plots.

In [None]:
def visualize_correlation(data, column1, column2):
  """Show a scatter plot of ``column2`` against ``column1``.

  Args:
    data: DataFrame (or other seaborn-compatible data) holding both columns.
    column1: Column plotted on the x-axis.
    column2: Column plotted on the y-axis.
  """
  _, ax = plt.subplots(figsize=(10, 6))
  sns.scatterplot(data=data, x=column1, y=column2, ax=ax)
  ax.set_title(f'Scatter Plot of {column1} vs {column2}')
  ax.set_xlabel(column1)
  ax.set_ylabel(column2)
  ax.grid(True)
  plt.show()

# Scatter plot of tip against total_bill for the tips dataset.
visualize_correlation(tips, 'total_bill', 'tip')

#Q8. Write a Python function to calculate the probability density function (PDF) of a continuous random variable for a given normal distribution.

In [None]:
def normal_pdf(x, mu, sigma):
  """Evaluate the normal probability density function.

  Args:
    x: Point or array of points at which to evaluate the density.
    mu: Mean of the distribution.
    sigma: Standard deviation of the distribution.

  Returns:
    The density of Normal(mu, sigma) at ``x``.
  """
  distribution = stats.norm(loc=mu, scale=sigma)
  return distribution.pdf(x)

# Evaluate the standard normal density on an evenly spaced grid and print it.
# NOTE: rebinds the notebook-level names `mu`, `sigma` and `x_values`.
mu = 0
sigma = 1
x_values = np.linspace(-5, 5, 100)  # Generate 100 values from -5 to 5

pdf_values = normal_pdf(x_values, mu, sigma)

print("x values:", x_values)
print("PDF values:", pdf_values)

#Q9. Create a program to calculate the cumulative distribution function (CDF) of exponential distribution.

In [None]:
def exponential_cdf(x, lambda_param):
  """Evaluate the cumulative distribution function of an exponential law.

  Args:
    x: Point or array of points at which to evaluate the CDF.
    lambda_param: Rate parameter; SciPy's ``scale`` is its reciprocal.

  Returns:
    The CDF of the exponential distribution with rate ``lambda_param`` at ``x``.
  """
  distribution = stats.expon(scale=1/lambda_param)
  return distribution.cdf(x)

# Evaluate the exponential CDF with rate 1 on an evenly spaced grid and print it.
# NOTE: rebinds the notebook-level name `x_values`.
lambda_param = 1
x_values = np.linspace(0, 5, 100)  # Generate 100 values from 0 to 5

cdf_values = exponential_cdf(x_values, lambda_param)

print("x values:", x_values)
print("CDF values:", cdf_values)

#Q10. Write a Python function to calculate the probability mass function (PMF) of Poisson distribution.

In [None]:
def poisson_pmf(k, lambda_param):
  """Evaluate the probability mass function of a Poisson distribution.

  Args:
    k: Count or array of counts at which to evaluate the PMF.
    lambda_param: Mean (rate) of the Poisson distribution.

  Returns:
    The probability of observing ``k`` events under Poisson(lambda_param).
  """
  distribution = stats.poisson(lambda_param)
  return distribution.pmf(k)

# Evaluate the Poisson PMF with mean 4 for k = 0..14 and print it.
# NOTE: rebinds the notebook-level name `lambda_param`.
lambda_param = 4
k_values = np.arange(0, 15)  # Generate integer values from 0 to 14

pmf_values = poisson_pmf(k_values, lambda_param)

print("k values:", k_values)
print("PMF values:", pmf_values)

#Q11. A company wants to test if a new website layout leads to a higher conversion rate (percentage of visitors who make a purchase). They collect data from the old and new layouts to compare.

###To generate the data use the following command:

```python

import numpy as np

# 50 purchases out of 1000 visitors
old_layout = np.array([1] * 50 + [0] * 950)

# 70 purchases out of 1000 visitors  
new_layout = np.array([1] * 70 + [0] * 930)

  ```

###Apply z-test to find which layout is successful.

In [None]:
# Two-proportion z-test comparing the conversion rates of the two layouts.
# Each array holds one entry per visitor: 1 = purchase, 0 = no purchase.
old_layout = np.array([1] * 50 + [0] * 950)
new_layout = np.array([1] * 70 + [0] * 930)

# Proportions (the mean of a 0/1 array is the conversion rate)
p_old = np.mean(old_layout)
p_new = np.mean(new_layout)

# Pooled proportion: overall conversion rate under the null hypothesis that
# both layouts convert equally well
p_pooled = (np.sum(old_layout) + np.sum(new_layout)) / (len(old_layout) + len(new_layout))

# Standard error of the difference in proportions under the null hypothesis
SE = np.sqrt(p_pooled * (1 - p_pooled) * (1/len(old_layout) + 1/len(new_layout)))

# Z-score (old minus new, so a negative value means the new layout converts better)
z_score = (p_old - p_new) / SE

# P-value (two-tailed).
# NOTE(review): the question asks whether the new layout is *higher*, which is
# a one-sided hypothesis; a one-tailed p-value would be half of this when
# z_score is negative -- confirm which test is intended.
p_value = 2 * (1 - stats.norm.cdf(np.abs(z_score)))

# Bare tuple expression: displayed as the cell output in a notebook; it has no
# effect when the code runs as a plain script.
(p_old, p_new, z_score, p_value)

#Q12. A tutoring service claims that its program improves students' exam scores. A sample of students who participated in the program was taken, and their scores before and after the program were recorded.

###Use the below code to generate samples of respective arrays of marks:

```python

before_program = np.array([75, 80, 85, 70, 90, 78, 92, 88, 82, 87])

after_program = np.array([80, 85, 90, 80, 92, 80, 95, 90, 85, 88])

```

###Use z-test to find if the claims made by tutor are true or false.

In [None]:
# Paired z-test on exam scores: the i-th entries of the two arrays are treated
# as the same student's score before and after the program.
before_program = np.array([75, 80, 85, 70, 90, 78, 92, 88, 82, 87])
after_program = np.array([80, 85, 90, 80, 92, 80, 95, 90, 85, 88])

# Differences (after minus before, so positive values mean improvement)
differences = after_program - before_program

# Mean and sample standard deviation (ddof=1) of differences
mean_diff = np.mean(differences)
std_diff = np.std(differences, ddof=1)

# Standard error of the mean difference
SE_diff = std_diff / np.sqrt(len(differences))

# Z-score against a null hypothesis of zero mean difference
z_score = mean_diff / SE_diff

# P-value (two-tailed).
# NOTE(review): the claim "improves scores" is one-sided; confirm whether a
# one-tailed p-value is wanted here.
p_value = 2 * (1 - stats.norm.cdf(np.abs(z_score)))

# Bare tuple expression: displayed as the cell output in a notebook.
(mean_diff, std_diff, z_score, p_value)

#Q13. A pharmaceutical company wants to determine if a new drug is effective in reducing blood pressure. They conduct a study and record blood pressure measurements before and after administering the drug.

###Use the below code to generate samples of respective arrays of blood pressure:


```python

before_drug = np.array([145, 150, 140, 135, 155, 160, 152, 148, 130, 138])

after_drug = np.array([130, 140, 132, 128, 145, 148, 138, 136, 125, 130])

  ```


###Implement z-test to find if the drug really works or not.

In [None]:
# Paired z-test on blood pressure: the i-th entries of the two arrays are
# treated as the same subject's reading before and after the drug.
before_drug = np.array([145, 150, 140, 135, 155, 160, 152, 148, 130, 138])
after_drug = np.array([130, 140, 132, 128, 145, 148, 138, 136, 125, 130])

# Differences (after minus before, so negative values mean a reduction)
differences = after_drug - before_drug

# Mean and sample standard deviation (ddof=1) of differences
mean_diff = np.mean(differences)
std_diff = np.std(differences, ddof=1)

# Standard error of the mean difference
SE_diff = std_diff / np.sqrt(len(differences))

# Z-score against a null hypothesis of zero mean difference
z_score = mean_diff / SE_diff

# P-value (two-tailed).
# NOTE(review): "reducing blood pressure" is a one-sided claim; confirm
# whether a one-tailed p-value is wanted here.
p_value = 2 * (1 - stats.norm.cdf(np.abs(z_score)))

# Bare tuple expression: displayed as the cell output in a notebook.
(mean_diff, std_diff, z_score, p_value)

#Q14. A customer service department claims that their average response time is less than 5 minutes. A sample of recent customer interactions was taken, and the response times were recorded.

###Implement the below code to generate the array of response time:

```python

response_times = np.array([4.3, 3.8, 5.1, 4.9, 4.7, 4.2, 5.2, 4.5, 4.6, 4.4])

```

###Implement z-test to find the claims made by customer service department are true or false.

In [None]:
# One-sample, lower-tailed z-test of the claim that the mean response time is
# below 5 minutes (H0: mean = 5, H1: mean < 5).
response_times = np.array([4.3, 3.8, 5.1, 4.9, 4.7, 4.2, 5.2, 4.5, 4.6, 4.4])

# Hypothesized population mean
mu_0 = 5

# Calculate sample statistics (ddof=1 gives the sample standard deviation)
sample_mean = np.mean(response_times)
sample_std = np.std(response_times, ddof=1)
n = len(response_times)

# Perform the z-test
z_value = (sample_mean - mu_0) / (sample_std / np.sqrt(n))

# Calculate the p-value for a one-tailed test: the lower-tail probability,
# which is small when the sample mean is well below mu_0
p_value = stats.norm.cdf(z_value)

print(f"Sample Mean: {sample_mean:.2f}")
print(f"Sample Standard Deviation: {sample_std:.2f}")
print(f"Z-value: {z_value:.2f}")
print(f"P-value: {p_value:.4f}")

# Determine if we reject the null hypothesis
alpha = 0.05
if p_value < alpha:
  print("Reject the null hypothesis. The claim that the average response time is less than 5 minutes is supported.")
else:
  print("Fail to reject the null hypothesis. There is not enough evidence to support the claim that the average response time is less than 5 minutes.")

#Q15. A company is testing two different website layouts to see which one leads to higher click-through rates. Write a Python function to perform an A/B test analysis, including calculating the t-statistic, degrees of freedom, and p-value.

###Use the following data:

```python

layout_a_clicks = [28, 32, 33, 29, 31, 34, 30, 35, 36, 37]

layout_b_clicks = [40, 41, 38, 42, 39, 44, 43, 41, 45, 47]
```

In [None]:

def ab_test(layout_a_clicks, layout_b_clicks):
  """Compare two click samples with Welch's unequal-variance t-test.

  Args:
    layout_a_clicks: Sequence of click counts observed for layout A.
    layout_b_clicks: Sequence of click counts observed for layout B.

  Returns:
    Tuple ``(t_statistic, degrees_of_freedom, p_value)``: the t-statistic for
    ``mean(A) - mean(B)``, the Welch-Satterthwaite degrees of freedom, and the
    two-tailed p-value.
  """
  # Convert lists to numpy arrays
  layout_a_clicks = np.array(layout_a_clicks)
  layout_b_clicks = np.array(layout_b_clicks)

  n_a = len(layout_a_clicks)
  n_b = len(layout_b_clicks)

  # Each sample's contribution to the squared standard error: s^2 / n
  se2_a = np.std(layout_a_clicks, ddof=1) ** 2 / n_a
  se2_b = np.std(layout_b_clicks, ddof=1) ** 2 / n_b

  t_statistic = (np.mean(layout_a_clicks) - np.mean(layout_b_clicks)) / np.sqrt(se2_a + se2_b)

  # Welch-Satterthwaite approximation of the degrees of freedom
  degrees_of_freedom = (se2_a + se2_b) ** 2 / (se2_a ** 2 / (n_a - 1) + se2_b ** 2 / (n_b - 1))

  # Use the survival function: 1 - cdf(...) cancels to exactly 0.0 for large |t|
  p_value = 2 * stats.t.sf(np.abs(t_statistic), df=degrees_of_freedom)

  return t_statistic, degrees_of_freedom, p_value

# Given data
# Click counts for the two layouts, as given in the exercise.
layout_a_clicks = [28, 32, 33, 29, 31, 34, 30, 35, 36, 37]
layout_b_clicks = [40, 41, 38, 42, 39, 44, 43, 41, 45, 47]

# Perform A/B test
t_stat, df, p_val = ab_test(layout_a_clicks, layout_b_clicks)

print(f"T-statistic: {t_stat:.2f}")
print(f"Degrees of Freedom: {df:.2f}")
print(f"P-value: {p_val:.4f}")

# Determine if we reject the null hypothesis (equal means) at the 5% level
alpha = 0.05
if p_val < alpha:
  print("Reject the null hypothesis. There is a significant difference between the two layouts.")
else:
  print("Fail to reject the null hypothesis. There is no significant difference between the two layouts.")

#Q16. A pharmaceutical company wants to determine if a new drug is more effective than an existing drug in reducing cholesterol levels. Create a program to analyze the clinical trial data and calculate the t-statistic and p-value for the treatment effect.

###Use the following data of cholesterol level:

```python

existing_drug_levels = [180, 182, 175, 185, 178, 176, 172, 184, 179, 183]

new_drug_levels = [170, 172, 165, 168, 175, 173, 170, 178, 172, 176]
```

In [None]:

def analyze_drug_effectiveness(existing_drug_levels, new_drug_levels):
  """Test whether the new drug yields lower cholesterol than the existing one.

  Runs a one-tailed Welch (unequal-variance) t-test with
  H0: mean(existing) <= mean(new) against H1: mean(existing) > mean(new),
  i.e. the new drug leaves patients with lower cholesterol levels.

  Args:
    existing_drug_levels: Cholesterol levels measured under the existing drug.
    new_drug_levels: Cholesterol levels measured under the new drug.

  Returns:
    Tuple ``(t_statistic, degrees_of_freedom, p_value)``: the t-statistic for
    ``mean(existing) - mean(new)``, the Welch-Satterthwaite degrees of
    freedom, and the one-tailed (upper-tail) p-value.
  """
  # Convert lists to numpy arrays
  existing_drug_levels = np.array(existing_drug_levels)
  new_drug_levels = np.array(new_drug_levels)

  n_existing = len(existing_drug_levels)
  n_new = len(new_drug_levels)

  # Each sample's contribution to the squared standard error: s^2 / n
  se2_existing = np.std(existing_drug_levels, ddof=1) ** 2 / n_existing
  se2_new = np.std(new_drug_levels, ddof=1) ** 2 / n_new

  t_statistic = (np.mean(existing_drug_levels) - np.mean(new_drug_levels)) / np.sqrt(se2_existing + se2_new)

  # Welch-Satterthwaite approximation of the degrees of freedom
  degrees_of_freedom = (se2_existing + se2_new) ** 2 / (
      se2_existing ** 2 / (n_existing - 1) + se2_new ** 2 / (n_new - 1))

  # The statistic is existing minus new, so a more effective new drug gives a
  # large POSITIVE t. The evidence therefore lies in the upper tail (sf), not
  # the lower tail (cdf), which would approach 1 for an effective drug.
  p_value = stats.t.sf(t_statistic, df=degrees_of_freedom)

  return t_statistic, degrees_of_freedom, p_value

# Given data
# Cholesterol levels for the two drugs, as given in the exercise.
existing_drug_levels = [180, 182, 175, 185, 178, 176, 172, 184, 179, 183]
new_drug_levels = [170, 172, 165, 168, 175, 173, 170, 178, 172, 176]

# Perform analysis
t_stat, df, p_val = analyze_drug_effectiveness(existing_drug_levels, new_drug_levels)

print(f"T-statistic: {t_stat:.2f}")
print(f"Degrees of Freedom: {df:.2f}")
print(f"P-value: {p_val:.4f}")

# Determine if we reject the null hypothesis at the 5% level
alpha = 0.05
if p_val < alpha:
  print("Reject the null hypothesis. The new drug is more effective in reducing cholesterol levels.")
else:
  print("Fail to reject the null hypothesis. There is not enough evidence to say the new drug is more effective.")

#Q17. A school district introduces an educational intervention program to improve math scores. Write a Python function to analyze pre- and post-intervention test scores, calculating the t-statistic and p-value to determine if the intervention had a significant impact.

###Use the following data of test score:


  ```python

  pre_intervention_scores = [80, 85, 90, 75, 88, 82, 92, 78, 85, 87]

  post_intervention_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]
  ```

In [None]:
def analyze_intervention(pre_scores, post_scores):
  """Run a one-tailed paired t-test for an increase from pre to post scores.

  Args:
    pre_scores: Scores before the intervention, one per student.
    post_scores: Scores after the intervention, in the same student order.

  Returns:
    Tuple ``(t_statistic, degrees_of_freedom, p_value)``: the t-statistic of
    the mean of ``post - pre``, ``n - 1`` degrees of freedom, and the
    upper-tail p-value (small when scores increased).
  """
  # Convert lists to numpy arrays
  pre_scores = np.array(pre_scores)
  post_scores = np.array(post_scores)

  # Per-student change; positive values mean improvement
  differences = post_scores - pre_scores
  n = len(differences)

  # Standard error of the mean difference (sample std, ddof=1)
  standard_error = np.std(differences, ddof=1) / np.sqrt(n)
  t_statistic = np.mean(differences) / standard_error

  degrees_of_freedom = n - 1

  # Use the survival function: 1 - cdf(...) cancels to exactly 0.0 for large t
  p_value = stats.t.sf(t_statistic, df=degrees_of_freedom)

  return t_statistic, degrees_of_freedom, p_value

# Given data
# Pre- and post-intervention scores, as given in the exercise.
pre_intervention_scores = [80, 85, 90, 75, 88, 82, 92, 78, 85, 87]
post_intervention_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]

# Perform analysis
t_stat, df, p_val = analyze_intervention(pre_intervention_scores, post_intervention_scores)

print(f"T-statistic: {t_stat:.2f}")
print(f"Degrees of Freedom: {df}")
print(f"P-value: {p_val:.4f}")

# Determine if we reject the null hypothesis at the 5% level
alpha = 0.05
if p_val < alpha:
  print("Reject the null hypothesis. The intervention had a significant impact on math scores.")
else:
  print("Fail to reject the null hypothesis. There is not enough evidence to say the intervention had a significant impact.")

#Q18. An HR department wants to investigate if there's a gender-based salary gap within the company. Develop a program to analyze salary data, calculate the t-statistic, and determine if there's a statistically significant difference between the average salaries of male and female employees.

###Use the below code to generate synthetic data:
```python

# Generate synthetic salary data for male and female employees

np.random.seed(0)  # For reproducibility

male_salaries = np.random.normal(loc=50000, scale=10000, size=20)

female_salaries = np.random.normal(loc=55000, scale=9000, size=20)
```

In [None]:
# Synthetic salary samples (20 per group) drawn from normal distributions.
np.random.seed(0)  # For reproducibility

male_salaries = np.random.normal(loc=50000, scale=10000, size=20)
female_salaries = np.random.normal(loc=55000, scale=9000, size=20)

def analyze_gender_salary_gap(male_salaries, female_salaries):
  """Compare two salary samples with Welch's unequal-variance t-test.

  Args:
    male_salaries: Sequence or array of salaries for male employees.
    female_salaries: Sequence or array of salaries for female employees.

  Returns:
    Tuple ``(t_statistic, degrees_of_freedom, p_value)``: the t-statistic for
    ``mean(male) - mean(female)``, the Welch-Satterthwaite degrees of freedom,
    and the two-tailed p-value.
  """
  n_male = len(male_salaries)
  n_female = len(female_salaries)

  # Each sample's contribution to the squared standard error: s^2 / n
  se2_male = np.std(male_salaries, ddof=1) ** 2 / n_male
  se2_female = np.std(female_salaries, ddof=1) ** 2 / n_female

  t_statistic = (np.mean(male_salaries) - np.mean(female_salaries)) / np.sqrt(se2_male + se2_female)

  # Welch-Satterthwaite approximation of the degrees of freedom
  degrees_of_freedom = (se2_male + se2_female) ** 2 / (
      se2_male ** 2 / (n_male - 1) + se2_female ** 2 / (n_female - 1))

  # Use the survival function: 1 - cdf(...) cancels to exactly 0.0 for large |t|
  p_value = 2 * stats.t.sf(np.abs(t_statistic), df=degrees_of_freedom)

  return t_statistic, degrees_of_freedom, p_value

# Perform analysis
# Run the test on the synthetic salary samples generated above.
t_stat, df, p_val = analyze_gender_salary_gap(male_salaries, female_salaries)

print(f"T-statistic: {t_stat:.2f}")
print(f"Degrees of Freedom: {df:.2f}")
print(f"P-value: {p_val:.4f}")

# Determine if we reject the null hypothesis (equal means) at the 5% level
alpha = 0.05
if p_val < alpha:
  print("Reject the null hypothesis. There is a significant difference in average salaries between male and female employees.")
else:
  print("Fail to reject the null hypothesis. There is no significant difference in average salaries between male and female employees.")

#Q19.  A manufacturer produces two different versions of a product and wants to compare their quality scores. Create a Python function to analyze quality assessment data, calculate the t-statistic, and decide whether there's a significant difference in quality between the two versions.

###Use the following data:
```python

version1_scores = [85, 88, 82, 89, 87, 84, 90, 88, 85, 86, 91, 83, 87, 84, 89, 86, 84, 88, 85, 86, 89, 90, 87, 88, 85]

version2_scores = [80, 78, 83, 81, 79, 82, 76, 80, 78, 81, 77, 82, 80, 79, 82, 79, 80, 81, 79, 82, 79, 78, 80, 81, 82]
```

In [None]:
def analyze_quality_scores(version1_scores, version2_scores):
  """Compare two sets of quality scores with Welch's unequal-variance t-test.

  Args:
    version1_scores: Sequence of quality scores for product version 1.
    version2_scores: Sequence of quality scores for product version 2.

  Returns:
    Tuple ``(t_statistic, degrees_of_freedom, p_value)``: the t-statistic for
    ``mean(version1) - mean(version2)``, the Welch-Satterthwaite degrees of
    freedom, and the two-tailed p-value.
  """
  # Convert lists to numpy arrays
  version1_scores = np.array(version1_scores)
  version2_scores = np.array(version2_scores)

  n_v1 = len(version1_scores)
  n_v2 = len(version2_scores)

  # Each sample's contribution to the squared standard error: s^2 / n
  se2_v1 = np.std(version1_scores, ddof=1) ** 2 / n_v1
  se2_v2 = np.std(version2_scores, ddof=1) ** 2 / n_v2

  t_statistic = (np.mean(version1_scores) - np.mean(version2_scores)) / np.sqrt(se2_v1 + se2_v2)

  # Welch-Satterthwaite approximation of the degrees of freedom
  degrees_of_freedom = (se2_v1 + se2_v2) ** 2 / (se2_v1 ** 2 / (n_v1 - 1) + se2_v2 ** 2 / (n_v2 - 1))

  # Use the survival function: 1 - cdf(...) cancels to exactly 0.0 for large |t|
  p_value = 2 * stats.t.sf(np.abs(t_statistic), df=degrees_of_freedom)

  return t_statistic, degrees_of_freedom, p_value

# Given data
# Quality scores for the two product versions, as given in the exercise.
version1_scores = [85, 88, 82, 89, 87, 84, 90, 88, 85, 86, 91, 83, 87, 84, 89, 86, 84, 88, 85, 86, 89, 90, 87, 88, 85]
version2_scores = [80, 78, 83, 81, 79, 82, 76, 80, 78, 81, 77, 82, 80, 79, 82, 79, 80, 81, 79, 82, 79, 78, 80, 81, 82]

# Perform analysis
t_stat, df, p_val = analyze_quality_scores(version1_scores, version2_scores)

print(f"T-statistic: {t_stat:.2f}")
print(f"Degrees of Freedom: {df:.2f}")
print(f"P-value: {p_val:.4f}")

# Determine if we reject the null hypothesis (equal means) at the 5% level
alpha = 0.05
if p_val < alpha:
  print("Reject the null hypothesis. There is a significant difference in quality scores between the two versions.")
else:
  print("Fail to reject the null hypothesis. There is no significant difference in quality scores between the two versions.")

#Q20. A restaurant chain collects customer satisfaction scores for two different branches. Write a program to analyze the scores, calculate the t-statistic, and determine if there's a statistically significant difference in customer satisfaction between the branches.

###Use the below data of scores:

  ```python

branch_a_scores = [4, 5, 3, 4, 5, 4, 5, 3, 4, 4, 5, 4, 4, 3, 4, 5, 5, 4, 3, 4, 5, 4, 3, 5, 4, 4, 5, 3, 4, 5, 4]

branch_b_scores = [3, 4, 2, 3, 4, 3, 4, 2, 3, 3, 4, 3, 3, 2, 3, 4, 4, 3, 2, 3, 4, 3, 2, 4, 3, 3, 4, 2, 3, 4, 3]
```

In [None]:

def analyze_customer_satisfaction(branch_a_scores, branch_b_scores):
  """Compare two branches' satisfaction scores with Welch's t-test.

  Args:
    branch_a_scores: Sequence of satisfaction scores for branch A.
    branch_b_scores: Sequence of satisfaction scores for branch B.

  Returns:
    Tuple ``(t_statistic, degrees_of_freedom, p_value)``: the t-statistic for
    ``mean(A) - mean(B)``, the Welch-Satterthwaite degrees of freedom, and the
    two-tailed p-value.
  """
  # Convert lists to numpy arrays
  branch_a_scores = np.array(branch_a_scores)
  branch_b_scores = np.array(branch_b_scores)

  n_a = len(branch_a_scores)
  n_b = len(branch_b_scores)

  # Each sample's contribution to the squared standard error: s^2 / n
  se2_a = np.std(branch_a_scores, ddof=1) ** 2 / n_a
  se2_b = np.std(branch_b_scores, ddof=1) ** 2 / n_b

  t_statistic = (np.mean(branch_a_scores) - np.mean(branch_b_scores)) / np.sqrt(se2_a + se2_b)

  # Welch-Satterthwaite approximation of the degrees of freedom
  degrees_of_freedom = (se2_a + se2_b) ** 2 / (se2_a ** 2 / (n_a - 1) + se2_b ** 2 / (n_b - 1))

  # Use the survival function: 1 - cdf(...) cancels to exactly 0.0 for large |t|
  p_value = 2 * stats.t.sf(np.abs(t_statistic), df=degrees_of_freedom)

  return t_statistic, degrees_of_freedom, p_value

# Given data
# Satisfaction scores for the two branches, as given in the exercise.
branch_a_scores = [4, 5, 3, 4, 5, 4, 5, 3, 4, 4, 5, 4, 4, 3, 4, 5, 5, 4, 3, 4, 5, 4, 3, 5, 4, 4, 5, 3, 4, 5, 4]
branch_b_scores = [3, 4, 2, 3, 4, 3, 4, 2, 3, 3, 4, 3, 3, 2, 3, 4, 4, 3, 2, 3, 4, 3, 2, 4, 3, 3, 4, 2, 3, 4, 3]

# Perform analysis
t_stat, df, p_val = analyze_customer_satisfaction(branch_a_scores, branch_b_scores)

print(f"T-statistic: {t_stat:.2f}")
print(f"Degrees of Freedom: {df:.2f}")
print(f"P-value: {p_val:.4f}")

# Determine if we reject the null hypothesis (equal means) at the 5% level
alpha = 0.05
if p_val < alpha:
  print("Reject the null hypothesis. There is a significant difference in customer satisfaction between the two branches.")
else:
  print("Fail to reject the null hypothesis. There is no significant difference in customer satisfaction between the two branches.")

#Q21. A political analyst wants to determine if there is a significant association between age groups and voter preferences (Candidate A or Candidate B). They collect data from a sample of 500 voters and classify them into different age groups and candidate preferences. Perform a Chi-Square test to determine if there is a significant association between age groups and voter preferences.

###Use the below code to generate data:

```python

np.random.seed(0)

age_groups = np.random.choice(['18-30', '31-50', '51+', '51+'], size=30)

voter_preferences = np.random.choice(['Candidate A', 'Candidate B'], size=30)
```

In [None]:
# Fix the seed so the simulated survey of 500 voters is reproducible
np.random.seed(0)
age_groups = np.random.choice(["18-30", "31-50", "51+"], size=500)
voter_preferences = np.random.choice(["Candidate A", "Candidate B"], size=500)

# One row per voter: age group alongside the preferred candidate
data = pd.DataFrame({"Age Group": age_groups, "Voter Preference": voter_preferences})

# Cross-tabulate counts: age groups as rows, candidates as columns
contingency_table = pd.crosstab(index=data["Age Group"], columns=data["Voter Preference"])

# Chi-Square test of independence on the observed counts
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

# Report the observed table followed by the test summary
print("Contingency Table:", contingency_table, sep="\n")
print(
  "\nChi-Square Test Results:",
  f"Chi-Square Statistic: {chi2:.2f}",
  f"Degrees of Freedom: {dof}",
  f"P-value: {p:.4f}",
  sep="\n",
)

# Decision at the 5% significance level
alpha = 0.05
print(
  "Reject the null hypothesis. There is a significant association between age groups and voter preferences."
  if p < alpha
  else "Fail to reject the null hypothesis. There is no significant association between age groups and voter preferences."
)

#Q22. A company conducted a customer satisfaction survey to determine if there is a significant relationship between product satisfaction levels (Satisfied, Neutral, Dissatisfied) and the region where customers are located (East, West, North, South). The survey data is summarized in a contingency table. Conduct a Chi-Square test to determine if there is a significant relationship between product satisfaction levels and customer regions.

###Sample data:

```python

#Sample data: Product satisfaction levels (rows) vs. Customer regions (columns)

data = np.array([[50, 30, 40, 20], [30, 40, 30, 50], [20, 30, 40, 30]])
```

In [None]:
# Observed counts: product satisfaction levels (rows) vs. customer regions (columns)
data = np.array([
  [50, 30, 40, 20],
  [30, 40, 30, 50],
  [20, 30, 40, 30],
])

# Chi-Square test of independence on the observed counts
chi2, p, dof, expected = stats.chi2_contingency(data)

# Report the observed table followed by the test summary
print("Contingency Table:", data, sep="\n")
print(
  "\nChi-Square Test Results:",
  f"Chi-Square Statistic: {chi2:.2f}",
  f"Degrees of Freedom: {dof}",
  f"P-value: {p:.4f}",
  sep="\n",
)

# Decision at the 5% significance level
alpha = 0.05
print(
  "Reject the null hypothesis. There is a significant relationship between product satisfaction levels and customer regions."
  if p < alpha
  else "Fail to reject the null hypothesis. There is no significant relationship between product satisfaction levels and customer regions."
)

#Q23. A company implemented an employee training program to improve job performance (Effective, Neutral, Ineffective). After the training, they collected data from a sample of employees and classified them based on their job performance before and after the training. Perform a Chi-Square test to determine if there is a significant difference between job performance levels before and after the training.

###Sample data:

```python

# Sample data: Job performance levels before (rows) and after (columns) training

data = np.array([[50, 30, 20], [30, 40, 30], [20, 30, 40]])
```

In [None]:
data = np.array([[50, 30, 20],
                 [30, 40, 30],
                 [20, 30, 40]])

# Perform the Chi-Square test
chi2, p, dof, expected = stats.chi2_contingency(data)

# Print the results
print("Contingency Table:")
print(data)
print("\nChi-Square Test Results:")
print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"Degrees of Freedom: {dof}")
print(f"P-value: {p:.4f}")

# Determine if we reject the null hypothesis
alpha = 0.05
if p < alpha:
  print("Reject the null hypothesis. There is a significant difference in job performance levels before and after the training.")
else:
  print("Fail to reject the null hypothesis. There is no significant difference in job performance levels before and after the training.")

#Q24. A company produces three different versions of a product: Standard, Premium, and Deluxe. The company wants to determine if there is a significant difference in customer satisfaction scores among the three product versions. They conducted a survey and collected customer satisfaction scores for each version from a random sample of customers. Perform an ANOVA test to determine if there is a significant difference in customer satisfaction scores among the three product versions.

### Use the following data:

  ```python

  # Sample data: Customer satisfaction scores for each product version

  standard_scores = [80, 85, 90, 78, 88, 82, 92, 78, 85, 87]

  premium_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]

  deluxe_scores = [95, 98, 92, 97, 96, 94, 98, 97, 92, 99]
  ```

In [None]:
# Customer satisfaction scores for each product version (10 responses each)
standard_scores = [80, 85, 90, 78, 88, 82, 92, 78, 85, 87]
premium_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]
deluxe_scores = [95, 98, 92, 97, 96, 94, 98, 97, 92, 99]

# One-way ANOVA across the three groups
f_statistic, p_value = stats.f_oneway(
  standard_scores,
  premium_scores,
  deluxe_scores,
)

# Report the test summary, one figure per line
print(
  "ANOVA Test Results:",
  f"F-statistic: {f_statistic:.2f}",
  f"P-value: {p_value:.4f}",
  sep="\n",
)

# Decision at the 5% significance level
alpha = 0.05
print(
  "Reject the null hypothesis. There is a significant difference in customer satisfaction scores among the three product versions."
  if p_value < alpha
  else "Fail to reject the null hypothesis. There is no significant difference in customer satisfaction scores among the three product versions."
)