In [2]:
import pandas as pd
from scipy import stats
from scipy.stats import skew, kurtosis

# Step 2: Load the dataset
cardiogoodfitness = pd.read_csv('CardioGoodFitness.csv')

# Step 3: Exclude the 'Product', 'Gender' and 'MaritalStatus' columns so the
# summaries below are computed over the remaining columns only
cardiogoodfitness = cardiogoodfitness.drop(columns=['Product', 'Gender', 'MaritalStatus'])

# Thresholds used by the inferential examples below (named so the printed
# labels and the computations cannot drift apart)
INCOME_THRESHOLD = 50000
EDUCATION_THRESHOLD = 15

# Step 4: Descriptive statistics
# Measures of central tendency
mean_values = cardiogoodfitness.mean()
median_values = cardiogoodfitness.median()
mode_values = cardiogoodfitness.mode().iloc[0]  # Mode may have multiple values, so select the first one

# Measures of variability
range_values = cardiogoodfitness.max() - cardiogoodfitness.min()
variance_values = cardiogoodfitness.var()
std_dev_values = cardiogoodfitness.std()

# Measures of shape (one value per column, in column order)
skewness_values = skew(cardiogoodfitness, nan_policy='omit')
kurtosis_values = kurtosis(cardiogoodfitness, nan_policy='omit')

# Step 5: Inferential statistics
# Example: Hypothesis testing for correlation between age and miles
age = cardiogoodfitness['Age']
miles = cardiogoodfitness['Miles']
correlation_coefficient, correlation_p_value = stats.pearsonr(age, miles)

# Example: Predictions based on linear regression
# (You can replace this with appropriate prediction method for your dataset)
# For example, predicting 'miles' based on 'age'
# Each test keeps its own p-value name; reusing a single `p_value` made the
# correlation report show the t-test's p-value.
slope, intercept, r_value, regression_p_value, std_err = stats.linregress(age, miles)
predicted_miles = slope * age + intercept

# Example: Probability calculation
# (You can replace this with appropriate probability calculation for your dataset)
# For example, probability of income being above a certain threshold
income = cardiogoodfitness['Income']
# The mean of a boolean mask is the fraction of rows that satisfy it
probability_above_threshold = (income > INCOME_THRESHOLD).mean()

# Example: Identifying significant differences or relationships
# (You can replace this with appropriate test for your dataset)
# For example, conducting a t-test to compare 'usage' between two groups
group1 = cardiogoodfitness[cardiogoodfitness['Education'] > EDUCATION_THRESHOLD]
group2 = cardiogoodfitness[cardiogoodfitness['Education'] <= EDUCATION_THRESHOLD]
t_statistic, t_test_p_value = stats.ttest_ind(group1['Usage'], group2['Usage'])

# Legacy name: `p_value` previously ended up holding the t-test p-value
p_value = t_test_p_value

# Print results
print("Descriptive Statistics:")
print("Mean:")
print(mean_values)
print("\nMedian:")
print(median_values)
print("\nMode:")
print(mode_values)
print("\nRange:")
print(range_values)
print("\nVariance:")
print(variance_values)
print("\nStandard Deviation:")
print(std_dev_values)
print("\nSkewness:")
print(skewness_values)
print("\nKurtosis:")
print(kurtosis_values)

print("\nInferential Statistics:")
print("Correlation coefficient between age and miles:", correlation_coefficient)
print("p-value for correlation test:", correlation_p_value)
print("\nPredicted miles based on age:")
print(predicted_miles)
print(f"\nProbability of income above ${INCOME_THRESHOLD:,}:")
print(probability_above_threshold)
# group1 is Education > threshold, group2 is Education <= threshold
print(f"\nT-statistic for usage between Education greater than {EDUCATION_THRESHOLD} "
      f"and {EDUCATION_THRESHOLD} or less groups:")
print(t_statistic)
print("p-value for t-test:", t_test_p_value)


Descriptive Statistics:
Mean:
Age             28.788889
Education       15.572222
Usage            3.455556
Fitness          3.311111
Income       53719.577778
Miles          103.194444
dtype: float64

Median:
Age             26.0
Education       16.0
Usage            3.0
Fitness          3.0
Income       50596.5
Miles           94.0
dtype: float64

Mode:
Age             25
Education       16
Usage            3
Fitness          3
Income       45480
Miles           85
Name: 0, dtype: int64

Range:
Age             32
Education        9
Usage            5
Fitness          4
Income       75019
Miles          339
dtype: int64

Variance:
Age          4.821217e+01
Education    2.614867e+00
Usage        1.176785e+00
Fitness      9.194289e-01
Income       2.724706e+08
Miles        2.689833e+03
dtype: float64

Standard Deviation:
Age              6.943498
Education        1.617055
Usage            1.084797
Fitness          0.958869
Income       16506.684226
Miles           51.863605
dtype: float

In [29]:
import pandas as pd
from scipy import stats
from scipy.stats import skew, kurtosis
from sklearn import preprocessing

In [30]:
# Load the dataset from the CSV file in the working directory; this rebinds
# `cardiogoodfitness`, replacing any frame already held under that name
cardiogoodfitness = pd.read_csv('CardioGoodFitness.csv')

In [32]:
# Dimensions of the loaded frame as (rows, columns)
cardiogoodfitness.shape

(180, 9)

In [33]:
# Column labels available in the loaded frame
cardiogoodfitness.columns

Index(['Product', 'Age', 'Gender', 'Education', 'MaritalStatus', 'Usage',
       'Fitness', 'Income', 'Miles'],
      dtype='object')

In [34]:
# Preview the first rows of the frame to inspect the values in each column
cardiogoodfitness.head()

Unnamed: 0,Product,Age,Gender,Education,MaritalStatus,Usage,Fitness,Income,Miles
0,TM195,18,Male,14,Single,3,4,29562,112
1,TM195,19,Male,15,Single,2,3,31836,75
2,TM195,19,Female,14,Partnered,4,3,30699,66
3,TM195,19,Male,12,Single,3,3,32973,85
4,TM195,20,Male,13,Partnered,4,2,35247,47


In [35]:
# Encode the two categorical columns as integer labels.
# One encoder per column: a single shared LabelEncoder is refit by the second
# fit_transform call, which discards the first column's class mapping and makes
# it impossible to inverse_transform that column later.
gender_encoder = preprocessing.LabelEncoder()
marital_status_encoder = preprocessing.LabelEncoder()

cardiogoodfitness['Gender'] = gender_encoder.fit_transform(cardiogoodfitness['Gender'])
cardiogoodfitness['MaritalStatus'] = marital_status_encoder.fit_transform(cardiogoodfitness['MaritalStatus'])

# Legacy name: the shared encoder used to end up fitted on 'MaritalStatus'
le = marital_status_encoder

# Show the first rows so the encoded values can be checked
cardiogoodfitness.head()


Unnamed: 0,Product,Age,Gender,Education,MaritalStatus,Usage,Fitness,Income,Miles
0,TM195,18,1,14,1,3,4,29562,112
1,TM195,19,1,15,1,2,3,31836,75
2,TM195,19,0,14,0,4,3,30699,66
3,TM195,19,1,12,1,3,3,32973,85
4,TM195,20,1,13,0,4,2,35247,47


In [36]:
# Remove the 'Product' column. Because the result is assigned back to the same
# name, re-running this cell would otherwise raise KeyError (the column is
# already gone); errors='ignore' makes the cell safe to re-run.
cardiogoodfitness = cardiogoodfitness.drop(columns=['Product'], errors='ignore')

In [38]:
# Summary table (count, mean, std, min, quartiles, max) for each column of the frame
cardiogoodfitness.describe()

Unnamed: 0,Age,Gender,Education,MaritalStatus,Usage,Fitness,Income,Miles
count,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0
mean,28.788889,0.577778,15.572222,0.405556,3.455556,3.311111,53719.577778,103.194444
std,6.943498,0.495291,1.617055,0.492369,1.084797,0.958869,16506.684226,51.863605
min,18.0,0.0,12.0,0.0,2.0,1.0,29562.0,21.0
25%,24.0,0.0,14.0,0.0,3.0,3.0,44058.75,66.0
50%,26.0,1.0,16.0,0.0,3.0,3.0,50596.5,94.0
75%,33.0,1.0,16.0,1.0,4.0,4.0,58668.0,114.75
max,50.0,1.0,21.0,1.0,7.0,5.0,104581.0,360.0


In [39]:
# Thresholds used by the inferential examples below
INCOME_THRESHOLD = 50000
EDUCATION_THRESHOLD = 15

# Step 4: Descriptive statistics
# Measures of central tendency
mean_values = cardiogoodfitness.mean()
median_values = cardiogoodfitness.median()
mode_values = cardiogoodfitness.mode().iloc[0]  # Mode may have multiple values, so select the first one

# Measures of variability
range_values = cardiogoodfitness.max() - cardiogoodfitness.min()
variance_values = cardiogoodfitness.var()
std_dev_values = cardiogoodfitness.std()

# Measures of shape (one value per column, in column order)
skewness_values = skew(cardiogoodfitness, nan_policy='omit')
kurtosis_values = kurtosis(cardiogoodfitness, nan_policy='omit')

# Step 5: Inferential statistics
# Example: Hypothesis testing for correlation between age and miles
age = cardiogoodfitness['Age']
miles = cardiogoodfitness['Miles']
correlation_coefficient, correlation_p_value = stats.pearsonr(age, miles)

# Example: Predictions based on linear regression
# (You can replace this with appropriate prediction method for your dataset)
# For example, predicting 'miles' based on 'age'
# Each test keeps its own p-value name so one result does not overwrite another.
slope, intercept, r_value, regression_p_value, std_err = stats.linregress(age, miles)
predicted_miles = slope * age + intercept

# Example: Probability calculation
# (You can replace this with appropriate probability calculation for your dataset)
# For example, probability of income being above a certain threshold
income = cardiogoodfitness['Income']
# The mean of a boolean mask is the fraction of rows that satisfy it
probability_above_threshold = (income > INCOME_THRESHOLD).mean()

# Example: Identifying significant differences or relationships
# (You can replace this with appropriate test for your dataset)
# For example, conducting a t-test to compare 'usage' between two groups
group1 = cardiogoodfitness[cardiogoodfitness['Education'] > EDUCATION_THRESHOLD]
group2 = cardiogoodfitness[cardiogoodfitness['Education'] <= EDUCATION_THRESHOLD]
t_statistic, t_test_p_value = stats.ttest_ind(group1['Usage'], group2['Usage'])

# Legacy name: `p_value` previously ended up holding the t-test p-value, and
# code that still reads `p_value` keeps seeing that same value.
p_value = t_test_p_value

In [40]:
# Print results
# By the time this cell runs, `p_value` holds the t-test p-value (it is the
# last value assigned to that name). Printing it under the correlation label
# reported the wrong number, so the correlation p-value is recomputed here
# from `age` and `miles`.
_, correlation_p_value = stats.pearsonr(age, miles)

print("Descriptive Statistics:")
print("Mean:")
print(mean_values)
print("\nMedian:")
print(median_values)
print("\nMode:")
print(mode_values)
print("\nRange:")
print(range_values)
print("\nVariance:")
print(variance_values)
print("\nStandard Deviation:")
print(std_dev_values)
print("\nSkewness:")
print(skewness_values)
print("\nKurtosis:")
print(kurtosis_values)

print("\nInferential Statistics:")
print("Correlation coefficient between age and miles:", correlation_coefficient)
print("p-value for correlation test:", correlation_p_value)
print("\nPredicted miles based on age:")
print(predicted_miles)
print("\nProbability of income above $50,000:")
print(probability_above_threshold)
# The groups are split as Education > 15 versus Education <= 15
print("\nT-statistic for usage between Education greater than 15 and 15 or less groups:")
print(t_statistic)
print("p-value for t-test:", p_value)


Descriptive Statistics:
Mean:
Age                 28.788889
Gender               0.577778
Education           15.572222
MaritalStatus        0.405556
Usage                3.455556
Fitness              3.311111
Income           53719.577778
Miles              103.194444
dtype: float64

Median:
Age                 26.0
Gender               1.0
Education           16.0
MaritalStatus        0.0
Usage                3.0
Fitness              3.0
Income           50596.5
Miles               94.0
dtype: float64

Mode:
Age                 25
Gender               1
Education           16
MaritalStatus        0
Usage                3
Fitness              3
Income           45480
Miles               85
Name: 0, dtype: int64

Range:
Age                 32
Gender               1
Education            9
MaritalStatus        1
Usage                5
Fitness              4
Income           75019
Miles              339
dtype: int64

Variance:
Age              4.821217e+01
Gender           2.453135e-01
Ed