In [None]:
# The code is creating bins for different age groups and grouping the dataframe "df" by those bins. 
# For each age group, it calculates the Pearson correlation coefficient and p-value between the "raw_scores" and "predicted_percentiles" columns.
# It also calculates the actual percentiles, differences between actual and predicted percentiles, mean difference, and standard deviation of the differences.
# It then prints out the age group, correlation, p-value, mean difference, and standard deviation of differences.
# Finally, it checks whether the absolute value of the correlation is greater than 0.7 and the p-value is less than 0.05, and prints whether or not the predicted percentiles are a good indicator of the raw scores for that age group.


# This version creates uncorrelated data, so the predicted percentiles should perform poorly.

import pandas as pd
import numpy as np

# Seed NumPy's global generator so the synthetic sample is reproducible.
np.random.seed(42)

# Draw three independent columns of 183 values each. The order of the draws is
# part of the reproducible state, so keep it: ages, then scores, then percentiles.
ages = np.random.randint(low=20, high=80, size=183)               # integers in [20, 80)
raw_scores = np.random.normal(loc=500, scale=100, size=183)       # mean 500, sd 100
predicted_percentiles = np.random.uniform(low=10, high=90, size=183)

# Assemble the columns into one frame; nothing ties the columns together,
# so scores and predicted percentiles are uncorrelated by construction.
data = dict(age=ages, raw_scores=raw_scores, predicted_percentiles=predicted_percentiles)
df = pd.DataFrame(data)

# Preview the first rows.
print(df.head())

In [None]:
from scipy.stats import pearsonr

# Pearson correlation over the whole sample (no age grouping yet).
score_column = df['raw_scores']
percentile_column = df['predicted_percentiles']
correlation, pvalue = pearsonr(score_column, percentile_column)

# Report the coefficient and its p-value.
for label, value in (("Correlation:", correlation), ("P-value:", pvalue)):
    print(label, value)

In [None]:
# Age bin edges; pd.cut turns them into right-closed intervals (18, 25], (25, 30], ..., (75, 80].
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
# observed=False is passed explicitly so that empty bins are still reported and the
# result does not depend on the default of the installed pandas version.
age_groups = df.groupby(pd.cut(df['age'], bins), observed=False)

for name, group in age_groups:
    print(f"Age group {name}")
    # pearsonr requires at least two observations; skip bins that are empty or hold one row.
    if len(group) < 2:
        print("Not enough observations to compute statistics.")
        continue
    correlation, pvalue = pearsonr(group['raw_scores'], group['predicted_percentiles'])
    # Actual percentile = percentile rank (0-100] of each raw score within its own age group.
    # np.percentile(scores, q) is NOT suitable here: it returns the score values found at
    # the q-th percentiles, which live on the score scale rather than the percentile scale.
    actual_percentiles = group['raw_scores'].rank(pct=True) * 100
    # Both operands are now percentiles, so the difference is meaningful.
    differences = actual_percentiles - group['predicted_percentiles']
    mean_difference = np.mean(differences)
    std_difference = np.std(differences)
    print("Correlation:", correlation)
    print("P-value:", pvalue)
    print("Mean difference:", mean_difference)
    print("Std difference:", std_difference)

In [None]:
# Age bin edges; pd.cut turns them into right-closed intervals (18, 25], (25, 30], ..., (75, 80].
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]

# Group the dataframe "df" by age interval. observed=False is passed explicitly so that
# empty bins are still reported and the result does not depend on the pandas default.
age_groups = df.groupby(pd.cut(df['age'], bins), observed=False)

# Iterate through each age group
for name, group in age_groups:
    # Print the age group name
    print(f"Age group {name}")
    # pearsonr requires at least two observations; skip bins that are empty or hold one row.
    if len(group) < 2:
        print("Not enough observations to compute statistics.")
        continue
    # Pearson correlation coefficient and p-value between raw scores and predicted percentiles
    correlation, pvalue = pearsonr(group['raw_scores'], group['predicted_percentiles'])
    # Actual percentile = percentile rank (0-100] of each raw score within its own age group.
    # np.percentile(scores, q) is NOT suitable here: it returns the score values found at
    # the q-th percentiles, which live on the score scale rather than the percentile scale.
    actual_percentiles = group['raw_scores'].rank(pct=True) * 100
    # Differences between actual and predicted percentiles (both on the 0-100 scale)
    differences = actual_percentiles - group['predicted_percentiles']
    # Mean and standard deviation of the differences
    mean_difference = np.mean(differences)
    std_difference = np.std(differences)
    # Report the statistics for this age group
    print("Correlation:", correlation)
    print("P-value:", pvalue)
    print("Mean difference:", mean_difference)
    print("Std difference:", std_difference)
    # Verdict: the ABSOLUTE correlation must exceed 0.7 and the p-value must be below 0.05.
    # NOTE(review): because of abs(), a strong negative correlation also passes - confirm
    # that this is intended.
    # Optionally, the check could also require small differences, e.g.
    # `and abs(mean_difference) < 3 and abs(std_difference) < 3`.
    if abs(correlation) > 0.7 and pvalue < 0.05:
        print("The predicted percentiles are a good indicator of the raw scores for this age group.")
    else:
        print("The predicted percentiles are not a good indicator of the raw scores for this age group.")


In [None]:
# This code imports the statsmodels library and uses the ordinary least squares (ols) 
# function from the formula api to fit a linear regression model to the data in the dataframe 'df'.

# First, the code creates a new column 'age_groups' in the dataframe 'df' by using the 
# pandas cut function to divide the 'age' column into groups based on the specified 'bins'.

# Then, the code uses the ols function to fit a linear regression model with 'raw_scores' 
# as the dependent variable and 'age_groups' and 'predicted_percentiles' as predictors. The 'data' parameter is set to the dataframe 'df' so that the function can access the data.

# Finally, the summary of the linear regression model is printed. The summary includes information 
# such as the coefficients of the predictors, the p-values, the R-squared value, and the F-statistic, 
# among other things.

# It's important to note that this code assumes that there are no missing values in the dataframe 
# and that the 'age_groups' and 'predicted_percentiles' columns have been correctly computed and cleaned.

import statsmodels.formula.api as smf

# Label every row with its age interval, using the bin edges defined in an earlier cell.
# Note: this adds the column to `df` in place; later cells rely on it.
df['age_groups'] = pd.cut(df['age'], bins=bins)

# Regress raw scores on the categorical age group plus the predicted percentile.
ols_formula = 'raw_scores ~ age_groups + predicted_percentiles'
model = smf.ols(formula=ols_formula, data=df)
results = model.fit()

# Show the regression summary table for the fitted model.
print(results.summary())

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Leave-one-out cross-validation: as many folds as there are rows in the DataFrame,
# so every test fold holds exactly one observation.
kf = KFold(n_splits=len(df))

# One absolute error per held-out row is collected here.
errors = []

for fold_number, (train_index, test_index) in enumerate(kf.split(df)):
    # Split the rows of this fold into training and held-out parts.
    train_data, test_data = df.iloc[train_index], df.iloc[test_index]

    # Refit the regression on the training rows only.
    model = smf.ols(formula='raw_scores ~ age_groups + predicted_percentiles', data=train_data)
    results = model.fit()

    # Predict the held-out row from its predictors and record the absolute error.
    held_out_predictors = test_data[['age_groups', 'predicted_percentiles']]
    predictions = results.predict(held_out_predictors)
    errors.append(mean_absolute_error(test_data['raw_scores'], predictions))

    # Print the summary for the first two folds only, to keep the output short.
    if fold_number < 2:
        print(results.summary())

# Average the per-fold errors into the overall mean absolute error.
print("Mean Absolute Error:", np.mean(errors))

In [46]:

# 5-fold cross-validation of the same regression model.
kf = KFold(n_splits=5)

# Start from an empty error list. Without this reset the list would still contain the
# errors appended by the earlier cross-validation cell, and the mean printed below
# would mix those results with the five folds evaluated here.
errors = []

# Iterate over the train and test indices generated by the KFold instance
for train_index, test_index in kf.split(df):
    # Rows used to fit the model for this fold
    train_data = df.iloc[train_index]
    # Rows held out for evaluation in this fold
    test_data = df.iloc[test_index]
    # Fit the linear regression model using the train dataset
    model = smf.ols(formula='raw_scores ~ age_groups + predicted_percentiles', data=train_data)
    results = model.fit()
    # Generate predictions for the held-out rows from their predictors
    predictions = results.predict(test_data[['age_groups', 'predicted_percentiles']])
    # Record this fold's mean absolute error against the actual raw scores
    errors.append(mean_absolute_error(test_data['raw_scores'], predictions))
    # Print the summary of the model fitted on this fold
    print(results.summary())

# Average of the five per-fold mean absolute errors
print("Mean Absolute Error:", np.mean(errors))

                            OLS Regression Results                            
Dep. Variable:             raw_scores   R-squared:                       0.066
Model:                            OLS   Adj. R-squared:                 -0.019
Method:                 Least Squares   F-statistic:                    0.7802
Date:                Mon, 23 Jan 2023   Prob (F-statistic):              0.670
Time:                        20:55:51   Log-Likelihood:                -876.86
No. Observations:                 146   AIC:                             1780.
Df Residuals:                     133   BIC:                             1819.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                                     coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------