# Paired Sample t-Test in Python
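This notebook demonstrates how to perform a paired-samples t-test on synthetic GPA data (the `gpa_before_it` and `gpa_after_it` columns). Later cells also compute Cronbach's alpha, Pearson and Spearman correlations, and an OLS regression on related datasets.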

In [1]:
import numpy as np
import pandas as pd
import pingouin as pg
import statistics 


# --- Paired-samples t-test: GPA after vs. GPA before ---
# Read the seminar data and pull the two GPA columns out as Series.
df = pd.read_csv('seminar dataset.csv')
gpa_before, gpa_after = df['gpa_before_it'], df['gpa_after_it']

# paired=True requests the paired (dependent-samples) variant from Pingouin.
test = pg.ttest(gpa_after, gpa_before, paired=True)

# Show the full results table first ...
print(test)

# ... then the two headline numbers, taken from the first row of the table.
t_statistic = test['T'].iloc[0]
p_value_ttest = test['p-val'].iloc[0]
print(f"t-statistic: {t_statistic}")
print(f"p-value: {p_value_ttest}")


                T  dof alternative         p-val         CI95%   cohen-d  \
T-test  11.389597   99   two-sided  1.049694e-19  [0.15, 0.21]  0.375083   

             BF10     power  
T-test  4.858e+16  0.960305  
t-statistic: 11.389596584622845
p-value: 1.0496940392656813e-19


In [2]:


# Arithmetic mean of the `gpa_before` Series, via the standard-library
# `statistics` module (both names come from the first cell).
mean_gpa_before = statistics.mean(gpa_before)
print(mean_gpa_before)

3.2239


In [3]:

# Arithmetic mean of the `gpa_after` Series, via the standard-library
# `statistics` module (both names come from the first cell).
mean_gpa_after = statistics.mean(gpa_after)
print(mean_gpa_after)

3.4001


In [4]:
# Switch to the binary-coded seminar file; note this rebinds `df`.
df = pd.read_csv('seminar dataset binary.csv')

# Cronbach's alpha computed on the whole frame with a 99% confidence
# interval. Left as the final bare expression so the notebook displays it.
pg.cronbach_alpha(data=df, ci=0.99)

(np.float64(0.7956328451882844), array([0.693, 0.871]))

In [27]:
import pandas as pd

# Read the correlation dataset.
df = pd.read_csv('pcc_dataset.csv')

# Keep only the two columns of interest and discard incomplete rows.
df_combined = df[['gpa_after_it', 'concepts_transferred_binary']].dropna()

# Pairwise Pearson correlation matrix ('pearson' is the pandas default,
# spelled out here for the reader).
correlation_matrix = df_combined.corr(method='pearson')

# The off-diagonal entry is the coefficient between the two columns.
pcc = correlation_matrix.loc['gpa_after_it', 'concepts_transferred_binary']

print("Pearson Correlation Coefficient:", pcc)






# df.corr() # returns a matrix with each column's correlation to all the others

# PCC and p-value(significance) using Scipy
# from scipy.stats import pearsonr
# pearsonr(df['colA'], df['colB'])

# # PCC, p-value, and Confidence Level, etc. using pingouin
# from pingouin import corr
# corr(df['colA'], df['colB'])

# # PCC using researchpy
# from researchpy.correlation import corr_case
# corr_case(df[['colA','colB']])

# # PCC using Numpy
# import numpy as np
# arrayOne = np.array(df['colA'])
# arrayTwo = np.array(df['colB'])
# np.corrcoef(arrayOne, arrayTwo)

# # PCC using pyspark
# from pyspark.sql.functions import corr
# df.select(corr('colA','colB')).show()


Pearson Correlation Coefficient: 0.06240406277000611


In [29]:
import pandas as pd
import pingouin as pg

# Read the correlation dataset.
df = pd.read_csv('pcc_dataset.csv')

# Keep only the two columns of interest and discard incomplete rows.
df_combined = df[['gpa_after_it', 'concepts_transferred_binary']].dropna()

# Spearman rank correlation via Pingouin; the result is a one-row table.
spearman_corr = pg.corr(
    df_combined['gpa_after_it'],
    df_combined['concepts_transferred_binary'],
    method='spearman',
)

# Report just the coefficient (column 'r', first row).
spearman_r = spearman_corr['r'].iloc[0]
print("Spearman's Rank Correlation Coefficient:", spearman_r)


Spearman's Rank Correlation Coefficient: 0.033891978431396234


In [31]:
import pandas as pd
from scipy.stats import spearmanr

# Read the dataset and derive the per-row GPA change (after minus before).
df = pd.read_csv('pcc_dataset.csv').assign(
    gpa_change=lambda frame: frame['gpa_after_it'] - frame['gpa_before_it']
)

# Keep only the two columns of interest and discard incomplete rows.
df_combined = df[['gpa_change', 'concepts_transferred_binary']].dropna()

# SciPy returns the rank correlation together with its p-value.
spearman_corr, p_value = spearmanr(
    df_combined['gpa_change'],
    df_combined['concepts_transferred_binary'],
)

print("Spearman's Rank Correlation Coefficient between GPA Change and Concepts Transferred:", spearman_corr)
print("p-value:", p_value)


Spearman's Rank Correlation Coefficient between GPA Change and Concepts Transferred: 0.15791553974810787
p-value: 0.0475175060369151


In [33]:
import pandas as pd
import statsmodels.api as sm

# Load the dataset
df = pd.read_csv('pcc_dataset.csv')

# Calculate the change in GPA
df['gpa_change'] = df['gpa_after_it'] - df['gpa_before_it']

# Restrict to complete cases, matching the correlation cells that use this
# same file. Without this, missing values would reach sm.OLS unfiltered
# (its default does no NaN handling). With no missing values the fit is
# unchanged.
ols_data = df[['gpa_change', 'concepts_transferred_binary']].dropna()

# Define the independent variable(s) and dependent variable
X = ols_data[['concepts_transferred_binary']]
X = sm.add_constant(X)  # Adds a constant (intercept) term to the model
y = ols_data['gpa_change']

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the regression
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:             gpa_change   R-squared:                       0.035
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     5.686
Date:                Mon, 02 Sep 2024   Prob (F-statistic):             0.0183
Time:                        12:44:09   Log-Likelihood:                 77.308
No. Observations:                 158   AIC:                            -150.6
Df Residuals:                     156   BIC:                            -144.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

In [34]:
# import pandas as pd
# import statsmodels.api as sm

#load dataset
# df = pd.read_csv('seminar dataset binary.csv')


# # Independent variables: Recommendation and Satisfaction Rating
# X = df[['recommend_binary', 'experience_rating_binary']]

# # Add a constant (intercept)
# X = sm.add_constant(X)

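# # Dependent variable: GPA
# # NOTE(review): mean_gpa_after is a single scalar (the mean computed in an
# # earlier cell), not one value per row of X, so OLS cannot be fit against it
# # as written; a per-row GPA column would be needed here.
# Y = mean_gpa_after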

# # Perform the regression
# model = sm.OLS(Y, X).fit()

# # Print the summary of the regression
# print(model.summary())

# print(mean_gpa_after)