# Practice importing and analyzing a dataset using statsmodels
This notebook uses a dataset with variables from my research; the data were fabricated by ChatGPT for practice purposes only.
Skills practiced:
- Reading in dataset
- Dealing with missing data (continuous and categorical)
- Running simple linear regression
- Recoding categorical variables
- Running multiple linear regression (controlling for age and gender)

In [1]:
# Set the working directory so that relative file paths resolve against it
import os

# Directory to work in. The PRACTICE_DATA_DIR environment variable overrides
# the default, so the notebook can also run on machines where the original
# Desktop path does not exist.
new_directory = os.environ.get('PRACTICE_DATA_DIR', '/Users/mickey.rice/Desktop')

# Change the current working directory (raises FileNotFoundError if it is missing)
os.chdir(new_directory)

# Verify the change
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

Current Working Directory: /Users/mickey.rice/Desktop


In [2]:
# Read the practice dataset (path is relative to the working directory)
import pandas as pd

csv_file_path = 'PracticeData.csv'
PracticeData_csv = pd.read_csv(filepath_or_buffer=csv_file_path)

# Preview the first five rows under a plain-text heading
print("Practice Data:")
display(PracticeData_csv.head(n=5))

Practice Data:


Unnamed: 0,ID,Accuracy,RT,PHQ9,CESD,DASS,PRMQ1,Confidence,JOL,N300,PP,Age,Sex,Gender,Race,Ethnicity,Handedness
0,1.0,97.0,289.0,10.0,50.0,,15.0,85.0,63.0,351.0,132.0,18.0,Male,Female,Asian,Hispanic or Latino,Right
1,2.0,89.0,344.0,6.0,,11.0,13.0,50.0,95.0,311.0,121.0,27.0,,Non-binary,Black or African American,Hispanic or Latino,Ambidextrous
2,3.0,74.0,278.0,,45.0,25.0,13.0,81.0,82.0,223.0,127.0,26.0,Female,Male,Other,Hispanic or Latino,Ambidextrous
3,4.0,94.0,349.0,14.0,31.0,4.0,13.0,64.0,76.0,262.0,111.0,42.0,,Other,Black or African American,Hispanic or Latino,Right
4,5.0,84.0,269.0,4.0,49.0,15.0,19.0,85.0,75.0,326.0,116.0,31.0,Female,Other,Other,Hispanic or Latino,Left


## Dealing with missing data


In [3]:
import numpy as np

# Per-column tally of missing entries (isna is an alias of isnull)
PracticeData_csv.isna().sum()

ID             0
Accuracy      10
RT            10
PHQ9          10
CESD          10
DASS          10
PRMQ1         10
Confidence    10
JOL           10
N300          10
PP            10
Age           10
Sex           10
Gender        10
Race          10
Ethnicity     10
Handedness    10
dtype: int64

In [4]:
# Fill in the missing data with linear interpolation for the numerical columns.
# Each interpolated column is assigned back to the DataFrame instead of calling
# interpolate(inplace=True) on a selected column: that form acts on an
# intermediate object, triggers a chained-assignment warning in recent pandas,
# and leaves the DataFrame unchanged once Copy-on-Write is enabled.
numeric_columns = ['Accuracy', 'RT', 'PHQ9', 'CESD', 'DASS', 'PRMQ1',
                   'Confidence', 'JOL', 'N300', 'PP', 'Age']
for column in numeric_columns:
    PracticeData_csv[column] = PracticeData_csv[column].interpolate(method='linear')

# NOTE: with the default settings, linear interpolation only fills gaps that
# come after a valid value, so a missing value in the very first row stays
# missing (this is why DASS still reports 1 missing value below).
# Passing limit_direction='both' would fill such leading gaps as well.

# Fill missing values for gender with unknown.
# The other categorical columns (Sex, Race, Ethnicity, Handedness) are left as-is.
PracticeData_csv['Gender'] = PracticeData_csv['Gender'].fillna('Unknown')

## Other options: drop missing data (drop rows)
### PracticeData_csv.dropna(inplace=True)

# Check if there are any missing values left
PracticeData_csv.isnull().sum()

ID             0
Accuracy       0
RT             0
PHQ9           0
CESD           0
DASS           1
PRMQ1          0
Confidence     0
JOL            0
N300           0
PP             0
Age            0
Sex           10
Gender         0
Race          10
Ethnicity     10
Handedness    10
dtype: int64

## Regression using the statsmodels package
### Controlling for nothing

In [5]:
# Simple regression: Accuracy predicted by standardized PHQ9 (no covariates)
import statsmodels.api as sm

# Build a copy of the data whose PHQ9 column is z-scored
# (mean-centered, then divided by the pandas .std() of the column)
PracticeData_csv_standardized = PracticeData_csv.assign(
    PHQ9=lambda frame: (frame['PHQ9'] - frame['PHQ9'].mean()) / frame['PHQ9'].std()
)

# Predictor matrix: the standardized PHQ9 column plus a constant column for
# the intercept; the outcome is Accuracy
PHQ9 = sm.add_constant(PracticeData_csv_standardized[['PHQ9']])
Accuracy = PracticeData_csv_standardized['Accuracy']

# Ordinary least squares fit
model = sm.OLS(Accuracy, PHQ9)
results = model.fit()

# Show the full regression summary table
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               Accuracy   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     2.701
Date:                Wed, 10 Jul 2024   Prob (F-statistic):              0.104
Time:                        12:50:41   Log-Likelihood:                -354.53
No. Observations:                 100   AIC:                             713.1
Df Residuals:                      98   BIC:                             718.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         83.9200      0.847     99.089      0.0

### Controlling for age and gender

In [7]:
# Code categorical variable (Gender) using one-hot encoding

# The guard makes this cell safe to re-run: after the first run the 'Gender'
# column has been replaced by its dummy columns, and encoding it a second
# time would raise a KeyError.
if 'Gender' in PracticeData_csv.columns:
    # Recode 'Non-binary' to 'Other'
    PracticeData_csv['Gender'] = PracticeData_csv['Gender'].replace('Non-binary', 'Other')

    # One-hot encode the gender variable.
    # drop_first=True drops the first category, which becomes the reference
    # level for the remaining dummy columns.
    # dtype=int keeps the dummies as 0/1 integers; newer pandas versions
    # default to booleans, which would make a mixed float/bool design matrix
    # object-typed and unusable by statsmodels.
    PracticeData_csv = pd.get_dummies(
        PracticeData_csv, columns=['Gender'], drop_first=True, dtype=int
    )

# Display the first few rows to see the changes
PracticeData_csv.head()

Unnamed: 0,ID,Accuracy,RT,PHQ9,CESD,DASS,PRMQ1,Confidence,JOL,N300,PP,Age,Sex,Race,Ethnicity,Handedness,Gender_Male,Gender_Other,Gender_Unknown
0,1.0,97.0,289.0,10.0,50.0,,15.0,85.0,63.0,351.0,132.0,18.0,Male,Asian,Hispanic or Latino,Right,0,0,0
1,2.0,89.0,344.0,6.0,47.5,11.0,13.0,50.0,95.0,311.0,121.0,27.0,,Black or African American,Hispanic or Latino,Ambidextrous,0,1,0
2,3.0,74.0,278.0,10.0,45.0,25.0,13.0,81.0,82.0,223.0,127.0,26.0,Female,Other,Hispanic or Latino,Ambidextrous,1,0,0
3,4.0,94.0,349.0,14.0,31.0,4.0,13.0,64.0,76.0,262.0,111.0,42.0,,Black or African American,Hispanic or Latino,Right,0,1,0
4,5.0,84.0,269.0,4.0,49.0,15.0,19.0,85.0,75.0,326.0,116.0,31.0,Female,Other,Hispanic or Latino,Left,0,1,0


In [11]:
# Regress Accuracy on PHQ9 while controlling for Age and Gender (statsmodels OLS)

from sklearn.preprocessing import StandardScaler

# Separate predictors (X) and target (y).
# Selecting the columns and casting to float yields a new frame, so the
# original dataframe is not modified. The cast also guarantees a purely
# numeric design matrix even if the dummy columns are stored as booleans
# (a float/bool mix would become object dtype and be rejected by OLS).
predictor_columns = ['PHQ9', 'Age', 'Gender_Male', 'Gender_Other', 'Gender_Unknown']
X = PracticeData_csv[predictor_columns].astype(float)
y = PracticeData_csv['Accuracy']

# Standardize only continuous variables (PHQ9 and Age); dummies stay 0/1.
# NOTE: StandardScaler divides by the population standard deviation (ddof=0),
# whereas pandas .std() defaults to ddof=1, so these z-scores differ by a
# small constant factor from a pandas-based standardization.
continuous_columns = ['PHQ9', 'Age']
scaler = StandardScaler()
X.loc[:, continuous_columns] = scaler.fit_transform(X[continuous_columns])

# Add constant to X for intercept term
X = sm.add_constant(X)

# Fit OLS (Ordinary Least Squares) model
model = sm.OLS(y, X)
results = model.fit()

# Print summary which includes coefficients, standard errors, p-values, etc.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               Accuracy   R-squared:                       0.040
Model:                            OLS   Adj. R-squared:                 -0.011
Method:                 Least Squares   F-statistic:                    0.7837
Date:                Wed, 10 Jul 2024   Prob (F-statistic):              0.564
Time:                        12:57:05   Log-Likelihood:                -353.84
No. Observations:                 100   AIC:                             719.7
Df Residuals:                      94   BIC:                             735.3
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             84.2839      1.666     50.