In [1]:
# Import packages
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn as sns

### Questions:
1. NULL cells and question columns: do we need them? For the logistic and linear regressions, do you already have the columns selected?
1. Should we only be looking at rows where Progress = 100 and both gender and years of teaching are filled out? I had to remove 30 rows with Progress < 100, plus 2 other rows that were missing question answers and/or gender/teaching experience.
1. There is a lot of cleanup to do for the free-text form data (any column with `_TEXT` in its name).
1. There are duplicate columns (e.g. "Q26"). For now I'm going to assume the later columns are the ones for teacher knowledge of ADHD.

In [12]:
# Expected response code for each ADHD-knowledge item, keyed by the question
# number (as a string) for Q26 through Q50.
# NOTE(review): the codes 1/2 presumably map to the survey's two answer
# options (e.g. True/False) — confirm against the questionnaire export.
answer_key = {
    "26": 1, "27": 1, "28": 2, "29": 1, "30": 2,
    "31": 1, "32": 2, "33": 2, "34": 2, "35": 2,
    "36": 1, "37": 1, "38": 2, "39": 2, "40": 1,
    "41": 2, "42": 1, "43": 2, "44": 2, "45": 2,
    "46": 2, "47": 1, "48": 1, "49": 2, "50": 2,
}

In [52]:
# Load the raw survey export, then keep only responses that reached 100%
# progress and have non-null answers for both Q4 and Q8 (the columns later
# used for teacher gender and years of teaching).
df = pd.read_csv('adhd.csv')
print("There are", len(df), "responses in the original file. We will only be analyzing teacher's who completed the survey.")
df = (
    df.loc[df['Progress'].eq(100)]
      .dropna(subset=['Q4', 'Q8'])
      .reset_index(drop=True)
)
print("Analyzing", len(df), "responses.")

There are 82 responses in the original file. We will only be analyzing teacher's who completed the survey.
Analyzing 50 responses.


In [120]:
print("Cleaning up data...")
# Derive the vignette student's gender from the block that was shown:
# Block3 and Block6 are coded as male, every other block as female.
df["student_gender"] = np.where(df["FL_16_DO"].isin(["Block3", "Block6"]), "male", "female")

# Clean up teaching years col.
# One respondent typed '16 years' as free text; normalise that entry and cast
# the column to int BEFORE binning. Because of that entry Q8 is presumably
# loaded as text, and comparing text with `< 5` raises TypeError on a fresh
# kernel (Restart & Run All), so the cast must come first.
df.loc[df['Q8'] == '16 years', 'Q8'] = 16
df['Q8'] = df['Q8'].astype('int')

# Bucket years of teaching into three bands; "Other" is the fallback label
# for any value that matches none of the conditions.
df['years_of_teaching'] = np.select(
    [
        df['Q8'] < 5,
        (df['Q8'] >= 5) & (df['Q8'] <= 10),
        df['Q8'] > 10,
    ],
    ["<5 Years", "5-10 Years", ">10 Years"],
    default="Other",
)

# Clean up teacher gender: code 1 -> "male", every other value -> "female".
# NOTE(review): the export also has a Q4_6_TEXT column, which suggests a
# free-text gender option; such responses would be labelled "female" here —
# confirm this is intended.
df["teacher_gender"] = np.where(df["Q4"] == 1, "male", "female")

df.head()

Cleaning up data...


Unnamed: 0,Progress,Duration (in seconds),Finished,ResponseId,Q1,Q4,Q4_6_TEXT,Q5,Q8,Q9,...,Q46,Q47,Q48,Q49,Q50,FL_16_DO,teacher_knowledge,student_gender,years_of_teaching,teacher_gender
0,100,570,1,R_2XmMOwXkxl03RS1,1,2.0,,26.0,5,Elementary and early childhood education (BS),...,2.0,2.0,1.0,2.0,2.0,Block3,20,male,5-10 Years,female
1,100,319,1,R_323Zdlt2e8eOHAZ,1,2.0,,25.0,4,Bachelors degree- early childhood education,...,2.0,2.0,1.0,2.0,1.0,Block3,21,male,<5 Years,female
2,100,275,1,R_3KOu9RKNFFUTc9T,1,2.0,,41.0,11,M Ed,...,2.0,1.0,1.0,1.0,2.0,Block7,23,female,>10 Years,female
3,100,530,1,R_31j7NdjFkRxxvT5,1,2.0,,27.0,6,Childhood Special Education,...,2.0,2.0,1.0,2.0,2.0,Block5,24,female,5-10 Years,female
4,100,346,1,R_2tEx8uK4n6xh7HF,1,2.0,,23.0,2,,...,2.0,1.0,1.0,1.0,1.0,Block5,20,female,<5 Years,female


In [121]:
print("Calculating teacher knowledge of ADHD...")
# Score each respondent on the knowledge items Q26..Q50: one point for every
# item whose response equals the expected code in `answer_key`.
# The comparison is vectorised (column-aligned against the key) instead of
# looping over rows with iterrows; missing answers simply score no point.
knowledge_cols = ['Q' + str(question_num) for question_num in range(26, 51)]
expected = pd.Series({'Q' + str(question_num): answer_key[str(question_num)]
                      for question_num in range(26, 51)})
scores = df[knowledge_cols].eq(expected, axis='columns').sum(axis=1)

# Print any respondent who scored zero so the row can be inspected.
for _, row in df[scores == 0].iterrows():
    print(row)

# Keep the plain-list `grades` name that the original cell exposed.
grades = scores.tolist()
df['teacher_knowledge'] = grades

Calculating teacher knowledge of ADHD...


In [126]:
# Build the modelling frame: the four predictors (teacher knowledge score,
# years-of-teaching band, teacher gender, vignette student gender) plus the
# response column Q25_1, which is renamed to `degree_of_need`.
regression_cols = ['teacher_knowledge','years_of_teaching','teacher_gender','student_gender', 'Q25_1']
df_v0 = df[regression_cols].rename(columns={'Q25_1': 'degree_of_need'})
df_v0.head()

Unnamed: 0,teacher_knowledge,years_of_teaching,teacher_gender,student_gender,degree_of_need
0,20,5-10 Years,female,male,7.0
1,21,<5 Years,female,male,7.0
2,23,>10 Years,female,female,6.0
3,24,5-10 Years,female,female,10.0
4,20,<5 Years,female,female,4.0


In [123]:
# Fit an ordinary-least-squares model of degree_of_need on teacher gender,
# years-of-teaching band, student gender and teacher knowledge score.
# String-valued predictors are dummy-coded by the formula interface, as shown
# by the [T.<level>] terms in the summary.
model_0 = ols(
    formula=(
        'degree_of_need ~ '
        'teacher_gender+years_of_teaching+student_gender+teacher_knowledge'
    ),
    data=df_v0,
).fit()
model_0.summary()

0,1,2,3
Dep. Variable:,degree_of_need,R-squared:,0.131
Model:,OLS,Adj. R-squared:,0.032
Method:,Least Squares,F-statistic:,1.329
Date:,"Thu, 09 Feb 2023",Prob (F-statistic):,0.27
Time:,00:25:37,Log-Likelihood:,-105.88
No. Observations:,50,AIC:,223.8
Df Residuals:,44,BIC:,235.2
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,9.2310,3.754,2.459,0.018,1.665,16.797
teacher_gender[T.male],-1.1460,1.400,-0.819,0.417,-3.967,1.675
years_of_teaching[T.<5 Years],-2.3528,0.936,-2.514,0.016,-4.239,-0.467
years_of_teaching[T.>10 Years],-0.8827,0.696,-1.268,0.212,-2.286,0.520
student_gender[T.male],-0.3987,0.660,-0.604,0.549,-1.730,0.932
teacher_knowledge,-0.0623,0.174,-0.358,0.722,-0.413,0.288

0,1,2,3
Omnibus:,1.402,Durbin-Watson:,1.843
Prob(Omnibus):,0.496,Jarque-Bera (JB):,1.103
Skew:,-0.362,Prob(JB):,0.576
Kurtosis:,2.935,Cond. No.,260.0
