In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

import glob

from sklearn.decomposition import PCA

#from pandas_profiling import ProfileReport

# Import Exported CSV files from Previous Notebooks

In [2]:
# Human development report table (hdr_df.csv)
human_development_reports_df = pd.read_csv(filepath_or_buffer='hdr_df.csv')

# Country id lookup table (country_ids.csv)
country_ids = pd.read_csv(filepath_or_buffer='country_ids.csv')

# Read Codebook

## 3 columns to focus on: Variable, Label, Value Scheme Detailed

In [3]:
def read_codebook(file, sheet1, sheet2):
    """Read two sheets from one Excel codebook workbook.

    Parameters
    ----------
    file : path of the workbook, passed to ``pd.ExcelFile``.
    sheet1, sheet2 : names of the sheets to load.

    Returns
    -------
    tuple of two DataFrames: ``(contents of sheet1, contents of sheet2)``.
    """
    # The context manager closes the workbook handle once both sheets have
    # been parsed, instead of leaving the file open.
    with pd.ExcelFile(file) as xls:
        student = pd.read_excel(xls, sheet1)
        school = pd.read_excel(xls, sheet2)

    return student, school

In [4]:
# NOTE(review): stu_con_4 is read from sheet 'ACGM7' and sch_con_4 from sheet
# 'ASGM7'; later cells label the ASBM* items from sch_con_4, so the student /
# school names look swapped relative to the sheets -- confirm before renaming.
stu_con_4, sch_con_4 = read_codebook(
    file='T19_G4_Codebooks/T19_G4_Codebook.xlsx',
    sheet1='ACGM7',
    sheet2='ASGM7',
)

# Read Main Data

In [5]:
# Every ACG SAS file in the data folder
ACG_files = glob.glob('T19_G4_SAS Data/ACG' + "/*.sas7bdat")

count_ACG_list = []

# One DataFrame per file, in the order glob returned them
ACG_list = [pd.read_sas(sas_path) for sas_path in ACG_files]

#All ACG files; total of 14,391 rows × 102 columns
ACG_merged_all = pd.concat(ACG_list, axis=0, ignore_index=True)

In [6]:
ASG_files = glob.glob('T19_G4_SAS Data/ASG' + "/*.sas7bdat")

ASG_list_239 = []
ASG_list_159 = []

for file in ASG_files:
    # Parse each SAS file exactly once and reuse the frame for both the
    # column-count check and the append (previously every file was read two
    # or three times).
    ASG = pd.read_sas(file)
    n_columns = len(ASG.columns)

    if n_columns == 239:
        ASG_list_239.append(ASG)
    elif n_columns == 159:
        ASG_list_159.append(ASG)
    # Files with any other column count are skipped, as before.

#ASG files with 239 columns; total of 339,811 rows × 239 columns
#ASG files with 159 columns; total of 47,416 rows × 159 columns
ASG_merged_239 = pd.concat(ASG_list_239, axis=0, ignore_index=True)
ASG_merged_159 = pd.concat(ASG_list_159, axis=0, ignore_index=True)

#All ASG files; total of 387,227 rows × 239 columns
# (the 159-column rows are stacked first; columns they lack are filled with NaN)
ASG_merged_all = pd.concat([ASG_merged_159, ASG_merged_239], axis=0, ignore_index=True)

In [7]:
# Attach the ACG columns to every ASG row sharing the same country and school id
df_4 = pd.merge(
    ASG_merged_all,
    ACG_merged_all,
    how='inner',
    on=['IDCNTRY', 'IDSCHOOL'],
)

## Specify columns to use for grade 4

In [8]:
#Specify columns for PCA
def _columns_with(tag):
    """Return the df_4 column names that contain the substring `tag`."""
    return [name for name in df_4.columns if tag in name]


# Groups selected by a shared item code inside the column name
shortage_cols = _columns_with('CBG13')
school_problem_cols = _columns_with('CBG15')
primary_compedency_cols = _columns_with('CBG17')

math_interest_cols = _columns_with('SBM02')
math_teacher_cols = _columns_with('SBM03')
math_perception_cols = _columns_with('SBM05')

science_interest_cols = _columns_with('SBS07')
science_teacher_cols = _columns_with('SBS08')
science_perception_cols = _columns_with('SBS09')

belong_cols = _columns_with('SBG10')
bully_cols = _columns_with('SBG11')

# ACBG14 items are split by hand into three groups (A-D, E-H, I-K)
teacher_support_cols = [
    'ACBG14A', 'ACBG14B', 'ACBG14C', 'ACBG14D',
]
parents_support_cols = [
    'ACBG14E', 'ACBG14F', 'ACBG14G', 'ACBG14H',
]
student_motivation_cols = [
    'ACBG14I', 'ACBG14J', 'ACBG14K',
]

In [9]:
# Flatten every PCA column group into one list, keeping the group order
pca_columns = [
    name
    for group in (
        math_interest_cols, math_teacher_cols, math_perception_cols,
        science_interest_cols, science_teacher_cols, science_perception_cols,
        belong_cols, bully_cols, shortage_cols, school_problem_cols,
        primary_compedency_cols, teacher_support_cols, parents_support_cols,
        student_motivation_cols,
    )
    for name in group
]

In [10]:
# Identifier columns (ID* prefix)
id_columns = [
    'IDCNTRY',
    'IDSCHOOL',
    'IDCLASS',
    'IDSTUD',
]

In [11]:
# General ACBG* columns kept as they are (not part of any PCA group)
school_general = [
    'ACBG03A',
    'ACBG05B',
    'ACBG06A', 'ACBG06B', 'ACBG06C',
    'ACBG07',
    'ACBG08A',
    'ACBG10B',
    'ACBG12',
]

In [12]:
# General ASB* columns kept as they are (not part of any PCA group)
student_general = [
    'ASBG01',
    'ASBG03',
    'ASBG04',
    'ASBG05A', 'ASBG05B', 'ASBG05C', 'ASBG05D',
    'ASBG09A', 'ASBG09B',
    'ASBM01',
    'ASBS06',
]

In [13]:
# The two score columns used as outcomes
score_columns = [
    'ASMMAT01',
    'ASSSCI01',
]

In [14]:
# Full column selection: ids, general columns, PCA inputs, then scores
all_columns = [
    *id_columns,
    *student_general,
    *school_general,
    *pca_columns,
    *score_columns,
]

In [15]:
# Keep only the selected columns, in the order of all_columns
df_4 = df_4.loc[:, all_columns]

In [16]:
# Drop every row that has a missing value in any PCA column
# (dropna defaults: axis=0, how='any')
df_4 = df_4.dropna(subset=pca_columns)

In [17]:
# Drop every row that has a missing value in either of the two score columns
# (dropna defaults: axis=0, how='any')
df_4 = df_4.dropna(subset=score_columns)

## Fix Column Values for Principal Component Analysis

In [18]:
# Reverse-code negatively worded items so that their 1-4 responses point in the
# same direction as the other items of the same question group.

def map_values(col_name, df=None):
    """Reverse a 1-4 response column in place (1<->4, 2<->3).

    Parameters
    ----------
    col_name : label of the column to reverse.
    df : DataFrame to modify. Defaults to the notebook-level ``df_4``, so
        existing one-argument calls behave exactly as before.

    Returns
    -------
    The updated column, ``df[col_name]``.

    Notes
    -----
    Values other than 1, 2, 3 and 4 become NaN (``Series.map`` with a dict).
    Applying the function twice restores the original coding, so it should
    be run only once per column.
    """
    frame = df_4 if df is None else df
    frame[col_name] = frame[col_name].map({1: 4, 2: 3, 3: 2, 4: 1})
    return frame[col_name]

In [19]:
# Columns whose response scale gets reversed
cols_to_fix = [
    'ASBM02B', 'ASBM02C',
    'ASBM05B', 'ASBM05C', 'ASBM05E', 'ASBM05H', 'ASBM05I',
    'ASBS07B', 'ASBS07C',
    'ASBS09B', 'ASBS09C', 'ASBS09F', 'ASBS09G',
]

In [20]:
# Reverse-code each listed column.
# Re-running this cell flips the values back, so run it once per kernel session.
for column_name in cols_to_fix:
    map_values(column_name)

In [21]:
# Categorize columns for math and science for PCA.
# Each group is an explicit copy: a 'PCA_values' column is added to these
# frames later, and a plain df_4[...] selection is flagged by pandas as a
# possible slice of df_4 (SettingWithCopyWarning on assignment).
math_interest = df_4[math_interest_cols].copy()
math_perception = df_4[math_perception_cols].copy()
math_teacher = df_4[math_teacher_cols].copy()
science_interest = df_4[science_interest_cols].copy()
science_perception = df_4[science_perception_cols].copy()
science_teacher = df_4[science_teacher_cols].copy()

In [22]:
# Categorize columns for other measures for PCA.
# Each group is an explicit copy: a 'PCA_values' column is added to these
# frames later, and a plain df_4[...] selection is flagged by pandas as a
# possible slice of df_4 (SettingWithCopyWarning on assignment).
belong = df_4[belong_cols].copy()
bully = df_4[bully_cols].copy()
shortage = df_4[shortage_cols].copy()
school_problem = df_4[school_problem_cols].copy()
primary_compedency = df_4[primary_compedency_cols].copy()

teacher_support = df_4[teacher_support_cols].copy()
parents_support = df_4[parents_support_cols].copy()
student_motivation = df_4[student_motivation_cols].copy()

# Principal Component Analysis

In [24]:
def pca_caculation(df):
    """Add the first principal component of `df`'s columns as 'PCA_values'.

    The feature columns are standardized with ``StandardScaler`` and reduced
    to one component with ``PCA(n_components=1)``.

    Parameters
    ----------
    df : DataFrame whose columns (other than 'PCA_values') are the PCA inputs.

    Returns
    -------
    The same DataFrame object, with a 'PCA_values' column added or overwritten.
    """
    # Leave out a component written by an earlier call, so re-running the cell
    # does not feed the previous result back in as a feature (this matches the
    # column filter in get_factor_loadings).
    feature_cols = [col for col in df.columns if col != 'PCA_values']
    X = df[feature_cols]

    # Scale column values
    x = StandardScaler().fit_transform(X)

    # PCA calculation
    pca = PCA(n_components=1)
    principal_components = pca.fit_transform(x)

    # fit_transform returns an (n_samples, 1) array; store its single column
    df['PCA_values'] = principal_components[:, 0]

    return df

In [25]:
# All PCA group frames. The position in this list is used as the temporary
# column label in the loop that follows, so the order matters.
pca_dfs = [
    math_interest,
    math_perception,
    math_teacher,
    science_interest,
    science_perception,
    science_teacher,
    belong,
    bully,
    shortage,
    primary_compedency,
    school_problem,
    teacher_support,
    parents_support,
    student_motivation,
]

In [26]:
# Compute the component for every group and store it on df_4 under the
# group's position in pca_dfs (integer column labels 0, 1, 2, ...).
for position, group_df in enumerate(pca_dfs):
    df_4[position] = pca_caculation(group_df)['PCA_values']

In [27]:
# The raw item columns are no longer needed once the components are stored
df_4 = df_4.drop(columns=pca_columns)

In [28]:
# Names for the integer-labelled component columns, listed by position
pca_score_names = [
    'math_interest',
    'math_perception',
    'math_teacher',
    'science_interest',
    'science_perception',
    'science_teacher',
    'belong',
    'bully',
    'shortage',
    'primary_compedency',
    'school_problem',
    'teacher_support',
    'parents_support',
    'student_motivation',
]
df_4 = df_4.rename(columns=dict(enumerate(pca_score_names)))

# Labels

In [29]:
# First two codebook columns from row 2 onward, indexed by 'Variable'
stu_con_4_columns = stu_con_4.iloc[2:, :2].set_index('Variable')

# Keep only the variables that are still columns of df_4, and drop rows with
# missing values
stu_4_shared_vars = stu_con_4_columns.index.intersection(df_4.columns)
stu_4_labels = stu_con_4_columns.loc[stu_4_shared_vars].dropna().reset_index()

In [30]:
# First two codebook columns from row 5 onward, indexed by 'Variable'
sch_con_4_columns = sch_con_4.iloc[5:, :2].set_index('Variable')

# Keep only the variables that are still columns of df_4, and drop rows with
# missing values
sch_4_shared_vars = sch_con_4_columns.index.intersection(df_4.columns)
sch_4_labels = sch_con_4_columns.loc[sch_4_shared_vars].dropna().reset_index()

In [31]:
# Combine both label tables and lower-case the labels
sch_stu_4_labels = stu_4_labels.merge(sch_4_labels, how='outer')
sch_stu_4_labels['Label'] = sch_stu_4_labels['Label'].str.lower()

# Ordered (regex pattern, replacement) pairs. They are applied one after the
# other, so earlier rewrites change what later patterns see.
# NOTE(review): the short patterns ('sch', 'tch', 'std', 'disadva') match
# anywhere in the text, including inside a word that is already spelled out --
# check the resulting labels.
label_rewrites = [
    ('gen\\\\', ''),
    ('students background\\\\', ''),
    ('disadva', 'disadvantage'),
    ('sch character\\\\', ''),
    ('tch', 'teacher'),
    ('sch', 'school'),
    ('std', 'student'),
    ('often speak <lang of test> at home',
     'how often student speak the language of the test at home'),
    ('amount of books in your home', 'amount of books at home'),
    ('home possess\\\\computer tablet', 'computer or tablet at home'),
    ('home possess\\\\study desk', 'study desk at home'),
    ('home possess\\\\own room', 'own room at home'),
    ('home possess\\\\internet connection', 'internet connection at home'),
    ('how often\\\\tired', 'how often student is tired'),
    ('how often\\\\hungry', 'how often student is hungry'),
    ('mat\\\\how often\\\\work on your own',
     'how often student works on his/her own in math'),
    ('sci\\\\how often conduct experiments',
     'how often conduct experiments in science'),
    ('mathematics', 'math'),
    ('1st plausible value', ''),
    ('1st pv', ''),
]

for pattern, replacement in label_rewrites:
    sch_stu_4_labels = sch_stu_4_labels.replace(pattern, replacement, regex=True)

In [32]:
# Map each variable code (column 'index') to its cleaned label, then rename
new_col_naming = {
    variable: label
    for variable, label in zip(sch_stu_4_labels['index'], sch_stu_4_labels['Label'])
}
df_4 = df_4.rename(columns=new_col_naming)

In [33]:
# Give the country id column the same name it has in df_4
country_ids = country_ids.rename({'country_ID': 'IDCNTRY'}, axis='columns')

In [34]:
# Add the country_ids columns. No `on` is given, so pandas joins on every
# column name the two frames share (inner join by default).
df_4_final = pd.merge(df_4, country_ids)

In [35]:
# Add the human development report columns. No `on` is given, so pandas joins
# on every column name the two frames share (inner join by default).
df_4_final = pd.merge(df_4_final, human_development_reports_df)

In [44]:
# Write the final table to grade_4_hdr.csv without the row index
df_4_final.to_csv(path_or_buf='grade_4_hdr.csv', index=False)

# Factor Loadings

In [37]:
def get_factor_loadings(df):
    """Return the first-component factor loadings of `df`'s columns.

    A 'PCA_values' column, if present, is left out. The remaining columns are
    standardized, a one-component PCA is fitted, and each loading is the
    component weight multiplied by the square root of the explained variance.

    Returns a DataFrame with the columns 'variable' and 'loading'.
    """
    feature_names = [name for name in df.columns if name != 'PCA_values']
    features = df[feature_names]

    scaled = StandardScaler().fit_transform(features)
    model = PCA(n_components=1).fit(scaled)

    scale = np.sqrt(model.explained_variance_)
    first_component_loadings = (model.components_ * scale)[0]

    return pd.DataFrame({
        'variable': feature_names,
        'loading': first_component_loadings,
    })

In [38]:
# Factor loadings for the math_interest group
f_loading = get_factor_loadings(df=math_interest)

In [39]:
# Turn the 'Variable' index back into a regular column
tempt_df = sch_con_4_columns.reset_index(drop=False)

In [40]:
# Lookup from variable code to codebook label
col_naming = {
    variable: label
    for variable, label in zip(tempt_df['Variable'], tempt_df['Label'])
}

In [41]:
# Replace variable codes with their codebook labels (codes missing from
# col_naming become NaN)
f_loading = f_loading.assign(variable=f_loading['variable'].map(col_naming))

In [42]:
# Show the factor-loading table as the cell output
f_loading

Unnamed: 0,variable,loading
0,MAT\AGREE\ENJOY LEARNING MATHEMATICS,0.857202
1,MAT\AGREE\WISH HAVE NOT TO STUDY MATH,0.598409
2,MAT\AGREE\MATH IS BORING,0.708597
3,MAT\AGREE\LEARN INTERESTING THINGS,0.720341
4,MAT\AGREE\LIKE MATHEMATICS,0.901646
5,MAT\AGREE\SCHOOLWORK INVOLVES NUMBERS,0.784083
6,MAT\AGREE\LIKE MATH PROBLEMS,0.808883
7,MAT\AGREE\LOOK FORWARD TO MATH LESSONS,0.855337
8,MAT\AGREE\MATH FAVORITE SUBJECT,0.849833


In [43]:
# f_loading.to_csv('f_loading_4_mat_int.csv', index=False)