In [113]:
# Libraries
import os
import pandas as pd
import re

In [114]:
# Get the file names in each folder

# Questionary A is for the undergrad students
file_names_undergrad = os.listdir('grad_and_undergrad/datasets/undergraduate')

# Questionary B is for the graduate students
file_names_grad = os.listdir('grad_and_undergrad/datasets/graduate')

# Questionary G is for the high school students
file_names_hs =  os.listdir('high_school/datasets')

# High School Students

**Notes**
- Files *F9117G_1920.csv*, *F9117G_2021.csv* and *F9117G_2122.csv* are pipe separated.
- The file *F9117G_2223.csv* is comma separated and needs latin encoding. 
- Files *F9117G_1920.csv* and *F9117G_2021.csv* have two additional rows: FECHA_ENTREGA and C_OBSERVACION (The last one, not reported as a column in 'Column names')
- In files *F9117G_1920.csv* and *F9117G_2021.csv*, some variables have no correspondence in the questionary. For instance: E3, E4, E19, E230, E258, E763, E766 and E769 (seems to be misspelled as there are no V3, V4, V19, V230, V258, V763, V766 and V769)
- In files  *F9117G_2122.csv* and *F9117G_2223.csv*, some variables have no correspondence in the questionary. For instance: E3, E4, E19, E763, E766, E769 (seems to be misspelled as there are no V3, V4, V19, V763, V766, V769)

In [115]:
# Initialize an empty list to store the dataframes
list_df_hs = []

# Iterate over the file names
for file in file_names_hs:
    # Read each file and add the year to a new column
    year = file.split("_")[1].split(".")[0]
    # The file for the year 22-23 is comma separated
    if year == '2223':
        df = pd.read_csv(os.path.join('high_school/datasets',file), sep=',', encoding='latin-1', low_memory=False)
        # Remove the last column 
        df = df.iloc[:, :-1]
    else:
        df = pd.read_csv(os.path.join('high_school/datasets', file), sep="|", low_memory=False)
    
    # Modify columns names for consitency
    for var in df.columns:
        if re.match(r'^E\d', var):
            new_name = re.sub(r'^E', 'V', var)
            df.rename(columns={var: new_name}, inplace=True)
            
    # Append the dataframe to the list
    list_df_hs.append(df)

In [116]:
# Check for variable consistency

# Create a dictionary to store the column names for each dataframe
df_names_hs = {}

for i in range(len(list_df_hs)):
    df_names_hs[file_names_hs[i]] = list(list_df_hs[i].columns)

# Create a list of unique variables 
variables_hs = set().union(*df_names_hs.values())

# Diffence between all variables and the variables in the dataframes
missing_vars_1920 = variables_hs.difference(df_names_hs['F9117G_1920.csv'])
missing_vars_2021 = variables_hs.difference(df_names_hs['F9117G_2021.csv'])
missing_vars_2122 = variables_hs.difference(df_names_hs['F9117G_2122.csv'])
missing_vars_2223 = variables_hs.difference(df_names_hs['F9117G_2223.csv'])

# Print the results
print(f"Missing variables in 2019-2020: {missing_vars_1920}")
print(f"Missing variables in 2020-2021: {missing_vars_2021}")
print(f"Missing variables in 2021-2022: {missing_vars_2122}")   
print(f"Missing variables in 2022-2023: {missing_vars_2223}")

Missing variables in 2019-2020: set()
Missing variables in 2020-2021: set()
Missing variables in 2021-2022: {'V391', 'V221', 'V195', 'V238', 'V965', 'V951', 'V955', 'V970', 'V930', 'V923', 'V230', 'V186', 'V202', 'V275', 'V335', 'V113', 'V243', 'V184', 'V189', 'V952', 'V200', 'V217', 'V242', 'V269', 'V347', 'V926', 'V934', 'SUBSISTEMA_3', 'V244', 'V259', 'V299', 'V199', 'V957', 'V398', 'V188', 'V941', 'V216', 'V927', 'V973', 'V949', 'V931', 'V953', 'V938', 'V918', 'V107', 'V946', 'V922', 'V249', 'V38', 'V913', 'V208', 'V223', 'V732', 'V341', 'V215', 'V384', 'V211', 'V252', 'V89', 'V192', 'V950', 'V185', 'V231', 'V179', 'FECHA_ENTREGA', 'V119', 'V947', 'SUBSISTEMA_1', 'V233', 'V220', 'V245', 'V948', 'V24', 'V921', 'V932', 'V323', 'V696', 'V205', 'V210', 'V222', 'V143', 'V226', 'V377', 'V257', 'V101', 'V224', 'V209', 'V971', 'V201', 'V969', 'V237', 'V219', 'V293', 'V77', 'V232', 'V305', 'V229', 'V193', 'V258', 'V972', 'V218', 'V31', 'V750', 'V194', 'V370', 'V204', 'V708', 'V256', 'V940',

**Addtional Notes:** 
- The variables *C_OBSERVACION* and *FECHA_ENTREGA* is only in files 'KI9119A_1920.csv' and 'KI9119A_2021.csv'.
- In file 'F9117G_2122.csv' some variables are missing. For example: *V24*, *V32*, *V38*, etc.
- The file 'F9117G_2223.csv' have an extra unamed empty column (the last one). 

# Undergraduate Students

In [117]:
# Initialize an empty list to store the dataframes
list_df_undergrad = []

# Iterate over the file names
for file in file_names_undergrad:
    # Read each file and add the year to a new column
    year = file.split("_")[1].split(".")[0]
    # The file for the year 22-23 is comma separated
    if year == '2223':
        df = pd.read_csv(os.path.join('grad_and_undergrad/datasets/undergraduate', file), sep=',', encoding='latin-1', low_memory=False)
    else:
        df = pd.read_csv(os.path.join('grad_and_undergrad/datasets/undergraduate', file), sep="|", low_memory=False)
            
    # Append the dataframe to the list
    list_df_undergrad.append(df)


In [120]:
# Create a dictionary to store the column names for each dataframe
df_names_undergrad = {}

for i in range(len(list_df_undergrad)):
    df_names_undergrad[file_names_undergrad[i]] = list(list_df_undergrad[i].columns)

# Create a list of unique variables 
variables_undergrad = set().union(*df_names_undergrad.values())

# Diffence between all variables and the variables in the dataframes
missing_vars_1920 = variables_undergrad.difference(df_names_undergrad['KI9119A_1920.csv'])
missing_vars_2021 = variables_undergrad.difference(df_names_undergrad['KI9119A_2021.csv'])
missing_vars_2122 = variables_undergrad.difference(df_names_undergrad['KI9119A_2122.csv'])
missing_vars_2223 = variables_undergrad.difference(df_names_undergrad['KI9119A_2223.csv'])

# Print the missing variables
print(f"Missing variables in 2019-2020: {missing_vars_1920}")
print(f"Missing variables in 2020-2021: {missing_vars_2021}")
print(f"Missing variables in 2021-2022: {missing_vars_2122}")   
print(f"Missing variables in 2022-2023: {missing_vars_2223}")


Missing variables in 2019-2020: {'V734', 'C_OBSERVACION'}
Missing variables in 2020-2021: {'F593', 'V734'}
Missing variables in 2021-2022: {'V321', 'V166', 'V330', 'V91', 'V455', 'V172', 'V336', 'V341', 'V332', 'V322', 'V86', 'V335', 'V327', 'V456', 'FECHA_ENTREGA', 'V312', 'V337', 'V319', 'V331', 'V325', 'V333', 'SUBSISTEMA_1', 'V457', 'V317', 'V334', 'V324', 'C_OBSERVACION', 'V315', 'V328', 'V311', 'SUBSISTEMA_3', 'V24', 'V316', 'V323', 'V318', 'V320', 'V326', 'F593', 'V160', 'SUBSISTEMA_2', 'V329', 'V734', 'V454', 'V154', 'V29', 'V178', 'V142', 'V314', 'V148', 'V313'}
Missing variables in 2022-2023: {'V456', 'V454', 'FECHA_ENTREGA', 'V455', 'V2', 'V457', 'F593', 'C_OBSERVACION'}


**Additional Notes**
- The variable *F593* is only in file 'KI9119A_1920.csv'.
- The variable *C_OBSERVACION* is only in file 'KI9119A_2021.csv'.
- The variable *FECHA_ENTREGA* is only in files 'KI9119A_1920.csv' and 'KI9119A_2021.csv'.
- The file 'KI9119A_2122.csv' have multiple missing variables. For instance: *SUBSISTEMA_1*, *SUBSISTEMA_1*, *SUBSISTEMA_1*, etc.
- The variable *V734* is only in file 'KI9119A_2223.csv'.
- The file 'KI9119A_2223.csv' have multiple missing variables. For instance: *V2*, *V454*, *V455*, etc.

In [119]:
file_names_undergrad

['KI9119A_2122.csv',
 'KI9119A_2021.csv',
 'KI9119A_2223.csv',
 'KI9119A_1920.csv']

Graduate Students