# Data Challenge 2024 - EY

## Data Cleaning

In [1]:
# Libraries
import os
import pandas as pd
import re
import numpy as np

In [2]:
# Get the file names in each folder

# Questionary A is for the undergrad students
file_names_undergrad = os.listdir('grad_and_undergrad/datasets/undergraduate')

# Questionary B is for the graduate students
file_names_grad = os.listdir('grad_and_undergrad/datasets/graduate')

# Questionary G is for the high school students
file_names_hs =  os.listdir('high_school/datasets')

### High School Students

**Notes:**
- Files 'F9117G_1920.csv', 'F9117G_2021.csv' and 'F9117G_2122.csv' are pipe separated.
- The file 'F9117G_2223.csv' is comma separated and needs latin encoding. 
- In files 'F9117G_1920.csv' and 'F9117G_2021.csv', some variables have no correspondence in the questionary. For instance: E3, E4, E19, E230, E258, E763, E766 and E769 (seems to be misspelled as there are no V3, V4, V19, V230, V258, V763, V766 and V769)
- In files  'F9117G_2122.csv' and 'F9117G_2223.csv', some variables have no correspondence in the questionary. For instance: E3, E4, E19, E763, E766, E769 (seems to be misspelled as there are no V3, V4, V19, V763, V766, V769)

In [3]:
# Initialize an empty list to store the dataframes
list_df_hs = []

# Iterate over the file names
for file in file_names_hs:
    # Read each file and add the year to a new column
    year = file.split("_")[1].split(".")[0]
    # The file for the year 22-23 is comma separated
    if year == '2223':
        df = pd.read_csv(os.path.join('high_school/datasets',file), sep=',', encoding='latin-1', low_memory=False)
        # Remove the last column 
        df = df.iloc[:, :-1]
    else:
        df = pd.read_csv(os.path.join('high_school/datasets', file), sep="|", low_memory=False)
    
    # Modify columns names for consitency
    for var in df.columns:
        if re.match(r'^E\d', var):
            new_name = re.sub(r'^E', 'V', var)
            df.rename(columns={var: new_name}, inplace=True)
    
    # New column for the academic year
    df['ACADEMIC_YEAR'] = '20'+ year[:2] + '-' '20' + year[2:]
            
    # Append the dataframe to the list
    list_df_hs.append(df)

In [4]:
# Check for variable consistency

# Create a dictionary to store the column names for each dataframe
df_names_hs = {}

for i in range(len(list_df_hs)):
    df_names_hs[file_names_hs[i]] = list(list_df_hs[i].columns)

# Create a list of unique variables 
variables_hs = set().union(*df_names_hs.values())

# Diffence between all variables and the variables in the dataframes
missing_vars_1920 = variables_hs.difference(df_names_hs['F9117G_1920.csv'])
missing_vars_2021 = variables_hs.difference(df_names_hs['F9117G_2021.csv'])
missing_vars_2122 = variables_hs.difference(df_names_hs['F9117G_2122.csv'])
missing_vars_2223 = variables_hs.difference(df_names_hs['F9117G_2223.csv'])

# Print the results
print(f"Missing variables in 2019-2020: {missing_vars_1920}")
print(f"Missing variables in 2020-2021: {missing_vars_2021}")
print(f"Missing variables in 2021-2022: {missing_vars_2122}")   
print(f"Missing variables in 2022-2023: {missing_vars_2223}")

Missing variables in 2019-2020: set()
Missing variables in 2020-2021: set()
Missing variables in 2021-2022: {'V45', 'V258', 'V200', 'V971', 'V189', 'V341', 'V256', 'V95', 'V251', 'V953', 'SUBSISTEMA_2', 'V193', 'V173', 'V260', 'V937', 'V65', 'V945', 'V213', 'V235', 'V927', 'V936', 'V955', 'V214', 'V726', 'V247', 'V942', 'V250', 'V973', 'SUBSISTEMA_1', 'V398', 'V119', 'V38', 'V186', 'V202', 'V916', 'V957', 'V201', 'V237', 'V956', 'V71', 'V231', 'V253', 'V31', 'V384', 'V944', 'V914', 'V972', 'V107', 'V249', 'V954', 'V708', 'V207', 'V195', 'V211', 'V244', 'V919', 'V940', 'V377', 'V921', 'V215', 'V225', 'V192', 'V83', 'V370', 'V918', 'V939', 'V929', 'V238', 'V275', 'V234', 'V222', 'V182', 'V329', 'V933', 'V143', 'V212', 'V203', 'V248', 'V204', 'V191', 'V59', 'V242', 'V934', 'V205', 'V317', 'V931', 'V293', 'V221', 'V184', 'V224', 'V952', 'V241', 'V926', 'V941', 'V190', 'V966', 'V970', 'V228', 'V194', 'V218', 'FECHA_ENTREGA', 'V230', 'V209', 'V738', 'V167', 'V198', 'V89', 'V208', 'V246', 'V9

**Addtional Notes:** 
- Files *F9117G_1920.csv* and *F9117G_2021.csv* have two additional rows: FECHA_ENTREGA and C_OBSERVACION (The last one, not reported as a column in 'Column names')
- In file 'F9117G_2122.csv' some variables are missing. For example: *V24*, *V32*, *V38*, etc.
- The file 'F9117G_2223.csv' have an extra unamed empty column (the last one). 

In [5]:
# Complete dataset
high_school = pd.concat(list_df_hs)

### Undergraduate Students

**Notes:**
- Files 'KI9119A_1920.csv', 'KI9119A_2021.csv' and 'KI9119A_2122.csv' are pipe separated.
- The file 'KI9119A_2223.csv' is comma separated and needs latin encoding. 

In [6]:
# Initialize an empty list to store the dataframes
list_df_undergrad = []

# Iterate over the file names
for file in file_names_undergrad:
    # Read each file and add the year to a new column
    year = file.split("_")[1].split(".")[0]
    # The file for the year 22-23 is comma separated
    if year == '2223':
        df = pd.read_csv(os.path.join('grad_and_undergrad/datasets/undergraduate', file), sep=',', encoding='latin-1', low_memory=False)
    else:
        df = pd.read_csv(os.path.join('grad_and_undergrad/datasets/undergraduate', file), sep="|", low_memory=False)

    # New column for the academic year
    df['ACADEMIC_YEAR'] = '20'+ year[:2] + '-' '20' + year[2:]

    # Append the dataframe to the list
    list_df_undergrad.append(df)


In [7]:
# Create a dictionary to store the column names for each dataframe
df_names_undergrad = {}

for i in range(len(list_df_undergrad)):
    df_names_undergrad[file_names_undergrad[i]] = list(list_df_undergrad[i].columns)

# Create a list of unique variables 
variables_undergrad = set().union(*df_names_undergrad.values())

# Diffence between all variables and the variables in the dataframes
missing_vars_1920 = variables_undergrad.difference(df_names_undergrad['KI9119A_1920.csv'])
missing_vars_2021 = variables_undergrad.difference(df_names_undergrad['KI9119A_2021.csv'])
missing_vars_2122 = variables_undergrad.difference(df_names_undergrad['KI9119A_2122.csv'])
missing_vars_2223 = variables_undergrad.difference(df_names_undergrad['KI9119A_2223.csv'])

# Print the missing variables
print(f"Missing variables in 2019-2020: {missing_vars_1920}")
print(f"Missing variables in 2020-2021: {missing_vars_2021}")
print(f"Missing variables in 2021-2022: {missing_vars_2122}")   
print(f"Missing variables in 2022-2023: {missing_vars_2223}")


Missing variables in 2019-2020: {'C_OBSERVACION', 'V734'}
Missing variables in 2020-2021: {'F593', 'V734'}
Missing variables in 2021-2022: {'V91', 'V320', 'V328', 'V455', 'V341', 'V315', 'V337', 'FECHA_ENTREGA', 'V313', 'V321', 'V318', 'SUBSISTEMA_2', 'V166', 'V148', 'V312', 'V322', 'V336', 'V331', 'V454', 'F593', 'SUBSISTEMA_3', 'C_OBSERVACION', 'V178', 'V335', 'V324', 'V86', 'V333', 'SUBSISTEMA_1', 'V172', 'V329', 'V457', 'V154', 'V330', 'V24', 'V334', 'V332', 'V142', 'V29', 'V323', 'V326', 'V314', 'V317', 'V325', 'V456', 'V311', 'V734', 'V316', 'V319', 'V160', 'V327'}
Missing variables in 2022-2023: {'V457', 'V456', 'V454', 'F593', 'V455', 'C_OBSERVACION', 'V2', 'FECHA_ENTREGA'}


**Additional Notes**
- The variable *F593* is only in file 'KI9119A_1920.csv'.
- The variable *C_OBSERVACION* is only in file 'KI9119A_2021.csv'.
- The variable *FECHA_ENTREGA* is only in files 'KI9119A_1920.csv' and 'KI9119A_2021.csv'.
- The file 'KI9119A_2122.csv' have multiple missing variables. For instance: *SUBSISTEMA_1*, *SUBSISTEMA_1*, *SUBSISTEMA_1*, etc.
- The variable *V734* is only in file 'KI9119A_2223.csv'.
- The file 'KI9119A_2223.csv' have multiple missing variables. For instance: *V2*, *V454*, *V455*, etc.

In [8]:
# Complete dataset
undergraduate = pd.concat(list_df_undergrad)

### Graduate Students

**Notes:**
- Files 'KI9119B_1920.csv', 'KI9119B_2021.csv' and 'KI9119B_2122.csv' are pipe separated.
- The file 'KI9119B_2223.csv' is comma separated and needs latin encoding. 

In [9]:
# Initialize an empty list to store the dataframes
list_df_grad = []

# Iterate over the file names
for file in file_names_grad:
    # Read each file and add the year to a new column
    year = file.split("_")[1].split(".")[0]
    # The file for the year 22-23 is comma separated
    if year == '2223':
        df = pd.read_csv(os.path.join('grad_and_undergrad/datasets/graduate', file), sep=',', encoding='latin-1', low_memory=False)
    else:
        df = pd.read_csv(os.path.join('grad_and_undergrad/datasets/graduate', file), sep="|", low_memory=False)
   
    # New column for the academic year
    df['ACADEMIC_YEAR'] = '20'+ year[:2] + '-' '20' + year[2:]
    
    # Append the dataframe to the list
    list_df_grad.append(df)

In [10]:
# Create a dictionary to store the column names for each dataframe
df_names_grad = {}

for i in range(len(list_df_grad)):
    df_names_grad[file_names_grad[i]] = list(list_df_grad[i].columns)

# Create a list of unique variables 
variables_grad = set().union(*df_names_grad.values())

# Difference between all variables and the variables in the dataframes
missing_vars_1920 = variables_grad.difference(df_names_grad['KI9119B_1920.csv'])
missing_vars_2021 = variables_grad.difference(df_names_grad['KI9119B_2021.csv'])
missing_vars_2122 = variables_grad.difference(df_names_grad['KI9119B_2122.csv'])
missing_vars_2223 = variables_grad.difference(df_names_grad['KI9119B_2223.csv'])

# Print the missing variables
print(f"Missing variables in 2019-2020: {missing_vars_1920}")
print(f"Missing variables in 2020-2021: {missing_vars_2021}")
print(f"Missing variables in 2021-2022: {missing_vars_2122}")   
print(f"Missing variables in 2022-2023: {missing_vars_2223}")


Missing variables in 2019-2020: {'C_OBSERVACION', 'V276'}
Missing variables in 2020-2021: {'F322', 'V276'}
Missing variables in 2021-2022: {'V165', 'V50', 'V157', 'V159', 'V207', 'V162', 'V164', 'V276', 'V169', 'V161', 'FECHA_ENTREGA', 'V167', 'V28', 'V176', 'SUBSISTEMA_2', 'V125', 'V166', 'V173', 'V158', 'V168', 'V197', 'V270', 'C_OBSERVACION', 'SUBSISTEMA_3', 'V179', 'V180', 'V178', 'V192', 'F322', 'V183', 'V170', 'V187', 'SUBSISTEMA_1', 'V172', 'V42', 'V222', 'V182', 'V119', 'V33', 'V202', 'V232', 'V177', 'V171', 'V268', 'V175', 'V113', 'V237', 'V143', 'V212', 'V174', 'V181', 'V271', 'V242', 'V257', 'V217', 'V163', 'V131', 'V269', 'V137', 'V227', 'V160'}
Missing variables in 2022-2023: {'V268', 'V271', 'F322', 'FECHA_ENTREGA', 'V269', 'V270', 'C_OBSERVACION'}


**Aditional Notes**
- The variable *F322* is only in file 'KI9119B_1920.csv'.
- The variable *C_OBSERVACION* is only in file 'KI9119B_2021.csv'.
- The variable *FECHA_ENTREGA* is only in files 'KI9119B_1920.csv' and 'KI9119B_2021.csv'.
- The file 'KI9119B_2122.csv' have multiple missing variables. For instance: *SUBSISTEMA_1*, *SUBSISTEMA_1*, *SUBSISTEMA_1*, etc.
- The variable *V276* is only in file 'KI9119B_2223.csv'.
- The file 'KI9119B_2223.csv' have multiple missing variables. For instance: *V269*, *V270*, *V271*, etc.

In [11]:
# Complete dataset
graduate = pd.concat(list_df_grad)

## Exploratory Analysis

**Variable selection**

Institution ID: '14MMS0519C' for high school and '14MSU0044P' for undergraduate and graduate studies.

This selection was made based on the requirements of question 1.
 - High School: 'ACADEMIC_YEAR', 'CCT_INS_PLA', 'NOMBRE_INS_PLA', 'C_MODALIDAD', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V49', 'V50', 'V51', 'V265', 'V268', 'V958', 'V959', 'V960', 'V968', 'V969', 'V397', 'V878', 'V879'.
 - Undergraduate: 'ACADEMIC_YEAR', 'CCT_INS_PLA', 'NOMBRE_INS_PLA', 'NOMBRECT', 'C_NOM_CARRERA','C_MODALIDAD', 'V1', 'V2', 'V3', 'V4', 'V11', 'V12', 'V13', 'V14', 'V15', 'V60', 'V61', 'V82', 'V85', 'V175', 'V176', 'V177', 'V454', 'V455', 'V456', 'V458', 'V495'
 - Graduate: 'ACADEMIC_YEAR', 'CCT_INS_PLA', 'NOMBRE_INS_PLA', 'NOMBRECT', 'C_NOM_CARRERA', 'C_MODALIDAD', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V38', 'V41', 'V140','V141', 'V142', 'V156', 'V268', 'V269', 'V270', 'V271', 'V274'


In [12]:
# Filtered datasets

var_hs_profile = ['ACADEMIC_YEAR', 'CCT_INS_PLA', 'NOMBRE_INS_PLA', 'C_MODALIDAD', 'V13', 'V14', 'V15', 'V16', 
                  'V17', 'V18', 'V19', 'V49', 'V50', 'V51', 'V265', 'V268', 'V958', 'V959', 
                  'V960', 'V968', 'V969', 'V397', 'V878', 'V879']

var_undergrad_profile = ['ACADEMIC_YEAR', 'CCT_INS_PLA', 'NOMBRE_INS_PLA', 'NOMBRECT', 'C_NOM_CARRERA', 
                         'C_MODALIDAD', 'V1', 'V2', 'V3', 'V4', 'V11', 'V12', 'V13', 'V14', 'V15', 'V60',  
                         'V61', 'V82', 'V85', 'V175', 'V176', 'V177', 'V454', 'V455', 'V456', 'V458', 'V495']

var_grad_profile = ['ACADEMIC_YEAR', 'CCT_INS_PLA', 'NOMBRE_INS_PLA', 'NOMBRECT', 'C_NOM_CARRERA', 
                    'C_MODALIDAD', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V38', 'V41', 'V140',  
                    'V141', 'V142', 'V156', 'V268', 'V269', 'V270', 'V271', 'V274']


# Filter and select variables for High School dataset
high_school_profile = high_school[high_school['CCT_INS_PLA'] == '14MMS0519C'][var_hs_profile]

# Filter and select variables for Undergraduate dataset
undergraduate_profile = undergraduate[undergraduate['CCT_INS_PLA'] == '14MSU0044P'][var_undergrad_profile]

# Filter and select variables for Graduate dataset
graduate_profile = graduate[graduate['CCT_INS_PLA'] == '14MSU0044P'][var_grad_profile]


### Number of enrolled students

#### High School

In [13]:
# High School: Here we have serveral options, V51, V397 and V878 + V879. 
high_school_profile[['ACADEMIC_YEAR', 'C_MODALIDAD', 'V51', 'V397', 'V878', 'V879']]


Unnamed: 0,ACADEMIC_YEAR,C_MODALIDAD,V51,V397,V878,V879
7410,2021-2022,ESCOLARIZADA,0.0,192.0,90.0,102.0
14858,2019-2020,MIXTA,69.0,69.0,33.0,36.0
6671,2022-2023,ESCOLARIZADA,177.0,394.0,185.0,209.0
6844,2020-2021,MIXTA,0.0,0.0,0.0,0.0


In [14]:
# V51 differs form the other options and is incomplete. I will keep V397 as is the same as V878 + V879.
n_hs_students = high_school_profile[['ACADEMIC_YEAR', 'C_MODALIDAD', 'V397']].groupby(['ACADEMIC_YEAR', 'C_MODALIDAD']).agg({'V397': 'sum'}).reset_index()
n_hs_students.rename(columns={'V397': 'N_STUDENTS'}, inplace=True)
n_hs_students

Unnamed: 0,ACADEMIC_YEAR,C_MODALIDAD,N_STUDENTS
0,2019-2020,MIXTA,69.0
1,2020-2021,MIXTA,0.0
2,2021-2022,ESCOLARIZADA,192.0
3,2022-2023,ESCOLARIZADA,394.0


#### Undergraduate

In [15]:
# Undergraduate: Here we have V177 or V495. 
undergraduate_profile[['ACADEMIC_YEAR', 'C_MODALIDAD', 'V177', 'V495']]

Unnamed: 0,ACADEMIC_YEAR,C_MODALIDAD,V177,V495
10809,2021-2022,MIXTA,97,97
10810,2021-2022,ESCOLARIZADA,362,362
10811,2021-2022,ESCOLARIZADA,2,2
10812,2021-2022,ESCOLARIZADA,191,191
10813,2021-2022,ESCOLARIZADA,108,108
...,...,...,...,...
25258,2019-2020,ESCOLARIZADA,550,550
25259,2019-2020,ESCOLARIZADA,145,145
25260,2019-2020,ESCOLARIZADA,949,949
25261,2019-2020,ESCOLARIZADA,368,368


In [16]:
# I will keep V177 as both have same information.
n_undergrad_students = undergraduate_profile[['ACADEMIC_YEAR', 'C_MODALIDAD', 'V177']].groupby(['ACADEMIC_YEAR', 'C_MODALIDAD']).agg({'V177': 'sum'}).reset_index()
n_undergrad_students.rename(columns={'V177': 'N_STUDENTS'}, inplace=True)
n_undergrad_students


Unnamed: 0,ACADEMIC_YEAR,C_MODALIDAD,N_STUDENTS
0,2019-2020,ESCOLARIZADA,8440
1,2019-2020,MIXTA,1622
2,2020-2021,ESCOLARIZADA,8421
3,2020-2021,MIXTA,1478
4,2021-2022,ESCOLARIZADA,8312
5,2021-2022,MIXTA,1479
6,2022-2023,ESCOLARIZADA,8646
7,2022-2023,MIXTA,1474


#### Graduates

In [17]:
# Graduate: Here we have V142 or V156. 
graduate_profile[['ACADEMIC_YEAR', 'C_MODALIDAD', 'V142', 'V156']]

Unnamed: 0,ACADEMIC_YEAR,C_MODALIDAD,V142,V156
7487,2021-2022,MIXTA,2,2
7488,2021-2022,MIXTA,0,0
7489,2021-2022,MIXTA,14,14
7490,2021-2022,ESCOLARIZADA,39,39
7491,2021-2022,ESCOLARIZADA,37,37
...,...,...,...,...
12153,2022-2023,ESCOLARIZADA,14,14
12154,2022-2023,ESCOLARIZADA,15,15
12155,2022-2023,ESCOLARIZADA,15,15
14372,2022-2023,ESCOLARIZADA,7,7


In [18]:
# I will keep V142 as both have the same information.
n_grad_students = graduate_profile[['ACADEMIC_YEAR', 'C_MODALIDAD', 'V142']].groupby(['ACADEMIC_YEAR', 'C_MODALIDAD']).agg({'V142': 'sum'}).reset_index()
n_grad_students.rename(columns={'V142': 'N_STUDENTS'}, inplace=True)
n_grad_students


Unnamed: 0,ACADEMIC_YEAR,C_MODALIDAD,N_STUDENTS
0,2019-2020,ESCOLARIZADA,773
1,2019-2020,MIXTA,93
2,2019-2020,NO ESCOLARIZADA,0
3,2020-2021,ESCOLARIZADA,748
4,2020-2021,MIXTA,65
5,2020-2021,NO ESCOLARIZADA,0
6,2021-2022,ESCOLARIZADA,727
7,2021-2022,MIXTA,27
8,2022-2023,ESCOLARIZADA,686
9,2022-2023,MIXTA,18


### Most significant degrees or faculties

#### Undergraduate

In [19]:
# Number of students by faculty
ugrad_nstud_by_depto = undergraduate_profile.groupby(['ACADEMIC_YEAR', 'NOMBRECT']).agg({'V177': 'sum'}).reset_index()
ugrad_nstud_by_depto['NOMBRECT'] = ugrad_nstud_by_depto['NOMBRECT'].str.replace('DEPARTAMENTO DE ', '').str.replace('DEPTO DE ', '')
ugrad_nstud_by_depto.rename(columns={'NOMBRECT': 'DEPTO', 'V177': 'N_STUDENTS'}, inplace=True)
ugrad_nstud_by_depto

Unnamed: 0,ACADEMIC_YEAR,DEPTO,N_STUDENTS
0,2019-2020,ECONOMIA ADMINISTRACION Y MERCADOLOGIA,2743
1,2019-2020,ESTUDIOS SOCIOPOLITICOS Y JURIDICOS,782
2,2019-2020,MATEMATICAS Y FISICA,522
3,2019-2020,PROCESOS TECNOLOGICOS E INDUSTRIALES,1382
4,2019-2020,PSICOLOGIA EDUCACION Y SALUD,747
5,2019-2020,ELECTRONICA SISTEMAS E INFORMATICA,733
6,2019-2020,ESTUDIOS SOCIO CULTURALES,1059
7,2019-2020,FILOSOFIA Y HUMANIDADES,66
8,2019-2020,HABITAT Y DESARROLLO URBANO,2028
9,2020-2021,ECONOMIA ADMINISTRACION Y MERCADOLOGIA,2618


In [20]:
# Number of students by degree
ugrad_nstud_by_degree = undergraduate_profile.groupby(['ACADEMIC_YEAR', 'C_NOM_CARRERA']).agg({'V177': 'sum'}).reset_index()
ugrad_nstud_by_degree.rename(columns={'V177': 'N_STUDENTS'}, inplace=True)
ugrad_nstud_by_degree

Unnamed: 0,ACADEMIC_YEAR,C_NOM_CARRERA,N_STUDENTS
0,2019-2020,ARQUITECTO,949
1,2019-2020,INGENIERIA EN SEGURIDAD INFORMATICA Y REDES,90
2,2019-2020,INGENIERÍA AMBIENTAL,146
3,2019-2020,INGENIERÍA CIVIL,368
4,2019-2020,INGENIERÍA EN ALIMENTOS,76
...,...,...,...
166,2022-2023,LICENCIATURA EN PSICOLOGÍA,550
167,2022-2023,LICENCIATURA EN PUBLICIDAD Y COMUNICACIÓN ESTR...,186
168,2022-2023,LICENCIATURA EN RECURSOS HUMANOS Y TALENTO ORG...,44
169,2022-2023,LICENCIATURA EN RELACIONES INDUSTRIALES,40


#### Graduate

In [21]:
# Number of students by faculty
grad_nstud_by_depto = graduate_profile.groupby(['ACADEMIC_YEAR', 'NOMBRECT']).agg({'V142': 'sum'}).reset_index()
grad_nstud_by_depto['NOMBRECT'] = grad_nstud_by_depto['NOMBRECT'].str.replace('DEPARTAMENTO DE ', '').str.replace('DEPTO DE ', '')
grad_nstud_by_depto.rename(columns={'NOMBRECT': 'DEPTO', 'V142': 'N_STUDENTS'}, inplace=True)
grad_nstud_by_depto

Unnamed: 0,ACADEMIC_YEAR,DEPTO,N_STUDENTS
0,2019-2020,ECONOMIA ADMINISTRACION Y MERCADOLOGIA,240
1,2019-2020,ESTUDIOS SOCIOPOLITICOS Y JURIDICOS,90
2,2019-2020,MATEMATICAS Y FISICA,20
3,2019-2020,PROCESOS TECNOLOGICOS E INDUSTRIALES,53
4,2019-2020,PSICOLOGIA EDUCACION Y SALUD,177
5,2019-2020,ELECTRONICA SISTEMAS E INFORMATICA,197
6,2019-2020,ESTUDIOS SOCIO CULTURALES,16
7,2019-2020,FILOSOFIA Y HUMANIDADES,34
8,2019-2020,HABITAT Y DESARROLLO URBANO,39
9,2020-2021,CENTRO DE LENGUAS,0


In [22]:
# Number of students by degree
grad_nstud_by_degree = graduate_profile.groupby(['ACADEMIC_YEAR', 'C_NOM_CARRERA']).agg({'V142': 'sum'}).reset_index()
grad_nstud_by_degree.rename(columns={'V142': 'N_STUDENTS'}, inplace=True)
grad_nstud_by_degree

Unnamed: 0,ACADEMIC_YEAR,C_NOM_CARRERA,N_STUDENTS
0,2019-2020,DOCTORADO EN CIENCIAS DE LA INGENIERÍA,23
1,2019-2020,DOCTORADO EN ESTUDIOS CIENTÍFICOS - SOCIALES,14
2,2019-2020,DOCTORADO EN INVESTIGACIÓN PSICOLÓGICA,10
3,2019-2020,DOCTORADO INTERINSTITUCIONAL EN EDUCACIÓN,13
4,2019-2020,DOCTORADO INTERNACIONAL EN BIENESTAR SOCIAL,0
...,...,...,...
121,2022-2023,MAESTRÍA EN POLÍTICA Y ANALÍTICA PÚBLICAS,9
122,2022-2023,MAESTRÍA EN POLÍTICA Y GESTIÓN PÚBLICA,2
123,2022-2023,MAESTRÍA EN PROYECTOS Y EDIFICACIÓN SUSTENTABLE,15
124,2022-2023,MAESTRÍA EN PSICOTERAPIA,64


### Cost of attendance (tuition + enrollment fees)

#### High School

Note: 
- *V968* and *V969* not available for 2021-2022

In [23]:
# Tuiton fees
high_school_profile[['ACADEMIC_YEAR', 'V968', 'V969']]

Unnamed: 0,ACADEMIC_YEAR,V968,V969
7410,2021-2022,,
14858,2019-2020,1000.0,9000.0
6671,2022-2023,0.0,0.0
6844,2020-2021,0.0,0.0


#### Undergraduate

Note:
- *V455* and *V456* not available for 2021-2022 and 2022-2023.

In [42]:
# Filter the years 2019-2020 and 2020-2021
undergrad_tuition = undergraduate_profile[~undergraduate_profile['ACADEMIC_YEAR'].isin(['2021-2022', '2022-2023'])][['ACADEMIC_YEAR', 'NOMBRECT', 'V455', 'V456']]
# Tuition fees = Enrollment + Tuition
undergrad_tuition['TUITION_FEES'] = undergrad_tuition['V455'] + undergrad_tuition['V456']
# Remove the rows with 0 tuition fees
undergrad_tuition = undergrad_tuition[undergrad_tuition['TUITION_FEES'] > 0]
# Group by academic year and faculty
undergrad_tuition = undergrad_tuition.groupby(['ACADEMIC_YEAR', 'NOMBRECT']).agg(AVG_TUITION=('TUITION_FEES', 'mean')).reset_index()
# Rename the columns
undergrad_tuition['NOMBRECT'] = undergrad_tuition['NOMBRECT'].str.replace('DEPARTAMENTO DE ', '').str.replace('DEPTO DE ', '')
undergrad_tuition.rename(columns={'NOMBRECT': 'DEPTO'}, inplace=True)
undergrad_tuition

Unnamed: 0,ACADEMIC_YEAR,DEPTO,AVG_TUITION
0,2019-2020,ECONOMIA ADMINISTRACION Y MERCADOLOGIA,181622.714286
1,2019-2020,ESTUDIOS SOCIOPOLITICOS Y JURIDICOS,192373.75
2,2019-2020,MATEMATICAS Y FISICA,191477.0
3,2019-2020,PROCESOS TECNOLOGICOS E INDUSTRIALES,191626.5
4,2019-2020,PSICOLOGIA EDUCACION Y SALUD,176248.666667
5,2019-2020,ELECTRONICA SISTEMAS E INFORMATICA,192507.8
6,2019-2020,ESTUDIOS SOCIO CULTURALES,179868.2
7,2019-2020,FILOSOFIA Y HUMANIDADES,159205.0
8,2019-2020,HABITAT Y DESARROLLO URBANO,188070.4
9,2020-2021,ECONOMIA ADMINISTRACION Y MERCADOLOGIA,186501.75


#### Graduate

Note:
- *V269* and *V270* not available for 2021-2022 and 2022-2023.

In [44]:
# Filter the years 2019-2020 and 2020-2021
grad_tuition = graduate_profile[~graduate_profile['ACADEMIC_YEAR'].isin(['2021-2022', '2022-2023'])][['ACADEMIC_YEAR', 'NOMBRECT', 'V269', 'V270']]
# Tuition fees = Enrollment + Tuition
grad_tuition['TUITION_FEES'] = grad_tuition['V269'] + grad_tuition['V270']
# Remove the rows with 0 tuition fees
grad_tuition = grad_tuition[grad_tuition['TUITION_FEES'] > 0]
# Group by academic year and faculty
grad_tuition = grad_tuition.groupby(['ACADEMIC_YEAR', 'NOMBRECT']).agg(AVG_TUITION=('TUITION_FEES', 'mean')).reset_index()
# Rename the columns
grad_tuition['NOMBRECT'] = grad_tuition['NOMBRECT'].str.replace('DEPARTAMENTO DE ', '').str.replace('DEPTO DE ', '')
grad_tuition.rename(columns={'NOMBRECT': 'DEPTO'}, inplace=True)
grad_tuition


Unnamed: 0,ACADEMIC_YEAR,DEPTO,AVG_TUITION
0,2019-2020,ECONOMIA ADMINISTRACION Y MERCADOLOGIA,135218.0
1,2019-2020,ESTUDIOS SOCIOPOLITICOS Y JURIDICOS,166891.0
2,2019-2020,MATEMATICAS Y FISICA,145476.0
3,2019-2020,PROCESOS TECNOLOGICOS E INDUSTRIALES,142678.5
4,2019-2020,PSICOLOGIA EDUCACION Y SALUD,144184.833333
5,2019-2020,ELECTRONICA SISTEMAS E INFORMATICA,123650.285714
6,2019-2020,ESTUDIOS SOCIO CULTURALES,162262.0
7,2019-2020,FILOSOFIA Y HUMANIDADES,128839.0
8,2019-2020,HABITAT Y DESARROLLO URBANO,131488.0
9,2020-2021,ECONOMIA ADMINISTRACION Y MERCADOLOGIA,144493.666667


### Anual growth rate

The historical annual growth from income derived from tuition fees is not calculable for the entire period due to insufficient information, particularly during the academic years 2021-2022 and 2022-2023. Therefore, an alternative approach is adopted where the historical annual growth is computed from the number of students.

CAGR = ((ending value/begining value)**(1/t))-1

In [49]:
# High School
hs_year_stu = n_hs_students.groupby('ACADEMIC_YEAR').agg({'N_STUDENTS': 'sum'}).reset_index()
hs_cagr = ((hs_year_stu['N_STUDENTS'][3]/hs_year_stu['N_STUDENTS'][0])**(1/4))-1
print(f"The CAGR for the high school students is {hs_cagr:.2%}")

The CAGR for the high school students is 54.58%


In [50]:
# Undergraduate
ugrad_year_stu = n_undergrad_students.groupby('ACADEMIC_YEAR').agg({'N_STUDENTS': 'sum'}).reset_index()
ugrad_cagr = ((ugrad_year_stu['N_STUDENTS'][3]/ugrad_year_stu['N_STUDENTS'][0])**(1/4))-1
print(f"The CAGR for the undergraduate students is {ugrad_cagr:.2%}")


The CAGR for the undergraduate students is 0.14%


In [53]:
# Graduate
grad_year_stu = n_grad_students.groupby('ACADEMIC_YEAR').agg({'N_STUDENTS': 'sum'}).reset_index()
grad_cagr = ((grad_year_stu['N_STUDENTS'][3]/grad_year_stu['N_STUDENTS'][0])**(1/4))-1
print(f"The CAGR for the graduate students is {grad_cagr:.2%}")


The CAGR for the graduate students is -5.05%


The expected anual growth rate will be same for the next year. Sirva de ejemplo los calculos con graduate students.

In [60]:
expected_gr = (((grad_year_stu['N_STUDENTS'][3]*(1+grad_cagr))/grad_year_stu['N_STUDENTS'][3])-1)
print(f"The expected annual growth rate for the graduate students is {expected_gr:.2%}")

The expected annual growth rate for the graduate students is -5.05%


### COVID Impact on ITESO

In [98]:
# Undergraduate student proportion by modality
total_students = n_undergrad_students.groupby(['ACADEMIC_YEAR']).agg({'N_STUDENTS': 'sum'}).reset_index()
undergrad_proportion = pd.merge(n_undergrad_students, total_students, on=['ACADEMIC_YEAR'], how='left')
undergrad_proportion['TOTAL'] = undergrad_proportion['N_STUDENTS_x']*100/undergrad_proportion['N_STUDENTS_y']
undergrad_proportion


Unnamed: 0,ACADEMIC_YEAR,C_MODALIDAD,N_STUDENTS_x,N_STUDENTS_y,TOTAL
0,2019-2020,ESCOLARIZADA,8440,10062,83.879944
1,2019-2020,MIXTA,1622,10062,16.120056
2,2020-2021,ESCOLARIZADA,8421,9899,85.069199
3,2020-2021,MIXTA,1478,9899,14.930801
4,2021-2022,ESCOLARIZADA,8312,9791,84.894291
5,2021-2022,MIXTA,1479,9791,15.105709
6,2022-2023,ESCOLARIZADA,8646,10120,85.434783
7,2022-2023,MIXTA,1474,10120,14.565217


In [108]:
undergrad_proportion = undergrad_proportion.sort_values(by=['C_MODALIDAD', 'ACADEMIC_YEAR'])
undergrad_proportion['GROWTH'] = undergrad_proportion.groupby('C_MODALIDAD')['N_STUDENTS_x'].apply(lambda x: ((x / x.shift()) - 1)*100)
undergrad_proportion

Unnamed: 0,ACADEMIC_YEAR,C_MODALIDAD,N_STUDENTS_x,N_STUDENTS_y,TOTAL,GROWTH
0,2019-2020,ESCOLARIZADA,8440,10062,83.879944,
2,2020-2021,ESCOLARIZADA,8421,9899,85.069199,-0.225118
4,2021-2022,ESCOLARIZADA,8312,9791,84.894291,-1.294383
6,2022-2023,ESCOLARIZADA,8646,10120,85.434783,4.018287
1,2019-2020,MIXTA,1622,10062,16.120056,
3,2020-2021,MIXTA,1478,9899,14.930801,-8.877928
5,2021-2022,MIXTA,1479,9791,15.105709,0.067659
7,2022-2023,MIXTA,1474,10120,14.565217,-0.338066


In [99]:
# Graduate students proportion by modality
grad_proportion = n_grad_students[n_grad_students['C_MODALIDAD'] != "NO ESCOLARIZADA"]
total_students = grad_proportion.groupby(['ACADEMIC_YEAR']).agg({'N_STUDENTS': 'sum'}).reset_index()
grad_proportion = pd.merge(grad_proportion, total_students, on=['ACADEMIC_YEAR'], how='left')
grad_proportion['TOTAL'] = grad_proportion['N_STUDENTS_x']*100/grad_proportion['N_STUDENTS_y']
grad_proportion


Unnamed: 0,ACADEMIC_YEAR,C_MODALIDAD,N_STUDENTS_x,N_STUDENTS_y,TOTAL
0,2019-2020,ESCOLARIZADA,773,866,89.26097
1,2019-2020,MIXTA,93,866,10.73903
2,2020-2021,ESCOLARIZADA,748,813,92.00492
3,2020-2021,MIXTA,65,813,7.99508
4,2021-2022,ESCOLARIZADA,727,754,96.419098
5,2021-2022,MIXTA,27,754,3.580902
6,2022-2023,ESCOLARIZADA,686,704,97.443182
7,2022-2023,MIXTA,18,704,2.556818


In [109]:
grad_proportion = grad_proportion.sort_values(by=['C_MODALIDAD', 'ACADEMIC_YEAR'])
grad_proportion['GROWTH'] = grad_proportion.groupby('C_MODALIDAD')['N_STUDENTS_x'].apply(lambda x: ((x / x.shift()) - 1)*100)
grad_proportion


Unnamed: 0,ACADEMIC_YEAR,C_MODALIDAD,N_STUDENTS_x,N_STUDENTS_y,TOTAL,GROWTH
0,2019-2020,ESCOLARIZADA,773,866,89.26097,
2,2020-2021,ESCOLARIZADA,748,813,92.00492,-3.234153
4,2021-2022,ESCOLARIZADA,727,754,96.419098,-2.807487
6,2022-2023,ESCOLARIZADA,686,704,97.443182,-5.639615
1,2019-2020,MIXTA,93,866,10.73903,
3,2020-2021,MIXTA,65,813,7.99508,-30.107527
5,2021-2022,MIXTA,27,754,3.580902,-58.461538
7,2022-2023,MIXTA,18,704,2.556818,-33.333333


### COVID Impact on pricing

In [111]:
# Undergraduate
# Group by 'ACADEMIC_YEAR' and calculate the mean of 'AVG_TUITION'
avg_tuition_ugrad = undergrad_tuition.groupby('ACADEMIC_YEAR')['AVG_TUITION'].mean().reset_index()

# Calculate the growth rate
avg_tuition_ugrad['GROWTH_RATE'] = avg_tuition_ugrad['AVG_TUITION'].pct_change() * 100

# Display the dataframe
avg_tuition_ugrad


Unnamed: 0,ACADEMIC_YEAR,AVG_TUITION,GROWTH_RATE
0,2019-2020,183666.670106,
1,2020-2021,187610.305556,2.14717


In [112]:
# Graduate
# Group by 'ACADEMIC_YEAR' and calculate the mean of 'AVG_TUITION'
avg_tuition_grad = grad_tuition.groupby('ACADEMIC_YEAR')['AVG_TUITION'].mean().reset_index()

# Calculate the growth rate
avg_tuition_grad['GROWTH_RATE'] = avg_tuition_grad['AVG_TUITION'].pct_change() * 100

# Display the dataframe
avg_tuition_grad


Unnamed: 0,ACADEMIC_YEAR,AVG_TUITION,GROWTH_RATE
0,2019-2020,142298.624339,
1,2020-2021,138327.332275,-2.790815


### Competitive landscape

When assessing the competitive landscape, one must first consider location, and in the case of ITESO, situated in Tlaquepaque within the metropolitan area of Guadalajara, it is pertinent to evaluate schools within Guadalajara and its surrounding metropolitan zone (Zapopan, Tlaquepaque and Tonalá).

Regarding high schools, the available information is limited, yet it is plausible to suggest that the market consists predominantly of private institutions within the metropolitan area of Guadalajara. High school education may present a fragmented market due to its reliance on proximity to students' residences, which differs from the dynamics of undergraduate and graduate education.

When evaluating undergraduate and graduate programs, it is essential to account for both public and private institutions, as well as various modalities such as on-site, hybrid, and online learning. Competition within this realm occurs across both private and public sectors, although not all universities are direct competitors. Among the prominent institutions in Guadalajara are:

- Universidad de Guadalajara (14MSU0010Z)
- Tecnológico de Monterrey, Campus Guadalajara (14MSU0050Z)
- Universidad Panamericana, Campus Guadalajara (14MSU0070N)
- Universidad Autónoma de Guadalajara (14MSU0028Y)
- ITESO (14MSU0044P)

This list aligns with common considerations when selecting a university, which include factors such as academic reputation, available programs, infrastructure and resources, and the cost of tuition and enrollment, including potential scholarships or financial aid.

To compare ITESO the following variables were choosen: 

 - Undergraduate: 'ACADEMIC_YEAR', 'C_NOM_MUN', 'CCT_INS_PLA', 'NOMBRE_INS_PLA', 'NOMBRECT', 'C_NOM_CARRERA','C_MODALIDAD', 'V1', 'V2', 'V3', 'V4', 'V11', 'V12', 'V13', 'V14', 'V15', 'V60', 'V61', 'V82', 'V85', 'V175', 'V176', 'V177', 'V454', 'V455', 'V456', 'V458', 'V495'
 - Graduate: 'ACADEMIC_YEAR', 'C_NOM_MUN', 'CCT_INS_PLA', 'NOMBRE_INS_PLA', 'NOMBRECT', 'C_NOM_CARRERA', 'C_MODALIDAD', 'V1', 'V2', 'V3', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V38', 'V41', 'V140','V141', 'V142', 'V156', 'V248', 'V249', 'V268', 'V269', 'V270', 'V271', 'V274'.

In [161]:
# Filter undergraduate dataset
undergrad_competitors = undergraduate[undergraduate['CCT_INS_PLA'].isin(['14MSU0010Z', '14MSU0050Z', '14MSU0070N', '14MSU0028Y', '14MSU0044P'])]
undergrad_competitors = undergrad_competitors[undergrad_competitors['CV_MUN'].isin([39, 98, 101, 120])]
undergrad_competitors = undergrad_competitors[['ACADEMIC_YEAR', 'C_NOM_MUN', 'CCT_INS_PLA', 'NOMBRE_INS_PLA', 'NOMBRECT', 'C_NOM_CARRERA', 'C_MODALIDAD', 
                                               'V1', 'V2', 'V3', 'V4', 'V11', 'V12', 'V13', 'V14', 'V15', 'V60', 'V61', 'V82', 'V85', 'V175', 
                                               'V176', 'V177', 'V454', 'V455', 'V456', 'V458', 'V495']]

# Filter graduate dataset
grad_competitors = graduate[graduate['CCT_INS_PLA'].isin(['14MSU0010Z', '14MSU0050Z', '14MSU0070N', '14MSU0028Y', '14MSU0044P'])]
grad_competitors = grad_competitors[grad_competitors['CV_MUN'].isin([39, 98, 101, 120])]
grad_competitors = grad_competitors[['ACADEMIC_YEAR', 'C_NOM_MUN', 'CCT_INS_PLA', 'NOMBRE_INS_PLA', 'NOMBRECT', 'C_NOM_CARRERA', 'C_MODALIDAD', 
                                     'V1', 'V2', 'V3', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V38', 'V41', 'V140', 'V141', 'V142', 'V156', 
                                     'V248', 'V249', 'V268', 'V269', 'V270', 'V271', 'V274']]


#### Number of students

In [147]:
# Undergraduate
n_undergrad_by_uni = undergrad_competitors[['ACADEMIC_YEAR', 'NOMBRE_INS_PLA', 'V177']].groupby(['ACADEMIC_YEAR', 'NOMBRE_INS_PLA']).agg({'V177': 'sum'}).reset_index()
n_undergrad_by_uni.rename(columns={'V177': 'N_STUDENTS'}, inplace=True)
n_undergrad_by_uni

Unnamed: 0,ACADEMIC_YEAR,NOMBRE_INS_PLA,N_STUDENTS
0,2019-2020,ITESM CAMPUS GUADALAJARA,4810
1,2019-2020,ITESO,10062
2,2019-2020,UNIVERSIDAD AUTONOMA DE GUADALAJARA,9900
3,2019-2020,UNIVERSIDAD DE GUADALAJARA,85090
4,2019-2020,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,3847
5,2020-2021,ITESM CAMPUS GUADALAJARA,4095
6,2020-2021,ITESO,9899
7,2020-2021,UNIVERSIDAD AUTONOMA DE GUADALAJARA,8968
8,2020-2021,UNIVERSIDAD DE GUADALAJARA,91222
9,2020-2021,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,3841


In [148]:
# Graduate
n_grad_by_uni = grad_competitors[['ACADEMIC_YEAR', 'NOMBRE_INS_PLA', 'V142']].groupby(['ACADEMIC_YEAR', 'NOMBRE_INS_PLA']).agg({'V142': 'sum'}).reset_index()
n_grad_by_uni.rename(columns={'V142': 'N_STUDENTS'}, inplace=True)
n_grad_by_uni

Unnamed: 0,ACADEMIC_YEAR,NOMBRE_INS_PLA,N_STUDENTS
0,2019-2020,ITESM CAMPUS GUADALAJARA,212
1,2019-2020,ITESO,866
2,2019-2020,UNIVERSIDAD AUTONOMA DE GUADALAJARA,913
3,2019-2020,UNIVERSIDAD DE GUADALAJARA,5681
4,2019-2020,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,938
5,2020-2021,ITESM CAMPUS GUADALAJARA,168
6,2020-2021,ITESO,813
7,2020-2021,UNIVERSIDAD AUTONOMA DE GUADALAJARA,827
8,2020-2021,UNIVERSIDAD DE GUADALAJARA,5645
9,2020-2021,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,897


#### Number of programs

In [156]:
# Undergraduate
undergrad_nprograms = undergrad_competitors[['ACADEMIC_YEAR', 'NOMBRE_INS_PLA', 'C_NOM_CARRERA']].drop_duplicates()
undergrad_nprograms = undergrad_nprograms.groupby(['ACADEMIC_YEAR', 'NOMBRE_INS_PLA']).size().reset_index(name='N_PROGRAMS')
undergrad_nprograms

Unnamed: 0,ACADEMIC_YEAR,NOMBRE_INS_PLA,N_PROGRAMS
0,2019-2020,ITESM CAMPUS GUADALAJARA,30
1,2019-2020,ITESO,39
2,2019-2020,UNIVERSIDAD AUTONOMA DE GUADALAJARA,68
3,2019-2020,UNIVERSIDAD DE GUADALAJARA,102
4,2019-2020,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,20
5,2020-2021,ITESM CAMPUS GUADALAJARA,30
6,2020-2021,ITESO,43
7,2020-2021,UNIVERSIDAD AUTONOMA DE GUADALAJARA,84
8,2020-2021,UNIVERSIDAD DE GUADALAJARA,102
9,2020-2021,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,23


In [157]:
# Graduate
grad_nprograms = grad_competitors[['ACADEMIC_YEAR', 'NOMBRE_INS_PLA', 'C_NOM_CARRERA']].drop_duplicates()
grad_nprograms = grad_nprograms.groupby(['ACADEMIC_YEAR', 'NOMBRE_INS_PLA']).size().reset_index(name='N_PROGRAMS')
grad_nprograms


Unnamed: 0,ACADEMIC_YEAR,NOMBRE_INS_PLA,N_PROGRAMS
0,2019-2020,ITESM CAMPUS GUADALAJARA,7
1,2019-2020,ITESO,30
2,2019-2020,UNIVERSIDAD AUTONOMA DE GUADALAJARA,49
3,2019-2020,UNIVERSIDAD DE GUADALAJARA,264
4,2019-2020,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,45
5,2020-2021,ITESM CAMPUS GUADALAJARA,7
6,2020-2021,ITESO,32
7,2020-2021,UNIVERSIDAD AUTONOMA DE GUADALAJARA,58
8,2020-2021,UNIVERSIDAD DE GUADALAJARA,278
9,2020-2021,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,51


#### Cost of attendance

In [None]:
# Undergraduate
undergrad_competitors %>%
mutate(LEVEL = case_when(V1 == 1 ~'TECNICO SUPERIOR',
                 V2 == 1 ~ 'LINCENCIATURA',
                 V3 == 1 ~ 'LINCENCIATURA',
                 V4 == 1 ~ 'LINCENCIATURA',
                 TRUE ~ 'NOT_DEFINED')) %>%%!
select(ACADEMIC_YEAR, NOMBRE_INS_PLA, LEVEL, V455, V456)

In [164]:
# Assign the level of the program
undergrad_competitors['LEVEL'] = np.where(undergrad_competitors['V1'] == 1, 'TECNICO SUPERIOR',
                                          np.where(undergrad_competitors['V2'] == 1, 'LINCENCIATURA',
                                                   np.where(undergrad_competitors['V3'] == 1, 'LINCENCIATURA',
                                                            np.where(undergrad_competitors['V4'] == 1, 'LINCENCIATURA', 'NOT_DEFINED'))))

ug_cost_by_uni = undergrad_competitors[['ACADEMIC_YEAR', 'NOMBRE_INS_PLA', 'LEVEL', 'V455', 'V456']].reset_index()

# Tuition fees = Enrollment + Tuition
ug_cost_by_uni['TUITION_FEES'] = ug_cost_by_uni['V455'] + ug_cost_by_uni['V456']
# Filter the years 2019-2020 and 2020-2021
ug_cost_by_uni = ug_cost_by_uni[~cost_by_uni ['ACADEMIC_YEAR'].isin(['2021-2022', '2022-2023'])]
# Remove the rows with 0 tuition fees
ug_cost_by_uni  = ug_cost_by_uni[cost_by_uni ['TUITION_FEES'] > 0] # This will drop 'UNIVERSIDAD DE GUADALAJARA'
# Group by academic year and faculty
ug_cost_by_uni = ug_cost_by_uni.groupby(['ACADEMIC_YEAR', 'NOMBRE_INS_PLA', 'LEVEL']).agg(AVG_TUITION=('TUITION_FEES', 'mean')).reset_index()
ug_cost_by_uni 

Unnamed: 0,ACADEMIC_YEAR,NOMBRE_INS_PLA,LEVEL,AVG_TUITION
0,2019-2020,ITESM CAMPUS GUADALAJARA,LINCENCIATURA,118700.0
1,2019-2020,ITESO,LINCENCIATURA,185888.157895
2,2019-2020,UNIVERSIDAD AUTONOMA DE GUADALAJARA,LINCENCIATURA,127135.714286
3,2019-2020,UNIVERSIDAD AUTONOMA DE GUADALAJARA,TECNICO SUPERIOR,84850.0
4,2019-2020,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,LINCENCIATURA,141952.347826
5,2020-2021,ITESM CAMPUS GUADALAJARA,LINCENCIATURA,252340.740741
6,2020-2021,ITESO,LINCENCIATURA,189476.904762
7,2020-2021,UNIVERSIDAD AUTONOMA DE GUADALAJARA,LINCENCIATURA,142141.489362
8,2020-2021,UNIVERSIDAD AUTONOMA DE GUADALAJARA,TECNICO SUPERIOR,95450.0
9,2020-2021,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,LINCENCIATURA,169939.592593


In [167]:
# Assign the level of the program
grad_cost_by_uni = grad_competitors[['ACADEMIC_YEAR', 'NOMBRE_INS_PLA', 'V269', 'V270']].reset_index()

# Tuition fees = Enrollment + Tuition
grad_cost_by_uni['TUITION_FEES'] = grad_cost_by_uni['V269'] + grad_cost_by_uni['V270']
# Filter the years 2019-2020 and 2020-2021
grad_cost_by_uni = grad_cost_by_uni[~grad_cost_by_uni['ACADEMIC_YEAR'].isin(['2021-2022', '2022-2023'])]
# Remove the rows with 0 tuition fees
grad_cost_by_uni = grad_cost_by_uni[grad_cost_by_uni['TUITION_FEES'] > 0] # Universidad de Guadalajara excluded
# Group by academic year and faculty
grad_cost_by_uni = grad_cost_by_uni.groupby(['ACADEMIC_YEAR', 'NOMBRE_INS_PLA']).agg(AVG_TUITION=('TUITION_FEES', 'mean')).reset_index()
grad_cost_by_uni

Unnamed: 0,ACADEMIC_YEAR,NOMBRE_INS_PLA,AVG_TUITION
0,2019-2020,ITESM CAMPUS GUADALAJARA,78968.0
1,2019-2020,ITESO,141274.285714
2,2019-2020,UNIVERSIDAD AUTONOMA DE GUADALAJARA,90017.567568
3,2019-2020,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,189915.0
4,2020-2021,ITESM CAMPUS GUADALAJARA,242640.0
5,2020-2021,ITESO,142997.5
6,2020-2021,UNIVERSIDAD AUTONOMA DE GUADALAJARA,98195.714286
7,2020-2021,UNIVERSIDAD PANAMERICANA CAMPUS GUADALAJARA,239303.306122


#### Scholarships

In [None]:
# Graduate