# Dataframes merge

After exploring the data and doing some minor cleaning, we merge the preinscriptions, matricules and notes data into a single dataframe that contains only those students who matriculated after their preinscription.

In [1]:
import pandas as pd
import re
from unidecode import unidecode

In [2]:
# Load the processed tables; the names on the left line up one-to-one, in order,
# with the CSV paths on the right.
(
    students_df,
    uni_access_df,
    previous_education_df,
    preinscripcions_df,
    matricules_df,
    notes_df,
    titulacions_df,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/students_df.csv',
        '../data/processed/uni_access_df.csv',
        '../data/processed/previous_education_df.csv',
        '../data/processed/preinscripcions_df.csv',
        '../data/processed/matricules_df.csv',
        '../data/processed/notes_df.csv',
        '../data/processed/titulacions_df.csv',
    )
]

In [3]:
# Collect one report line per table, then emit them in a single print call.
shape_report = [
    f'Preinscripcions shape:{preinscripcions_df.shape}',
    f'Matricules shape:{matricules_df.shape}',
    f'Notes shape:{notes_df.shape}',
    '---------',
    f'Students shape:{students_df.shape}',
    f'University access info shape:{uni_access_df.shape}',
    f'Previous education shape:{previous_education_df.shape}',
]
print('\n'.join(shape_report))

Preinscripcions shape:(2280, 33)
Matricules shape:(1713, 6)
Notes shape:(41327, 13)
---------
Students shape:(2154, 14)
University access info shape:(2280, 11)
Previous education shape:(2280, 10)


In [4]:
#We first merge preinscripcions and matricules to find out how many students actually matriculate after the preinscription.
#This first check is an outer join on DNI only; the merge that is kept further below joins on DNI and codi_grau, in case a student did preinscriptions and matriculated multiple times.

In [5]:
# Diagnostic outer join on DNI; indicator=True adds a `_merge` column recording
# which side(s) each row came from.
check_data_merge = preinscripcions_df.merge(
    matricules_df,
    how='outer',
    on='DNI',
    indicator=True,
)

In [6]:
# Row/column count of the diagnostic outer merge (includes the `_merge` indicator column).
check_data_merge.shape

(2427, 39)

In [7]:
# List the merged columns. codi_grau exists in both inputs and is not a join key
# here, so pandas suffixes it as codi_grau_x / codi_grau_y.
check_data_merge.columns

Index(['curs_preinscripcio', 'convocatoria', 'DNI', 'codi_centre_secundaria',
       'nacionalitat', 'pais', 'CP', 'poblacio', 'comarca', 'provincia',
       'sexe', 'data_naixement', 'via_acces', 'nota_acces', 'codi_grau_x',
       'ordre_assignacio', 'any_qualificacio', 'convocatoria_qualificacio',
       'estudis_pare', 'estudis_mare', 'ocupacio_pare', 'ocupacio_mare',
       'treball_remunerat', 'orientacio', 'aspectes_eleccio',
       'any_acabament_secundaria', 'tipus_centre_secundaria',
       'tipus_lloc_secundaria', 'codi_lloc_secundaria',
       'estudis_universitaris', 'any_acces_primera_vegada_universitat',
       'titulacio_nivell_assolit', 'modalitat_acces', 'identification',
       'curs_matricula', 'NIA', 'pla', 'codi_grau_y', '_merge'],
      dtype='object')

In [8]:
# Rows flagged 'right_only' are present in matricules_df but have no preinscription
# with the same DNI. This shouldn't happen (125 such rows in the recorded run).
# Two situations might explain these cases:
#   1. Something is wrong with the 2019 data: about two thirds of the cases
#      (84 of 125 in the recorded run) are from 2019.
#   2. Students used their passport for the preinscription and their DNI for the
#      matriculation, so there is no possible match.
# Optional export for manual inspection:
#   only_in_matricules.to_excel('../data/only_in_matricules.xlsx', index=False)
unmatched_mask = check_data_merge['_merge'] == 'right_only'
only_in_matricules = check_data_merge.loc[unmatched_mask].copy()
print(only_in_matricules['curs_matricula'].value_counts())
only_in_matricules.shape

curs_matricula
2019.0    84
2014.0    13
2016.0     8
2021.0     6
2017.0     4
2022.0     3
2015.0     3
2023.0     2
2018.0     1
2020.0     1
Name: count, dtype: int64


(125, 39)

In [9]:
# Keep only preinscriptions that have a matriculation for the same student (DNI)
# and the same degree (codi_grau).
first_merge = preinscripcions_df.merge(
    matricules_df,
    how='inner',
    on=['DNI', 'codi_grau'],
)

In [10]:
# Per the original note, notes_df has no grades for students matriculated in 2023.
# Those students are set aside and saved as students_2023 so they can be used to
# test the model later; the rest of the data is what the model is built on.
# This cell must run before first_merge is filtered to exclude 2023 (next steps);
# otherwise the selection below is empty.
is_2023_cohort = first_merge['curs_matricula'].eq(2023)
students_2023 = first_merge.loc[is_2023_cohort]
students_2023.to_csv('../data/processed/students_2023.csv', index=False)

In [11]:
# Size of the inner merge while it still contains the 2023 rows.
first_merge.shape

(1569, 37)

In [12]:
# Exclude the students matriculated in 2023 from the merged
# preinscriptions/matriculations dataframe.
not_2023_cohort = first_merge['curs_matricula'].ne(2023)
first_merge = first_merge.loc[not_2023_cohort]

In [13]:
# Size of first_merge after the curs_matricula != 2023 filter.
first_merge.shape

(1355, 37)

In [14]:
# Attach the rows from notes_df to each matriculated student, matching on the
# NIA and pla columns.
final_merge = first_merge.merge(
    notes_df,
    how='inner',
    on=['NIA', 'pla'],
)

In [15]:
# Keep only the rows of each student's first year, i.e. the rows where the
# matriculation year (curs_matricula) equals any_academic from the notes data.
condition = final_merge['curs_matricula'].eq(final_merge['any_academic'])
final_merge = final_merge.loc[condition]

In [16]:
# Remove the columns that are not carried forward. Only one identification column
# is needed: DNI is redundant with NIA, so NIA is the one that stays.
redundant_columns = ['curs', 'identification', 'DNI']
final_merge = final_merge.drop(columns=redundant_columns)

In [17]:
# Shape after the any_academic filter and the column drops.
final_merge.shape

(11426, 45)

In [18]:
# Inspect the remaining columns of final_merge.
final_merge.columns

Index(['curs_preinscripcio', 'convocatoria', 'codi_centre_secundaria',
       'nacionalitat', 'pais', 'CP', 'poblacio', 'comarca', 'provincia',
       'sexe', 'data_naixement', 'via_acces', 'nota_acces', 'codi_grau',
       'ordre_assignacio', 'any_qualificacio', 'convocatoria_qualificacio',
       'estudis_pare', 'estudis_mare', 'ocupacio_pare', 'ocupacio_mare',
       'treball_remunerat', 'orientacio', 'aspectes_eleccio',
       'any_acabament_secundaria', 'tipus_centre_secundaria',
       'tipus_lloc_secundaria', 'codi_lloc_secundaria',
       'estudis_universitaris', 'any_acces_primera_vegada_universitat',
       'titulacio_nivell_assolit', 'modalitat_acces', 'curs_matricula', 'NIA',
       'pla', 'any_academic', 'assignatura', 'codi_assignatura', 'grup',
       'tipus', 'semestre', 'exhaurides', 'superada', 'nota_numerica',
       'nota_alfa'],
      dtype='object')

In [20]:
# Desired column order for the exported dataframe. Selecting with this list also
# drops any column that is not named here, so it must list every column to keep.
order_columns = [
    # student identifier and preinscription / matriculation years
    'NIA',
    'curs_preinscripcio',
    'convocatoria',
    'curs_matricula',
    # personal and residence columns
    'sexe',
    'data_naixement',
    'nacionalitat',
    'pais',
    'CP',
    'poblacio',
    'comarca',
    'provincia',
    # access and assignment columns
    'via_acces',
    'nota_acces',
    'ordre_assignacio',
    'orientacio',
    'aspectes_eleccio',
    'any_qualificacio',
    'convocatoria_qualificacio',
    # work and family background columns
    'treball_remunerat',
    'estudis_pare',
    'estudis_mare',
    'ocupacio_pare',
    'ocupacio_mare',
    # secondary-school columns
    'codi_centre_secundaria',
    'any_acabament_secundaria',
    'tipus_centre_secundaria',
    'tipus_lloc_secundaria',
    'codi_lloc_secundaria',
    # previous university studies columns
    'estudis_universitaris',
    'any_acces_primera_vegada_universitat',
    'titulacio_nivell_assolit',
    'modalitat_acces',
    # degree, subject and grade columns
    'codi_grau',
    'pla',
    'any_academic',
    'assignatura',
    'codi_assignatura',
    'grup',
    'tipus',
    'semestre',
    'exhaurides',
    'superada',
    'nota_numerica',
    'nota_alfa',
]

final_merge = final_merge.loc[:, order_columns]

In [21]:
# Persist the merged dataframe as CSV without the index column.
# NOTE(review): this writes derived data under ../data/raw, while every input and
# students_2023.csv live under ../data/processed — confirm the destination is
# intentional before changing it, since other notebooks may read this path.
final_merge.to_csv('../data/raw/final_merge.csv', index=False)

In [22]:
# For context on the numbers: count the rows per year in the preinscription list
# and in the matriculation list, and show the difference between the two.
preinscripcions_counts = (
    preinscripcions_df['curs_preinscripcio']
    .value_counts()
    .sort_index()
)
matricula_counts = (
    matricules_df['curs_matricula']
    .value_counts()
    .sort_index()
)

# Build the two count columns first (aligned on the year index), then derive the
# difference from those aligned columns.
counts_df = pd.DataFrame(
    {
        'Curs_Preinscripcio': preinscripcions_counts,
        'Curs_Matricula': matricula_counts,
    }
).assign(
    Count_Difference=lambda counts: counts['Curs_Preinscripcio'] - counts['Curs_Matricula']
)

counts_df

Unnamed: 0,Curs_Preinscripcio,Curs_Matricula,Count_Difference
2014,206,180,26
2015,183,143,40
2016,203,173,30
2017,197,160,37
2018,180,124,56
2019,222,159,63
2020,192,133,59
2021,266,205,61
2022,318,217,101
2023,313,219,94
