# Merge of dataframes

After exploring the data and doing minor cleaning we are going to merge the data of preinscriptions, matricules and notes so we end up with a unique dataframe that contains only those students that got matriculated after the preinscription. 

In [1]:
import pandas as pd

In [2]:
preinscripcions_df = pd.read_csv('../data/processed/preinscripcions_df.csv')
matricules_df = pd.read_csv('../data/processed/matricules_df.csv')
notes_df = pd.read_csv('../data/processed/notes_df.csv')
titulacions_df = pd.read_csv('../data/processed/titulacions_df.csv')

In [3]:
#We want to get a df with a list of preinscripcions that ended up being matriculations

#One same student could be twice in preinscription list for different degrees, 
    #If matriculated, this would be reflected as two rows in the matriculation df with same DNI but different codi_grau
    #However, in matricules_df we do not have 'codi_grau', we only have 'pla' so we will first use the information on the 
        #titulacions_df to create the codi_grau

In [4]:
def assign_codi_grau(row):
    if row['pla'] in [1467, 1278]:
        return 21112
    elif row['pla'] in [877, 1464]:
        return 21025
    elif row['pla'] == 878:
        return 21023
    elif row['pla'] in [879, 1462]:
        return 21026
    elif row['pla'] == 1450:
        return 21032
    elif row['pla'] == 1430:
        return 21027
    elif row['pla'] in [880, 1466]:
        return 21022
    else:
        return np.nan

matricules_df['codi_grau'] = matricules_df.apply(assign_codi_grau, axis=1)

In [5]:
# To be able to group by student and codi_grau we create a common field
matricules_df['codi_grau'] = matricules_df['codi_grau'].astype(str)
matricules_df['identification'] = matricules_df[['DNI', 'codi_grau']].agg(' / '.join, axis=1)

In [6]:
matricules_df = matricules_df.sort_values(by='curs_matricula')
matricules_df = matricules_df.groupby('identification').first().reset_index()

In [7]:
matricules_df.shape

(2548, 6)

In [9]:
def filter_matriculations(DNI,matricules_df):
    matriculations = matricules_df[matricules_df['DNI'] == DNI]
    for index, row in matriculations.iterrows():
        codi_graus = list(matriculations.codi_grau)
        if '21112' in codi_graus and len(set(matriculations['curs_matricula'])) == 1:
            rows_to_drop = matriculations[(matriculations['codi_grau'] == '21025') | (matriculations['codi_grau'] == '21026')].index
            matricules_df = matricules_df.drop(rows_to_drop)
    return matricules_df

In [10]:
dni_counts_dict = matricules_df['DNI'].value_counts().to_dict()
dni_list_3_counts = [DNI for DNI, count in dni_counts_dict.items() if count == 3]
for dni in dni_list_3_counts:
    filter_matriculations(dni,matricules_df)

KeyError: '[1236, 1237] not found in axis'

In [None]:
matricules_df.shape

In [None]:
filtered_matricules_df = []
for i in matricules_df

In [None]:
# We observe that in many cases where the ID is repeated 3 times it is because students from the double degree appear
#matriculated 3 times on the same year. We are going to reduce this:

def reduce_duplicated_matriculations(df):
    matricules_df = matricules_df.groupby('DNI')
    if matricules_df['codi_grau'][0] == 21025  

In [None]:
matricules_df['curs_matricula'].value_counts().sort_index()
#Since we have data from 2013 that year is adding all uni students matriculated that year (1st appearance of their DNI).
#We will drop students with curs_matricula = 2013 so we can properly work with data starting at 2014
matricules_df = matricules_df[matricules_df['curs_matricula'] > 2013]

In [None]:
#To have some context on numbers let's check how many rows appear in the preinscripcio list and in the matriculations list.
preinscripcions_counts = preinscripcions_df['curs_preinscripcio'].value_counts().sort_index()
matricula_counts = matricules_df['curs_matricula'].value_counts().sort_index()

counts_df = pd.DataFrame({
    'Curs_Preinscripcio': preinscripcions_counts,
    'Curs_Matricula': matricula_counts,
    'Count_Difference': preinscripcions_counts - matricula_counts
})

counts_df

In [None]:
#OJO! En 2017 hay una matrícula que no aparece en preinscripción

In [None]:
column00 = matricules_df['DNI']
column0 = matricules_df['first_matriculation']
column1 = preinscripcions_df['curs_preinscripcio']  # Replace 'column_name1' with the actual column name from df1
column2 = matricules_df['curs_matricula']  # Replace 'column_name2' with the actual column name from df2

comparative_df = pd.concat([column00, column0, column1, column2], axis=1)

comparative_df[comparative_df['curs_matricula']==2017.0].head(60)


In [None]:
#Tried left and inner join between preinscriptions_df and matricules_df on DNI:
    #Inner join shape: (1677, 36)
    #Left join (pre_df,matricules_df): (2280, 36)
    #Righ join (pre_df,matricules_df): (2234,36)
        # ---> 603 students are in preeinscription list but do not appear in matricules
        # ---> 557 students matriculated but do not appear in preinscriptions lists (STUDENTS THAT WERE IN PREVIOUS PREINSCRIPTION LISTS)

In [None]:
merged_preinscriptions_matricules = pd.merge(pre_df, matricules_df, on='DNI', how='right')
merged_preinscriptions_matricules.shape

In [None]:
merged_preinscriptions_matricules.columns

In [None]:
students_counts_combined = merged_preinscriptions_matricules.groupby(['Curs_Preinscripcio', 'Curs_Matricula']).size().reset_index(name='Count')
students_counts_combined
#Merge DNI i titulació! Utilitzar taula de titulacions. És possible que ja estigui cursant algo i entri a un altre. 
#Pla i preeinscripció.

In [None]:
#Check relationship between Curs Preinscripció and Curs Matrícula
pivot_table_counts = merged_preinscriptions_matricules.pivot_table(index='Curs_Preinscripcio', columns='Curs_Matricula', aggfunc='size', fill_value=0)
table_df = pivot_table_counts.reset_index()
table_df

In [None]:
#Tried left and inner join between matricules_df and notes_df on NIA:
    # inner join shape: (41327, 16)
    # left join shape: (41613,16)
    # righ join shape: (41327, 16)
        # ---> 286 students appear in matricules but do not have grades; STUDENTS OF 2023!!!
        # ---> All students with grades appear in matricules. 

In [None]:
#The df contains all matriculations that a student(DNI) has made. Since we are only analyzing the first semester of the student
#we are interested in keeping only the first year (lowest year) a student(DNI) got matriculated. 
matricules_df = matricules_df.sort_values(by='curs_matricula')
matricules_df = matricules_df.groupby('DNI').first().reset_index()
matricules_df.shape

In [None]:
merged_matricules_notes = pd.merge( matricules_df, notes_df, on='NIA', how='left')
merged_matricules_notes

## Complete merge

In [None]:
merged_preinscriptions_matricules = pd.merge(pre_df, matricules_df, on='DNI', how='inner')
final_merged_df = pd.merge(merged_preinscriptions_matricules, notes_df, on='NIA', how='inner')

final_merged_df.columns

In [None]:
final_merged_df_sorted = final_merged_df.sort_values(by='Curs_Matricula')

In [None]:
filtered_df = final_merged_df_sorted[final_merged_df_sorted['Curs_Matricula'] == final_merged_df_sorted['ANO_ACADEMICO']]

In [None]:
filtered_df.shape

In [None]:
group_nia = filtered_df.groupby('NIA').

In [None]:
for name, group in group_nia:
    print(f"Group: {name}")
    print(group.head())  # Display the first few rows of the group
    print("\n")

In [None]:
average_notanumericacta = group_nia['NOTANUMERICAACTA'].mean().head(10)

# Display the resulting Series
print(average_notanumericacta)

In [None]:
count_zeros = (average_notanumericacta < 5).sum()

In [None]:
count_zeros

In [None]:
#Assignatures segon semestre: NP o NV --> Ho han deixat
#Número d'assignatures suspeses
#Primera vegada matriculat l'aproba? 