In [43]:
# Import the pandas library and give it an alias 'pd'
import pandas as pd

In [44]:
# Import the 'drive' module from the 'google.colab' library
from google.colab import drive

# Expose the user's Google Drive inside the Colab runtime so that files stored
# there can be opened through ordinary filesystem paths under the mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
# Load the raw 'docentes' CSV from the mounted Google Drive into the DataFrame 'df'.
# The path only resolves after the Drive has been mounted at /content/drive.
RAW_CSV_PATH = '/content/drive/MyDrive/Proyecto/docentes_RAW.csv'
df = pd.read_csv(RAW_CSV_PATH)

In [46]:
# Store the dimensions of the DataFrame 'df'.
# 'df.shape' is a tuple of (number_of_rows, number_of_columns).
# Note: the value is only assigned here, so this cell displays nothing;
# evaluate 'shape' on its own line to see it.
shape = df.shape

In [47]:
# Store the column labels of the DataFrame 'df'.
# 'df.columns' is a pandas Index, not a plain Python list; use
# 'df.columns.tolist()' if a real list is required.
# Note: the value is only assigned here, so this cell displays nothing.
columns_list = df.columns

In [49]:
# Inspect the end of the raw table.
# 'df.tail()' returns the last 5 rows of the DataFrame by default; being the
# last expression of the cell, the result is rendered as the cell output.
df.tail()

Unnamed: 0,DOCENTE,ASIGNATURAS,GRUPO,HORAS,COMPONENTE
164,MÓDULO,módulo iv. diseña y gestiona bases de datos of...,5B,,Módulo
165,MÓDULO,módulo iv. diseña y gestiona bases de datos of...,5F,,Módulo
166,MÓDULO,módulo iv. diseña y mantiene los sistemas de i...,5D,,Módulo
167,MÓDULO,"módulo iv. mantiene equipos hidráulicos, neumá...",5E,,Módulo
168,MÓDULO,"módulo iv. mantiene equipos hidráulicos, neumá...",5H,,Módulo


In [51]:
# Inspect the data type pandas inferred for each column of 'df'.
# 'df.dtypes' returns a Series indexed by column name whose values are the dtypes.
df.dtypes

DOCENTE         object
ASIGNATURAS     object
GRUPO           object
HORAS          float64
COMPONENTE      object
dtype: object

In [52]:
# Title-case the 'DOCENTE' column ('str.title()' capitalizes the first letter of
# every word) and keep the result as the Series 'df_docentes'.
df_docentes = df['DOCENTE'].str.title()

# Build a one-column DataFrame ('DOCENTES') holding each distinct title-cased
# value once, in order of first appearance.
df_docentes_unicos = pd.DataFrame({'DOCENTES': df_docentes.unique()})

In [53]:
# Overwrite the 'DOCENTE' column of 'df' with the title-cased Series 'df_docentes'.
# This mutates 'df' in place, so any earlier displayed output of 'df' is now stale.
df['DOCENTE'] = df_docentes

# Show the end of the unique-docentes table.
# 'tail()' returns the last 5 rows of the DataFrame by default.
df_docentes_unicos.tail()

Unnamed: 0,DOCENTES
54,Sanchez Carrizales Andrea Sofia
55,Torres Vazquez Luis Miguel
56,Lopez Romero Maria Fernanda
57,Sanchez Solano Andres Felipe
58,Módulo


In [54]:
# Count the unique docentes, i.e. the number of rows in 'df_docentes_unicos'.
# 'len()' is used instead of '.size' because '.size' is rows * columns and only
# equals the row count while the frame has exactly one column.
# NOTE(review): the placeholder value 'Módulo' appears among the unique entries,
# so it is included in this count — confirm whether it should be excluded.
len(df_docentes_unicos)

59

In [55]:
# Upper-case the 'ASIGNATURAS' column with 'str.upper()' and keep the result as
# the Series 'df_asignaturas'.
df_asignaturas = df['ASIGNATURAS'].str.upper()

# Build a one-column DataFrame ('ASIGNATURAS') holding each distinct upper-cased
# value once, in order of first appearance.
df_asignaturas_unicas = pd.DataFrame({'ASIGNATURAS': df_asignaturas.unique()})


In [56]:
# Overwrite the 'ASIGNATURAS' column of 'df' with the upper-cased Series 'df_asignaturas'.
# This mutates 'df' in place, so any earlier displayed output of 'df' is now stale.
df['ASIGNATURAS'] = df_asignaturas

# Show the end of the unique-asignaturas table.
# 'tail()' returns the last 5 rows of the DataFrame by default.
df_asignaturas_unicas.tail()

Unnamed: 0,ASIGNATURAS
44,MÓDULO IV. DESARROLLA SOFTWARE DE APLICACIÓN W...
45,MÓDULO IV. DETERMINA LAS CONTRIBUCIONES FISCAL...
46,MÓDULO IV. DISEÑA Y GESTIONA BASES DE DATOS OF...
47,MÓDULO IV. DISEÑA Y MANTIENE LOS SISTEMAS DE I...
48,"MÓDULO IV. MANTIENE EQUIPOS HIDRÁULICOS, NEUMÁ..."


In [57]:
# Replace the missing values (NaN) in the "HORAS" column with 0.
# The filled column is assigned back to 'df' rather than calling
# 'fillna(..., inplace=True)' on 'df["HORAS"]': that form acts on an intermediate
# object, emits a FutureWarning on pandas 2.x and leaves 'df' unchanged once
# Copy-on-Write is active (the default behaviour from pandas 3.0).
df["HORAS"] = df["HORAS"].fillna(0)

In [58]:
# Convert the values in the "HORAS" column to integers.
# 'astype(int)' raises if the column still contains NaN, so this cell relies on
# the missing values having been filled beforehand.
# Any fractional part would be truncated by the cast.
df["HORAS"] = df["HORAS"].astype(int)

In [60]:
# Display the cleaned DataFrame (title-cased DOCENTE, upper-cased ASIGNATURAS,
# integer HORAS). Jupyter truncates the rendering for long frames.
df

Unnamed: 0,DOCENTE,ASIGNATURAS,GRUPO,HORAS,COMPONENTE
0,Garcia Hernandez Luis Eduardo,ÁLGEBRA,1B,4,Asignatura
1,Ramirez Perez Juana Maria,ÁLGEBRA,1F,4,Asignatura
2,Lopez Rodriguez Antonio Manuel,ÁLGEBRA,1A,4,Asignatura
3,Lopez Rodriguez Antonia Maria,ÁLGEBRA,1C,4,Asignatura
4,Lopez Rodriguez Antonia Maria,ÁLGEBRA,1D,4,Asignatura
...,...,...,...,...,...
164,Módulo,MÓDULO IV. DISEÑA Y GESTIONA BASES DE DATOS OF...,5B,0,Módulo
165,Módulo,MÓDULO IV. DISEÑA Y GESTIONA BASES DE DATOS OF...,5F,0,Módulo
166,Módulo,MÓDULO IV. DISEÑA Y MANTIENE LOS SISTEMAS DE I...,5D,0,Módulo
167,Módulo,"MÓDULO IV. MANTIENE EQUIPOS HIDRÁULICOS, NEUMÁ...",5E,0,Módulo


In [61]:
# Total hours per docente: group 'df' by 'DOCENTE' and add up 'HORAS'.
# 'as_index=False' keeps 'DOCENTE' as a regular column, so the result is a
# DataFrame with the two columns 'DOCENTE' and 'HORAS'.
horas_por_docente = df.groupby('DOCENTE', as_index=False)['HORAS'].sum()

In [62]:
# Rank docentes from most to fewest total hours, then renumber the rows from 0.
# 'drop=True' discards the pre-sort index instead of keeping it as a column.
horas_por_docente = (
    horas_por_docente
    .sort_values(by='HORAS', ascending=False)
    .reset_index(drop=True)
)

In [66]:
# Display the hours-per-docente table, sorted by 'HORAS' in descending order.
horas_por_docente

Unnamed: 0,DOCENTE,HORAS
0,Reyes Rodriguez Esteban Alejandro,31
1,Ojeda Rivera Lucia Maria,31
2,Nuñez Mejia Pedro Pablo,27
3,Orozco Ramirez Pablo Ismael,25
4,Magaña Silva Maria Guadalupe,25
5,Velazquez Hernandez Maria Elena,24
6,Gonzalez Lopez Ana Luisa,24
7,Campos Garcia Juan Carlos,20
8,Garcia Alvarez Andres,20
9,Rodriguez Rosales Luis Enrique,20


In [68]:
# Subjects and docentes of group 1F: select, in a single '.loc' step, the rows
# whose 'GRUPO' equals '1F' and only the 'ASIGNATURAS' and 'DOCENTE' columns.
grupo_1f = df.loc[df['GRUPO'] == '1F', ['ASIGNATURAS', 'DOCENTE']]

# Display the filtered table.
grupo_1f

Unnamed: 0,ASIGNATURAS,DOCENTE
1,ÁLGEBRA,Ramirez Perez Juana Maria
80,INGLÉS I,Orozco Ramirez Pablo Ismael
104,"LECTURA, EXPRESIÓN ORAL Y ESCRITA I",Gonzalez Lopez Ana Luisa
115,LÓGICA,Odriz Nazario Fernando
120,QUÍMICA I,Campos Garcia Juan Carlos
150,TECNOLOGÍAS DE LA INFORMACIÓN Y LA COMUNICACIÓN,Lopez Romero Maria Fernanda


In [69]:
# Assignments of the docente 'Nuñez Mejia Pedro Pablo': every matching row of
# 'df', restricted to the 'ASIGNATURAS', 'GRUPO' and 'HORAS' columns.
pedro_pablo = df.loc[
    df['DOCENTE'] == 'Nuñez Mejia Pedro Pablo',
    ['ASIGNATURAS', 'GRUPO', 'HORAS'],
]

# Sum of 'HORAS' over those rows.
total_horas = pedro_pablo['HORAS'].sum()

# One-row summary frame with the same columns: a 'Total Horas' label, a blank
# placeholder for 'GRUPO', and the summed hours.
total_row = pd.DataFrame({
    'ASIGNATURAS': ['Total Horas'],
    'GRUPO': [' '],
    'HORAS': [total_horas],
})

# Append the summary row below the detail rows; 'ignore_index=True' renumbers
# the combined frame from 0.
pedro_pablo = pd.concat([pedro_pablo, total_row], ignore_index=True)

# Display the detail rows followed by the total.
pedro_pablo


Unnamed: 0,ASIGNATURAS,GRUPO,HORAS
0,QUÍMICA I,1C,4
1,QUÍMICA I,1D,4
2,QUÍMICA I,1E,4
3,REALIZA MANTENIMIENTO A LAS INSTALACIONES ELÉC...,5D,4
4,REGISTRA INFORMACIÓN CONTABLE EN FORMA ELECTRÓ...,3G,11
5,Total Horas,,27
