 - Input the 2 data tables for East and West students
 - For the East table, fix the Student ID column so that it is just the 5-digit numeric code.
 - For the West table, split the Student ID column to leave one column with the 5-digit numeric Student ID and one newly formed Region column.
 - For both tables use the First & Last name columns to create a new “Full Name” column in all capital letters.
 - Make sure both tables have the same date format
 - Convert the values in the Grade column in the West table from numbers to letters using the following system:
 - 1 = A, 2 = B, 3 = C, 4 = D, 5 = E, 6 = F
 - Combine the 2 tables to form one table with 12,000 rows
 - Create a new “Grade Score” Column and give each row a score according to the following system:
 - A = 50, B = 40, C = 30, D = 20, E = 10, F = 0
 - Find a way to transform the table so that each student is represented by only ONE row with 3 individual columns for their English, Maths & Science grades as well as a column with their total score from all of these grades combined.
 - Input the School Lookup table and combine it with the current student table to allow us to see which school each student is attending.
 - Clean and organise your table to match the output shown below.
 - Output the data (and keep it for next week!)

In [1]:
import os
import pandas as pd
import numpy as np 


In [2]:
# Load the two raw student extracts; the relative paths mean both CSV files
# must sit in the current working directory.
east_df = pd.read_csv("East Students.csv")
west_df = pd.read_csv("West Students.csv")

In [3]:
# Normalise the headers of both tables to snake_case (lower-case, trimmed,
# spaces replaced by underscores) so later cells can use attribute access.
for frame in (east_df, west_df):
    frame.columns = frame.columns.str.lower().str.strip().str.replace(' ', '_')

In [4]:
# Preview the first rows of the West table: IDs look like "16141-WEST",
# dates are dd/mm/yyyy and grades are numeric.
west_df.head()

Unnamed: 0,unique_row_id,student_id,first_name,last_name,date_of_birth,subject,grade
0,6001,16141-WEST,Lucio,Passby,25/04/2007,English,6
1,6002,16141-WEST,Lucio,Passby,25/04/2007,Maths,3
2,6003,16141-WEST,Lucio,Passby,25/04/2007,Science,2
3,6004,15994-WEST,Gunner,Dunderdale,28/05/2007,English,2
4,6005,15994-WEST,Gunner,Dunderdale,28/05/2007,Maths,3


In [5]:
# Preview the first rows of the East table: IDs look like "EAST18141",
# dates are long-form ("Saturday, 25 November, 2006") and grades are letters.
east_df.head()

Unnamed: 0,unique_row_id,student_id,last_name,first_name,date_of_birth,subject,grade
0,1,EAST18141,RASSELL,KARLENS,"Saturday, 25 November, 2006",English,C
1,2,EAST18141,RASSELL,KARLENS,"Saturday, 25 November, 2006",Maths,B
2,3,EAST18141,RASSELL,KARLENS,"Saturday, 25 November, 2006",Science,B
3,4,EAST17994,HUDGHTON,ESMARIA,"Tuesday, 10 April, 2007",English,B
4,5,EAST17994,HUDGHTON,ESMARIA,"Tuesday, 10 April, 2007",Maths,B


In [6]:
# For the East table, fix the Student ID column so that it is just the 5-digit numeric code.
# East IDs carry the region as a prefix (e.g. "EAST18141"), so the code is the
# last five characters; the region is recorded in its own column.
east_df['student_id'] = east_df['student_id'].str.slice(-5)
east_df['region'] = 'EAST'

In [7]:
# For the West table, split the Student ID column to leave one column with the
# 5-digit numeric Student ID and one newly formed Region column.
# West IDs look like "16141-WEST": the part before the hyphen is the code,
# the part after it is the region.
west_id_parts = west_df['student_id'].str.split('-', expand=True)
west_df[['student_id', 'region']] = west_id_parts

In [8]:
# For both tables use the First & Last name columns to create a new “Full Name” column in all capital letters.
def full_name(fn,ln):
    """Return first name *fn* and last name *ln* joined by a space, in upper case.

    Scalar helper for a single pair of strings; it is not called in the
    visible cells of this notebook.
    """
    return ' '.join((fn, ln)).upper()

# Build the upper-case "first last" full name on both tables with vectorised
# string operations (the East names are already upper-case, the West ones are not).
for frame in (west_df, east_df):
    frame['full_name'] = (frame['first_name'] + ' ' + frame['last_name']).str.upper()

In [9]:
# Make sure both tables have the same date format.
# West dates look like "25/04/2007" and East dates like
# "Saturday, 25 November, 2006"; parsing both into datetime64 gives the two
# tables one common representation.
# The format strings are plain literals: they contain no placeholders, so an
# f-string prefix is not needed.
west_df['date_of_birth'] = pd.to_datetime(west_df['date_of_birth'], format='%d/%m/%Y')
east_df['date_of_birth'] = pd.to_datetime(east_df['date_of_birth'], format='%A, %d %B, %Y')

In [10]:
# Convert the values in the Grade column in the West table from numbers to letters using the following system:
# 1 = A, 2 = B, 3 = C, 4 = D, 5 = E, 6 = F
m = {1: 'A', 2: 'B', 3: 'C', 4: 'D', 5: 'E', 6: 'F'}

# Any value missing from the mapping becomes NaN after .map().
west_df['grade'] = west_df['grade'].map(m)

In [11]:
# Combine the 2 tables to form one table with 12,000 rows.
# The separate name parts and the per-table row id are not used by the later
# cells (full_name already carries the name), so drop them before stacking.
# NOTE: the drops are in place, so re-running this cell raises KeyError.
unused_columns = ['first_name', 'last_name', 'unique_row_id']
west_df.drop(columns=unused_columns, inplace=True)
east_df.drop(columns=unused_columns, inplace=True)

# ignore_index=True gives the stacked table a fresh 0..n-1 index. Without it
# both halves keep their own 0-based labels and every label appears twice,
# which makes label-based selection and alignment ambiguous.
cmb = pd.concat([east_df, west_df], ignore_index=True)

In [12]:
# Preview the combined table: one row per student and subject, with the East
# rows first.
cmb.head()

Unnamed: 0,student_id,date_of_birth,subject,grade,region,full_name
0,18141,2006-11-25,English,C,EAST,KARLENS RASSELL
1,18141,2006-11-25,Maths,B,EAST,KARLENS RASSELL
2,18141,2006-11-25,Science,B,EAST,KARLENS RASSELL
3,17994,2007-04-10,English,B,EAST,ESMARIA HUDGHTON
4,17994,2007-04-10,Maths,B,EAST,ESMARIA HUDGHTON


In [13]:
# Create a new “Grade Score” column and give each row a score according to the following system:
# A = 50, B = 40, C = 30, D = 20, E = 10, F = 0
m = {'A': 50, 'B': 40, 'C': 30, 'D': 20, 'E': 10, 'F': 0}

# Letter grades are looked up in the mapping; unknown grades would become NaN.
cmb['grade_score'] = cmb['grade'].map(m)

cmb.head()

Unnamed: 0,student_id,date_of_birth,subject,grade,region,full_name,grade_score
0,18141,2006-11-25,English,C,EAST,KARLENS RASSELL,30
1,18141,2006-11-25,Maths,B,EAST,KARLENS RASSELL,40
2,18141,2006-11-25,Science,B,EAST,KARLENS RASSELL,40
3,17994,2007-04-10,English,B,EAST,ESMARIA HUDGHTON,40
4,17994,2007-04-10,Maths,B,EAST,ESMARIA HUDGHTON,40


In [14]:
# Find a way to transform the table so that each student is represented by only ONE row with 3 individual columns for their English, Maths & Science grades as well as a column with their total score from all of these grades combined.

# Sum grade scores: one row per student_id with the total of grade_score.
# The column is selected explicitly so that any other numeric column that may
# be added to cmb later is not summed and carried into the merge by accident.
grades_df = cmb.groupby('student_id', as_index=False)['grade_score'].sum()


In [15]:
# Pivot subjects so that each student becomes a single row.
# 1: Remove grade_score from the long table - the per-student total comes
#    back in step 3 from grades_df.
cmb.drop(columns='grade_score', inplace=True)

# 2: Spread the subject values into columns holding the letter grade, then
#    turn the student attributes from index levels back into ordinary columns.
student_keys = ['student_id', 'date_of_birth', 'region', 'full_name']
grades_wide = cmb.pivot(index=student_keys, columns='subject', values='grade').reset_index()

# 3: Attach the summed score to each student row.
cmb = grades_wide.merge(grades_df, on='student_id')

In [16]:
# Spot-check one student: expect a single row with one column per subject and
# the summed grade_score. student_id holds strings, hence the quoted value.
cmb.loc[cmb.student_id=='15046']

Unnamed: 0,student_id,date_of_birth,region,full_name,English,Maths,Science,grade_score
46,15046,2006-12-10,WEST,JYOTI REDHOLE,B,E,B,90


In [18]:
# Load the School Lookup table (relative path: the CSV must be in the current
# working directory).
school_df=pd.read_csv('School Lookup.csv')

In [21]:
# Apply the same snake_case header normalisation used for the student tables,
# so the lookup's key column can be addressed as `student_id` in the merge.
normalised_headers = school_df.columns.str.lower().str.strip().str.replace(' ', '_')
school_df.columns = normalised_headers


In [24]:
# Inspect dtypes before joining: student_id is an object (string) column here,
# so the lookup's key has to be a string as well.
cmb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   student_id     4000 non-null   object        
 1   date_of_birth  4000 non-null   datetime64[ns]
 2   region         4000 non-null   object        
 3   full_name      4000 non-null   object        
 4   English        4000 non-null   object        
 5   Maths          4000 non-null   object        
 6   Science        4000 non-null   object        
 7   grade_score    4000 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 281.2+ KB


In [28]:
# Cast the lookup key to string so it has the same type as cmb.student_id;
# keys of different types would not match in the merge.
school_df['student_id'] = school_df['student_id'].astype(str)

In [30]:
# Attach each student's school. This is an inner join (the merge default) on
# student_id with the lookup on the left, so the lookup's columns come first
# and students without a lookup entry would be dropped.
cmb = pd.merge(school_df, cmb, on='student_id')

In [32]:
# Sanity check: the row count should still be 4000 after the join, i.e. no
# student was lost or duplicated by the merge.
cmb.shape


(4000, 10)