# Importing the required modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy

# Reading the dataset

In [2]:
curr = pd.read_csv("../data/curriculum.csv")

# Glimpse into the dataset

In [3]:
curr.head()

Unnamed: 0,Courses,Topic,Duration,Effort,Total Hours Lower Bound,Start Date,End Date Estimate Lower Bound,Total Hours Upper Bound,Start Date.1,End Date Estimate Upper Bound,Actual End Date,Prerequisites
0,Python for Everybody,Intro CS,10 weeks,10 hours/week,100,28-07-2022,01-09-2022,100,28-07-2022,01-09-2022,,-
1,Introduction to Computer Science and Programmi...,Intro CS,9 weeks,15 hours/week,135,01-09-2022,18-10-2022,135,01-09-2022,18-10-2022,,high school algebra
2,How to Code - Simple Data,Core Programming,7 weeks,8-10 hours/week,56,18-10-2022,06-11-2022,70,18-10-2022,11-11-2022,,-
3,How to Code - Complex Data,Core Programming,6 weeks,8-10 hours/week,48,06-11-2022,23-11-2022,60,11-11-2022,02-12-2022,,How to Code: Simple Data
4,"Programming Languages, Part A",Core Programming,5 weeks,4-8 hours/week,20,23-11-2022,30-11-2022,40,02-12-2022,16-12-2022,,How to Code (Hear instructor)


In [4]:
curr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Courses                        43 non-null     object 
 1   Topic                          43 non-null     object 
 2   Duration                       42 non-null     object 
 3   Effort                         42 non-null     object 
 4   Total Hours Lower Bound        43 non-null     int64  
 5   Start Date                     43 non-null     object 
 6   End Date Estimate Lower Bound  43 non-null     object 
 7   Total Hours Upper Bound        43 non-null     int64  
 8   Start Date.1                   43 non-null     object 
 9   End Date Estimate Upper Bound  43 non-null     object 
 10  Actual End Date                0 non-null      float64
 11  Prerequisites                  42 non-null     object 
dtypes: float64(1), int64(2), object(9)
memory usage: 4.2

# Alloting course codes to each course

In [55]:
def get_course_code(row):
    topic = row[1].title()
    topic = "".join(re.findall("[A-Z]",topic))
    courses_in_topic = list(curr[curr["Topic"] == row[1]]["Courses"])
    course_code = str(courses_in_topic.index(row[0]) + 1).rjust(3,"0")
    return topic + course_code

In [6]:
curr["Course Code"] = curr.apply(get_course_code,axis=1)

# Handling Course Pre-requisites

## Finding most relevant courses for prequisites using Cosine Similiarity

In [17]:
df = pd.DataFrame(columns=["Prerequisite"] + list(curr["Courses"]) + list(curr["Topic"]))

In [18]:
df

Unnamed: 0,Prerequisite,Python for Everybody,Introduction to Computer Science and Programming using Python,How to Code - Simple Data,How to Code - Complex Data,"Programming Languages, Part A","Programming Languages, Part B","Programming Languages, Part C",Object-Oriented Design,Design Patterns,...,Core Ethics,Core Ethics.1,Core Ethics.2,Advanced Programming,Advanced Programming.1,Advanced Programming.2,Advanced Programming.3,Advanced Programming.4,Advanced Programming.5,Final Project


In [19]:
nlp = spacy.load("en_core_web_sm")
for i in set(curr["Prerequisites"]): 
    if str(type(i)) != "<class 'float'>":
        for k in i.split(";"):
            prereq = nlp(i.strip())
            row = {"Prerequisite":k}
            if i in df["Prerequisite"]:
                continue
            for j in list(curr["Courses"]) + list(curr["Topic"]):
                course = nlp(j.strip())
                row[j] = prereq.similarity(course)
            df = df.append(row,ignore_index=True)

  row[j] = prereq.similarity(course)


In [20]:
#df = df.set_index("Prerequisite")

In [21]:
df

Unnamed: 0,Prerequisite,Python for Everybody,Introduction to Computer Science and Programming using Python,How to Code - Simple Data,How to Code - Complex Data,"Programming Languages, Part A","Programming Languages, Part B","Programming Languages, Part C",Object-Oriented Design,Design Patterns,...,Core Ethics,Core Ethics.1,Core Ethics.2,Advanced Programming,Advanced Programming.1,Advanced Programming.2,Advanced Programming.3,Advanced Programming.4,Advanced Programming.5,Final Project
0,Mathematics for Computer Science,0.664016,0.763913,0.518698,0.558219,0.546759,0.544172,0.546839,0.353599,0.553546,...,0.588003,0.588003,0.588003,0.530212,0.530212,0.530212,0.530212,0.530212,0.530212,0.555373
1,From Nand to Tetris Part I,0.658241,0.709123,0.451229,0.459345,0.448134,0.400455,0.40159,0.132073,0.395237,...,0.477251,0.477251,0.477251,0.407808,0.407808,0.407808,0.407808,0.407808,0.407808,0.508955
2,Object-Oriented Design,0.180903,0.282555,0.539156,0.562732,0.320771,0.366587,0.315625,1.0,0.428648,...,0.30517,0.30517,0.30517,0.575503,0.575503,0.575503,0.575503,0.575503,0.575503,0.405703
3,"Programming Languages, Part B",0.178415,0.457381,0.534867,0.572722,0.971706,1.0,0.959208,0.366587,0.609751,...,0.625631,0.625631,0.625631,0.543091,0.543091,0.543091,0.543091,0.543091,0.543091,0.523993
4,How to Code: Simple Data,0.347886,0.349117,0.747667,0.712122,0.480058,0.480928,0.519834,0.211499,0.517171,...,0.529959,0.529959,0.529959,0.410602,0.410602,0.410602,0.410602,0.410602,0.410602,0.510676
5,Python for Everybody,0.605964,0.507743,0.566041,0.581837,0.480978,0.4948,0.492586,0.743627,0.45292,...,0.352542,0.352542,0.352542,0.541039,0.541039,0.541039,0.541039,0.541039,0.541039,0.463887
6,Object-Oriented Design,0.605964,0.507743,0.566041,0.581837,0.480978,0.4948,0.492586,0.743627,0.45292,...,0.352542,0.352542,0.352542,0.541039,0.541039,0.541039,0.541039,0.541039,0.541039,0.463887
7,C++ for Everybody,0.621831,0.449858,0.333551,0.35269,0.488279,0.438433,0.499209,0.144365,0.438784,...,0.431127,0.431127,0.431127,0.345829,0.345829,0.345829,0.345829,0.345829,0.345829,0.462576
8,Java for Everybody,0.621831,0.449858,0.333551,0.35269,0.488279,0.438433,0.499209,0.144365,0.438784,...,0.431127,0.431127,0.431127,0.345829,0.345829,0.345829,0.345829,0.345829,0.345829,0.462576
9,Linear Algebra,0.621831,0.449858,0.333551,0.35269,0.488279,0.438433,0.499209,0.144365,0.438784,...,0.431127,0.431127,0.431127,0.345829,0.345829,0.345829,0.345829,0.345829,0.345829,0.462576


In [22]:
def get_most_relevant_course(row):
    best_similarity = max(row[1:])
    cols = list(df.columns)
    i = list(row[1:]).index(best_similarity)
    #print(row[0],"\t",cols[i + 1],"\t",best_similarity)
    #print(row.index,"\n")
    if best_similarity > 0.74:
        return {"Pre":row[0],"Rel":cols[i + 1],"simi":best_similarity,"incorrect":None}
    else:
        return {"Pre":row[0],"Rel":None,"simi":best_similarity,"incorrect":cols[i+1]}

In [23]:
d = pd.DataFrame(columns=["Pre","Rel","simi","incorrect"])

In [24]:
for i in df.apply(get_most_relevant_course,axis=1):
    d = d.append(i, ignore_index=True)

In [25]:
d

Unnamed: 0,Pre,Rel,simi,incorrect
0,Mathematics for Computer Science,Mathematics for Computer Science,1.0,
1,From Nand to Tetris Part I,Build a Modern Computer from First Principles:...,0.829135,
2,Object-Oriented Design,Object-Oriented Design,1.0,
3,"Programming Languages, Part B","Programming Languages, Part B",1.0,
4,How to Code: Simple Data,How to Code - Simple Data,0.747667,
5,Python for Everybody,Object-Oriented Design,0.743627,
6,Object-Oriented Design,Object-Oriented Design,0.743627,
7,C++ for Everybody,,0.626999,Software Engineering: Introduction
8,Java for Everybody,,0.626999,Software Engineering: Introduction
9,Linear Algebra,,0.626999,Software Engineering: Introduction


## Adding Prerequisites to the catalog

In [37]:
def add_prereq(row):
    if row[1] == None and row[0] != "-":
        return {"Courses" : row[0].title(),"Topic":"Prerequisites"}
    return None

In [41]:
for i in d.apply(add_prereq,axis=1):
    if i != None:
        curr = curr.append(i,ignore_index=True)

In [43]:
curr = curr[["Courses","Topic","Duration","Effort"]]

In [44]:
curr.head()

Unnamed: 0,Courses,Topic,Duration,Effort
0,Python for Everybody,Intro CS,10 weeks,10 hours/week
1,Introduction to Computer Science and Programmi...,Intro CS,9 weeks,15 hours/week
2,How to Code - Simple Data,Core Programming,7 weeks,8-10 hours/week
3,How to Code - Complex Data,Core Programming,6 weeks,8-10 hours/week
4,"Programming Languages, Part A",Core Programming,5 weeks,4-8 hours/week


In [45]:
curr.to_csv("../data/curr.csv")

In [46]:
curr = pd.read_csv("../data/curr.csv")

In [52]:
curr = curr.drop(["Unnamed: 0"],axis=1)

In [56]:
curr["Course Code"] = curr.apply(get_course_code,axis=1)

In [57]:
curr

Unnamed: 0,Courses,Topic,Duration,Effort,Course Code
0,Python for Everybody,Intro CS,10 weeks,10 hours/week,IC001
1,Introduction to Computer Science and Programmi...,Intro CS,9 weeks,15 hours/week,IC002
2,How to Code - Simple Data,Core Programming,7 weeks,8-10 hours/week,CP001
3,How to Code - Complex Data,Core Programming,6 weeks,8-10 hours/week,CP002
4,"Programming Languages, Part A",Core Programming,5 weeks,4-8 hours/week,CP003
5,"Programming Languages, Part B",Core Programming,3 weeks,4-8 hours/week,CP004
6,"Programming Languages, Part C",Core Programming,3 weeks,4-8 hours/week,CP005
7,Object-Oriented Design,Core Programming,4 weeks,4 hours/week,CP006
8,Design Patterns,Core Programming,4 weeks,4 hours/week,CP007
9,Software Architecture,Core Programming,4 weeks,2-5 hours/week,CP008
