# Importing the required modules

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy

# Reading the dataset

In [4]:
curr = pd.read_csv("../data/curriculum.csv")

# Glimpse into the dataset

In [5]:
curr.head()

Unnamed: 0,Courses,Topic,Duration,Effort,Total Hours Lower Bound,Start Date,End Date Estimate Lower Bound,Total Hours Upper Bound,Start Date.1,End Date Estimate Upper Bound,Actual End Date,Prerequisites
0,Python for Everybody,Intro CS,10 weeks,10 hours/week,100,28-07-2022,01-09-2022,100,28-07-2022,01-09-2022,,-
1,Introduction to Computer Science and Programmi...,Intro CS,9 weeks,15 hours/week,135,01-09-2022,18-10-2022,135,01-09-2022,18-10-2022,,high school algebra
2,How to Code - Simple Data,Core Programming,7 weeks,8-10 hours/week,56,18-10-2022,06-11-2022,70,18-10-2022,11-11-2022,,-
3,How to Code - Complex Data,Core Programming,6 weeks,8-10 hours/week,48,06-11-2022,23-11-2022,60,11-11-2022,02-12-2022,,How to Code: Simple Data
4,"Programming Languages, Part A",Core Programming,5 weeks,4-8 hours/week,20,23-11-2022,30-11-2022,40,02-12-2022,16-12-2022,,How to Code (Hear instructor)


In [6]:
curr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Courses                        43 non-null     object 
 1   Topic                          43 non-null     object 
 2   Duration                       42 non-null     object 
 3   Effort                         42 non-null     object 
 4   Total Hours Lower Bound        43 non-null     int64  
 5   Start Date                     43 non-null     object 
 6   End Date Estimate Lower Bound  43 non-null     object 
 7   Total Hours Upper Bound        43 non-null     int64  
 8   Start Date.1                   43 non-null     object 
 9   End Date Estimate Upper Bound  43 non-null     object 
 10  Actual End Date                0 non-null      float64
 11  Prerequisites                  42 non-null     object 
dtypes: float64(1), int64(2), object(9)
memory usage: 4.2

# Alloting course codes to each course

In [7]:
def get_course_code(row):
    topic = row[1].title()
    topic = "".join(re.findall("[A-Z]",topic))
    courses_in_topic = list(curr[curr["Topic"] == row[1]]["Courses"])
    course_code = str(courses_in_topic.index(row[0]) + 1).rjust(3,"0")
    return topic + course_code

In [8]:
curr["Course Code"] = curr.apply(get_course_code,axis=1)

In [24]:
df.head()

Unnamed: 0_level_0,Python for Everybody,Introduction to Computer Science and Programming using Python,How to Code - Simple Data,How to Code - Complex Data,"Programming Languages, Part A","Programming Languages, Part B","Programming Languages, Part C",Object-Oriented Design,Design Patterns,Software Architecture,...,Core Ethics,Core Ethics.1,Core Ethics.2,Advanced Programming,Advanced Programming.1,Advanced Programming.2,Advanced Programming.3,Advanced Programming.4,Advanced Programming.5,Final Project.1
Prerequisite,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
any programming language,0.448595,0.595221,0.441982,0.499089,0.660203,0.65925,0.64446,0.406581,0.524441,0.612727,...,0.51937,0.51937,0.51937,0.552126,0.552126,0.552126,0.552126,0.552126,0.552126,0.597492
Mathematics for Computer Science,0.448595,0.595221,0.441982,0.499089,0.660203,0.65925,0.64446,0.406581,0.524441,0.612727,...,0.51937,0.51937,0.51937,0.552126,0.552126,0.552126,0.552126,0.552126,0.552126,0.597492
C++,0.281555,0.212375,0.319161,0.284144,0.507478,0.478266,0.528526,0.074031,0.293981,0.356474,...,0.368729,0.368729,0.368729,0.189097,0.189097,0.189097,0.189097,0.189097,0.189097,0.341752
Java,0.281555,0.212375,0.319161,0.284144,0.507478,0.478266,0.528526,0.074031,0.293981,0.356474,...,0.368729,0.368729,0.368729,0.189097,0.189097,0.189097,0.189097,0.189097,0.189097,0.341752
linear algebra,0.281555,0.212375,0.319161,0.284144,0.507478,0.478266,0.528526,0.074031,0.293981,0.356474,...,0.368729,0.368729,0.368729,0.189097,0.189097,0.189097,0.189097,0.189097,0.189097,0.341752


# Handling Course Pre-requisites

In [10]:
df = pd.DataFrame(columns=["Prerequisite"] + list(curr["Courses"]) + list(curr["Topic"]))

In [11]:
df

Unnamed: 0,Prerequisite,Python for Everybody,Introduction to Computer Science and Programming using Python,How to Code - Simple Data,How to Code - Complex Data,"Programming Languages, Part A","Programming Languages, Part B","Programming Languages, Part C",Object-Oriented Design,Design Patterns,...,Core Ethics,Core Ethics.1,Core Ethics.2,Advanced Programming,Advanced Programming.1,Advanced Programming.2,Advanced Programming.3,Advanced Programming.4,Advanced Programming.5,Final Project


In [12]:
nlp = spacy.load("en_core_web_sm")
for i in set(curr["Prerequisites"]): 
    if str(type(i)) != "<class 'float'>":
        for k in i.split(";"):
            prereq = nlp(i)
            row = {"Prerequisite":k}
            if i in df["Prerequisite"]:
                continue
            for j in list(curr["Courses"]) + list(curr["Topic"]):
                course = nlp(j)
                row[j] = prereq.similarity(course)
            df = df.append(row,ignore_index=True)

  row[j] = prereq.similarity(course)


In [22]:
df = df.set_index("Prerequisite")

In [23]:
df

Unnamed: 0_level_0,Python for Everybody,Introduction to Computer Science and Programming using Python,How to Code - Simple Data,How to Code - Complex Data,"Programming Languages, Part A","Programming Languages, Part B","Programming Languages, Part C",Object-Oriented Design,Design Patterns,Software Architecture,...,Core Ethics,Core Ethics.1,Core Ethics.2,Advanced Programming,Advanced Programming.1,Advanced Programming.2,Advanced Programming.3,Advanced Programming.4,Advanced Programming.5,Final Project.1
Prerequisite,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
any programming language,0.448595,0.595221,0.441982,0.499089,0.660203,0.65925,0.64446,0.406581,0.524441,0.612727,...,0.51937,0.51937,0.51937,0.552126,0.552126,0.552126,0.552126,0.552126,0.552126,0.597492
Mathematics for Computer Science,0.448595,0.595221,0.441982,0.499089,0.660203,0.65925,0.64446,0.406581,0.524441,0.612727,...,0.51937,0.51937,0.51937,0.552126,0.552126,0.552126,0.552126,0.552126,0.552126,0.597492
C++,0.281555,0.212375,0.319161,0.284144,0.507478,0.478266,0.528526,0.074031,0.293981,0.356474,...,0.368729,0.368729,0.368729,0.189097,0.189097,0.189097,0.189097,0.189097,0.189097,0.341752
Java,0.281555,0.212375,0.319161,0.284144,0.507478,0.478266,0.528526,0.074031,0.293981,0.356474,...,0.368729,0.368729,0.368729,0.189097,0.189097,0.189097,0.189097,0.189097,0.189097,0.341752
linear algebra,0.281555,0.212375,0.319161,0.284144,0.507478,0.478266,0.528526,0.074031,0.293981,0.356474,...,0.368729,0.368729,0.368729,0.189097,0.189097,0.189097,0.189097,0.189097,0.189097,0.341752
Core Programming,0.257639,0.266951,0.386797,0.395833,0.62101,0.61771,0.620246,0.401957,0.533186,0.548459,...,0.479676,0.479676,0.479676,0.596768,0.596768,0.596768,0.596768,0.596768,0.596768,0.604241
a sizable project,0.257639,0.266951,0.386797,0.395833,0.62101,0.61771,0.620246,0.401957,0.533186,0.548459,...,0.479676,0.479676,0.479676,0.596768,0.596768,0.596768,0.596768,0.596768,0.596768,0.604241
"Programming Languages, Part B",0.178415,0.457381,0.534867,0.572722,0.971706,1.0,0.959208,0.366587,0.609751,0.622449,...,0.625631,0.625631,0.625631,0.543091,0.543091,0.543091,0.543091,0.543091,0.543091,0.523993
-,-0.016595,-0.165618,0.104927,0.105341,0.076003,0.024623,0.073966,0.159847,-0.026023,-0.014116,...,0.004199,0.004199,0.004199,-0.102842,-0.102842,-0.102842,-0.102842,-0.102842,-0.102842,-0.053804
Python,0.347913,0.317743,0.32537,0.327442,0.481084,0.484616,0.528703,0.27988,0.434236,0.527649,...,0.422428,0.422428,0.422428,0.433355,0.433355,0.433355,0.433355,0.433355,0.433355,0.501069


In [14]:
def get_most_relevant_course(row):
    best_similarity = max(row[1:])
    cols = list(df.columns)
    i = list(row[1:]).index(best_similarity)
    #print(row[0],"\t",cols[i + 1],"\t",best_similarity)
    if best_similarity > 0.7:
        return {"Pre":row[0],"Rel":cols[i + 1],"simi":best_similarity,"incorrect":None}
    else:
        return {"Pre":row[0],"Rel":None,"simi":best_similarity,"incorrect":cols[i+1]}

In [15]:
d = pd.DataFrame(columns=["Pre","Rel","simi","incorrect"])

In [16]:
for i in df.apply(get_most_relevant_course,axis=1):
    d = d.append(i, ignore_index=True)

In [17]:
d

Unnamed: 0,Pre,Rel,simi,incorrect
0,any programming language,Mathematics for Computer Science,0.747416,
1,Mathematics for Computer Science,Mathematics for Computer Science,0.747416,
2,C++,,0.645275,Databases: Relational Databases and SQL
3,Java,,0.645275,Databases: Relational Databases and SQL
4,linear algebra,,0.645275,Databases: Relational Databases and SQL
5,Core Programming,,0.666442,Computer Networking: a Top-Down Approach
6,a sizable project,,0.666442,Computer Networking: a Top-Down Approach
7,"Programming Languages, Part B","Programming Languages, Part B",1.0,
8,-,,0.228649,Computer Networking: a Top-Down Approach
9,Python,,0.67351,Software Engineering: Introduction
