### Part 2: Processing Data and Testing Models
Now that the transcripts have been generated using `data_generation.ipynb`, we can process the transcripts into features and labels to feed into different models.

In [62]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split
import math

In [63]:
transcripts = pd.read_csv('transcripts.csv', index_col=0)
transcripts.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,DIRECTED STUDY,INTRODUCTION TO COMPUTER ENGINEERING,INTRODUCTION TO DISCRETE MATHEMATICS,MACHINE ORGANIZATION AND PROGRAMMING,DIGITAL SYSTEM FUNDAMENTALS,INTRODUCTION TO ARTIFICIAL INTELLIGENCE,PROBLEM SOLVING USING COMPUTERS,INTRODUCTION TO OPERATING SYSTEMS,INTRODUCTION TO ALGORITHMS,DATABASE MANAGEMENT SYSTEMS: DESIGN AND IMPLEMENTATION,...,MEDICAL IMAGE ANALYSIS,VIRTUAL REALITY,COMPUTER ANIMATION,INTRODUCTION TO COMPUTATIONAL GEOMETRY,FUNDAMENTALS OF HUMAN-COMPUTER INTERACTION,COMPUTATIONAL METHODS FOR MEDICAL IMAGE ANALYSIS,CONTEST-LEVEL PROGRAMMING,USER EXPERIENCE DESIGN 1,ADVANCED LINEAR PROGRAMMING,SENIOR THESIS
0,A,B,BC,,,,,,,B,...,,,,,,,,,,
1,,,B,,BC,,,,,,...,,,,,,A,,,,
2,,,,,A,,,,,,...,B,,,,,,,,,
3,,,,,B,,,,,,...,,,,,,,,,,
4,A,,AB,,BC,,,,BC,,...,,,,,,,,,,


Some helpful dictionaries to help us in converting data representations:

In [64]:
grade_to_label_dict = {'A': 1, 'AB': 2, 'B': 3, 'BC': 4, 'C': 5, 'D': 6, 'F': 7}
label_to_grade_dict = {value: key for key, value in grade_to_label_dict.items()}

In [65]:
course_to_index_dict = {column: i for i, column in enumerate(transcripts.columns)}
index_to_course_dict = {i: column for i, column in enumerate(transcripts.columns)}

### Approach 1

Take everything but one course from the transcripts as the features, and put that course as the label:

In [5]:
x = transcripts.drop(['PROGRAMMING I'], axis=1)
y = transcripts['PROGRAMMING I']

Let's check the number of columns for x:

In [6]:
pd.get_dummies(x).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Columns: 135 entries, INTRODUCTION TO COMPUTER NETWORKS to DIRECTED STUDY_A
dtypes: float64(2), uint8(133)
memory usage: 7.7 KB


Let's say that rows 51 through 150 have to be used for testing. The number of columns are:

In [7]:
pd.get_dummies(transcripts.loc[51:150].drop(['PROGRAMMING I'], axis=1)).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 2 columns):
INTRODUCTION TO COMPUTER NETWORKS    0 non-null float64
DIGITAL SYSTEM FUNDAMENTALS          0 non-null float64
dtypes: float64(2)
memory usage: 0.0 bytes


Since the number of columns change every time we cahnge a course or change the rows that are used to train or test the data, we cannot use this approach.

### Approach 2

We can preprocess the rows by dropping all courses that a student hasn't taken and converting the courses and grades to integer values:

In [8]:
def preprocess_data(row):
    row = row.dropna()
    items = []
    items.extend([course_to_index_dict[index] for index in row.index])
    items.extend([grade_to_label_dict[value] for value in row.values])
    return pd.Series(items)

transcripts_preprocessed = transcripts.apply(preprocess_data, axis=1)

Utility function to get the features for each course:

In [9]:
def get_subject_processed(row, subject):
    num_subjects = int(len(row) / 2)
    if subject in row[0:num_subjects].values:
        row = row.append(pd.Series(subject), ignore_index=True)
        index = row[row == subject].index[0]
        row = row.drop([index, index+num_subjects])
        return row.reset_index(drop=True)
    else:
        return None

def get_features_for_course(course, head=False):
    if head:
        df = transcripts_preprocessed.head().apply(get_subject_processed, args=(course,), axis=1) \
            .dropna().reset_index(drop=True)
    else:
        df = transcripts_preprocessed.apply(get_subject_processed, args=(course,), axis=1) \
            .dropna().reset_index(drop=True)
    if type(df) == pd.Series:
        return pd.DataFrame.from_dict(dict(df), orient='index')
    return df

In [10]:
get_features_for_course(8, head=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,7,19,21,23,29,34,35,3,5,1,3,2,2,1,8
1,20,21,23,25,33,34,37,1,1,2,1,6,2,3,8


Utility function to get the label for each course:

In [11]:
def get_labels_processed(row, subject):
    num_subjects = int(len(row) / 2)
    if subject in row[0:num_subjects].values:
        index = row[row == subject].index[0]
        return row[index+num_subjects]
    else:
        return None

def get_labels_for_course(course, head=False):
    if head:
            return transcripts_preprocessed.head().apply(get_labels_processed, args=(course,), axis=1).dropna().reset_index(drop=True)
    return transcripts_preprocessed.apply(get_labels_processed, args=(course,), axis=1).dropna().reset_index(drop=True)

In [12]:
get_labels_for_course(8, head=True)

0    2.0
1    1.0
dtype: float64

Utility function to train and test a model with approach 2:

In [13]:
def train_model(model):
    x_tests = []
    y_tests = []
    for i in range(len(course_to_index_dict)):
        x = get_features_for_course(i)
        y = get_labels_for_course(i)
        x_train, x_test, y_train, y_test = train_test_split(x, y)
        if len(x) > 0 and len(y_train.unique()) > 1:
            model.fit(x_train, y_train)
            x_tests.append(x_test)
            y_tests.append(y_test)
    
    x_tests = pd.concat(x_tests).reset_index(drop=True)
    y_tests = pd.concat(y_tests).reset_index(drop=True)
    accuracy = model.score(x_tests, y_tests)
    
    return (model, accuracy)

#### Logistic Regression Model

In [14]:
model = LogisticRegression(random_state=0, solver='sag', multi_class='multinomial', max_iter=10_000)
model, accuracy = train_model(model)
print(accuracy)

0.44180607471863065


#### Naive Bayes Model

In [15]:
model = GaussianNB()
model, accuracy = train_model(model)
print(accuracy)

0.5278101190840943


### Approach 3

Here we just replace all missing values with zeros and replace all grades with their integer representations:

In [66]:
transcripts_preprocessed = transcripts.fillna(0).replace(grade_to_label_dict).astype(int)

In [67]:
transcripts_preprocessed.head()

Unnamed: 0,DIRECTED STUDY,INTRODUCTION TO COMPUTER ENGINEERING,INTRODUCTION TO DISCRETE MATHEMATICS,MACHINE ORGANIZATION AND PROGRAMMING,DIGITAL SYSTEM FUNDAMENTALS,INTRODUCTION TO ARTIFICIAL INTELLIGENCE,PROBLEM SOLVING USING COMPUTERS,INTRODUCTION TO OPERATING SYSTEMS,INTRODUCTION TO ALGORITHMS,DATABASE MANAGEMENT SYSTEMS: DESIGN AND IMPLEMENTATION,...,MEDICAL IMAGE ANALYSIS,VIRTUAL REALITY,COMPUTER ANIMATION,INTRODUCTION TO COMPUTATIONAL GEOMETRY,FUNDAMENTALS OF HUMAN-COMPUTER INTERACTION,COMPUTATIONAL METHODS FOR MEDICAL IMAGE ANALYSIS,CONTEST-LEVEL PROGRAMMING,USER EXPERIENCE DESIGN 1,ADVANCED LINEAR PROGRAMMING,SENIOR THESIS
0,1,3,4,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
1,0,0,3,0,4,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
3,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,2,0,4,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0


Then we create copies of rows, one copy for each grade that exists in the row, and iteratively replace a grade with -1 and store the row as a feature and the grade as a label:

In [68]:
x = []
y = []
for index, grades in transcripts_preprocessed.iterrows():
    grade_indices = grades[grades != 0].index
    for course in grade_indices:
        new_grades = grades.copy()
        new_grades[course] = -1
        x.append(new_grades)
        y.append(grades[course])

Creating training and testing sets for approach 3:

In [69]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

#### Logistic Regression Model

In [70]:
model = LogisticRegression(solver='sag', multi_class='multinomial', max_iter=10_000)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.42134067063768027

In [71]:
def calculate_accuracy(model, x_test, y_test):
    accuracy = 0
    for x, y in zip(x_test, y_test):
        y_pred = model.predict(x.to_numpy().reshape(1, -1))
        distance = abs(y - y_pred)
        accuracy += 1/(math.pow(2, distance))
    return accuracy/len(x_test)

In [72]:
calculate_accuracy(model, x_test, y_test)

0.5954211768915227

#### Naive Bayes Model

In [73]:
model = GaussianNB()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.1988731996895756

In [74]:
calculate_accuracy(model, x_test, y_test)

0.40382591161975023