# Assignment 1

Import the necessary libraries

In [104]:
import numpy as np
import pandas as pd

Helper functions

In [105]:
# Column holding each raw comma-separated line read from the class file.
ANSWER_COL = 'Answer'
# Column holding the boolean result of validating that line.
IS_VALID_COL = 'IsValid'
# Column holding the graded score in the score DataFrame.
SCORE_COL = 'Score'

def is_file_name_with_txt_extension(file_name):
    """ Tell whether a file name already ends in .txt

    Args:
        file_name (str): name to inspect

    Returns:
        bool: True when the name ends with '.txt' (case-sensitive match),
            False otherwise
    """
    # str.endswith already yields the bool we want; no branching needed.
    return file_name.endswith('.txt')


def add_txt_extension(file_name):
    """ Return the file name, guaranteed to end in .txt

    Args:
        file_name (str): file name, with or without the .txt extension

    Returns:
        str: the name unchanged if it already ends in .txt, otherwise the
            name with '.txt' appended
    """
    already_has_extension = is_file_name_with_txt_extension(file_name)
    return file_name if already_has_extension else file_name + '.txt'
    
def is_digit(string):
    """ Check if string consists solely of the ASCII digits 0-9

    int() is deliberately not used for this check: it also accepts a leading
    sign, surrounding whitespace, underscores between digits and non-ASCII
    digits, so values such as '-1234567' or ' 1234567' would wrongly be
    reported as digits.

    Args:
        string (str): string to check

    Returns:
        bool: True if string is non-empty and every character is 0-9,
            False otherwise (including when the argument is not a str)
    """
    if not isinstance(string, str):
        return False
    # isascii() rules out Unicode digits (e.g. superscripts) that isdigit() allows.
    return string.isascii() and string.isdigit()


def is_row_valid(row):
    """ Validate one line of exam data, printing the reason when it is rejected

    A line is valid when it splits on commas into exactly 26 values and the
    first value is a student id: 9 characters long, starting with 'N', with
    the remaining characters accepted by is_digit.

    Args:
        row: record indexable by ANSWER_COL (e.g. a DataFrame row) whose
            ANSWER_COL entry is the raw line of text

    Returns:
        bool: True if row is valid, False otherwise
    """
    raw_line = row[ANSWER_COL]
    fields = raw_line.split(',')

    # Guard 1: the line must carry exactly 26 comma-separated values.
    if len(fields) != 26:
        print('Invalid line of data: does not contain exactly 26 values:')
        print(raw_line + '\n')
        return False

    # Guard 2: the first value must look like N followed by 8 digits.
    candidate_id = fields[0]
    id_is_well_formed = (
        len(candidate_id) == 9
        and candidate_id[0] == 'N'
        and is_digit(candidate_id[1:])
    )
    if not id_is_well_formed:
        print('Invalid line of data: N# is invalid')
        print(raw_line + '\n')
        return False

    return True

def grade_the_exams(row):
    """ Grade one student's exam against the answer key

    Each non-blank answer earns 4 points when it matches the key and loses
    1 point when it does not; blank answers are skipped and score nothing.

    Args:
        row: record indexable by ANSWER_COL (e.g. a DataFrame row) whose
            ANSWER_COL entry is 'student_id,answer1,answer2,...'

    Returns:
        tuple: (student id as str, total points as int)
    """
    answer_key = "B,A,D,D,C,B,D,A,C,C,D,B,A,B,A,C,B,D,A,C,A,A,B,D,D".split(',')
    # First field is the student id; everything after it is an answer.
    student_id, *responses = row[ANSWER_COL].split(',')

    score = 0
    for position, raw_response in enumerate(responses):
        choice = raw_response.strip()
        if not choice:
            # Skipped question: neither rewarded nor penalised.
            continue
        score += 4 if choice == answer_key[position] else -1
    return student_id, score


Open the file and load it into a pandas DataFrame

In [106]:
input_file_name = input('Enter a class file to grade (i.e. class1 for class1.txt): ')
valid_file_name = add_txt_extension(input_file_name)
try:
    # Each line must stay intact as a single value. Parsing the file with
    # pd.read_csv and a space separator would split any line that contains a
    # space into extra columns, so the lines are read verbatim instead.
    # Blank lines are skipped, as pd.read_csv does by default.
    with open(valid_file_name, encoding='utf-8') as class_file:
        raw_lines = [line for line in class_file.read().splitlines() if line.strip()]
    # One unnamed column (label 0), the same shape read_csv(header=None) gives
    # for a single-field file.
    df = pd.DataFrame({0: raw_lines})
    print('\n Successfully opened ' + valid_file_name + '\n')
except FileNotFoundError:
    # Nothing to grade without the file.
    print('\n File cannot be found. \n')
    exit()


 Successfully opened class1.txt



In [107]:
print('**** ANALYZING **** \n')
# Drop the validity column left by a previous run of this cell (if any) so the
# cell can be re-executed; without this the rename below fails on the second
# run because df already has two columns.
df = df.drop(columns=[IS_VALID_COL], errors='ignore')
# Rename the single raw-line column. This still raises if the loaded data has
# more than one column, which would mean lines were split while loading.
df.columns = [ANSWER_COL]
# Add a new column containing the boolean valid/invalid flag of each line
# (is_row_valid also prints the reason for every rejected line).
df[IS_VALID_COL] = df.apply(is_row_valid, axis=1)
# Keep only the valid lines in a new dataframe
df_valid = df[df[IS_VALID_COL]]

# Count the number of valid and invalid lines
valid_line = df_valid.shape[0]
invalid_line = df.shape[0] - valid_line

if invalid_line == 0:
    print('No errors found!\n')


**** ANALYZING **** 

No errors found!



Create a DataFrame of scores, using the student ID as the index

In [108]:
# Grade every valid line: grade_the_exams returns (student id, score), which
# result_type='expand' spreads into columns 0 and 1. The student id then
# becomes the index and column 1 is renamed to Score.
df_score = (
    df_valid
    .apply(grade_the_exams, axis=1, result_type='expand')
    .set_index(0)
    .rename(columns={1: SCORE_COL})
)
# Remove the name of the index column
df_score.index.name = None
df_score.head()

Unnamed: 0,Score
N00000001,59
N00000002,70
N00000003,84
N00000004,73
N00000005,83


Report

In [109]:
# Summary statistics over the graded scores
scores = df_score[SCORE_COL]
mean_score = scores.mean()
highest_score = scores.max()
lowest_score = scores.min()
range_score = highest_score - lowest_score
median_score = scores.median()

print('**** REPORT **** \n')
print(f'Total valid lines of data: {valid_line!s}')
print(f'Total invalid lines of data: {invalid_line!s}')
print(f'Mean (average) score: {mean_score!s}')
print(f'Highest score: {highest_score!s}')
print(f'Lowest score: {lowest_score!s}')
print(f'Range of scores: {range_score!s}')
print(f'Median score: {median_score!s}')


**** REPORT **** 

Total valid lines of data: 20
Total invalid lines of data: 0
Mean (average) score: 75.6
Highest score: 91
Lowest score: 59
Range of scores: 32
Median score: 73.0
