In [None]:
import pandas as pd
import matplotlib
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# Plan

Place grades for all questions on the same scale (by fitting a Gaussian distribution to questions with a large range of scores).


In [None]:
# Load the semicolon-separated CSV into the working DataFrame `df`.
csv_path = 'dfea.csv'
df = pd.read_csv(csv_path, sep=';')

In [None]:
# Select the rows of one censor for one question.
# Both conditions are combined into ONE boolean mask: chaining `df[a][b]` applies a
# mask built on the full frame to the already-filtered frame, which pandas has to
# reindex (emitting a UserWarning) before it can use it.
question = df[
    (df.author_censor_id == "3B9EF7F8-A6DD-46DD-91C0-BAF4182BE288")
    & (df.question_id == 41702)
]

In [None]:
# Peek at the first few score values of the selected rows.
question.score_value.head()

In [None]:
# Keep only the score column of the selected rows.
grades = question["score_value"]

In [None]:
# Histogram of the selected scores with pandas' default binning.
grades.hist()

Select one censor and plot a histogram for each question. There are multiple scores for each question_id. This is just to understand the data.

In [None]:
# Restrict the data to a single censor, then draw one score histogram per
# question that this censor graded (the question id is printed above each plot).
censor = df[df.author_censor_id == "D164F89A-486D-49FE-96BF-FAC7DE74E4C7"]
question_ids = censor.question_id.unique()
for qid in question_ids:
    per_question = censor[censor.question_id == qid]
    per_question.score_value.hist()
    print(qid)
    plt.show()

Use RobustScaler for one question and plot several histograms: first the raw data and then the scaled data.

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
matplotlib.rcParams['figure.figsize'] = [12, 4]

# For one question, go through every censor one at a time and draw three
# histograms: raw scores, log-transformed scores, and robust-scaled log scores.
question = df[df.question_id == 41701]
censor_ids = question.author_censor_id.unique()
for cid in censor_ids:
    print("=====================")
    print(cid)  # censor_id

    raw_scores = question[question.author_censor_id == cid].score_value

    # 1) raw scores
    raw_scores.hist(bins=100)
    plt.show()

    # 2) scores after the -log(101 - x) transform
    grades = raw_scores.apply(lambda x: -np.log(101-x))
    grades.hist(bins=100)
    plt.show()

    # 3) transformed scores after RobustScaler, viewed on a fixed [-5, 5] window
    grades_scaled = RobustScaler().fit_transform(pd.DataFrame(grades))
    scaled_series = pd.Series(grades_scaled[:,0])
    scaled_series.hist(bins=100)
    plt.xlim([-5,5])
    plt.show()

Take just one censor's one question and scale the scores.

You need {studentid: value} for each combination of (qid, censorid)

In [None]:
qid = 41701 # question id
censorid = "D164F89A-486D-49FE-96BF-FAC7DE74E4C7" # censor id

# Take just one question. Uses `qid` instead of repeating the literal, so that
# changing the id above actually changes the selection.
question = df[df.question_id == qid]

# Scores of one censor, passed through the -log(101 - x) transform.
censor_scores = question[question.author_censor_id == censorid].score_value
grades = -np.log(101 - censor_scores)

# Fit the scaler on this single column.
rsc = RobustScaler().fit(pd.DataFrame(grades))

# Apply the fitted centre and scale by hand so the result stays a Series that keeps
# the original row index. With RobustScaler's default settings center_ is the median
# and scale_ the interquartile range, so this centres on the median, not the mean.
grades_scaled = (grades - rsc.center_[0]) / rsc.scale_[0]

grades_scaled


# Build a system to solve

In [None]:
from collections import defaultdict

In [None]:
# Dict from each row label (index value) of df to the student_uuid stored in that row.
index_to_student = df.student_uuid.to_dict()

In [None]:
show_plots = True
rsc = RobustScaler()
students = defaultdict(list)  # student_uuid -> list of centred scores; missing keys start as []
last_cid = -1
matplotlib.rcParams["figure.figsize"] = [10, 3]

# Walk through every (censor, question) pair; `part` is the sub-frame of one pair.
# The keys are given as a LIST: pandas interprets a tuple passed as `by` as one
# single key rather than as a list of keys.
for (cid, qid), part in df.groupby(["author_censor_id", "question_id"]):

    # show progress: print each censor id once, when it changes
    if last_cid != cid:
        last_cid = cid
        print(cid)

    scores = part.score_value
    max_score = scores.max()
    scale = max_score  # only needed by the alternative weighting described below

    if max_score > 10:  # it's a large question with 0-100 scores
        scores = -np.log(101 - scores)
        scale = max_score / 3  # max scale is 3 sigma, meaning 0.3% best students get 100 points

    # Estimate the robust centre (and spread) of this group's scores.
    rsc.fit(scores.to_frame())

    # Open question from the authors: multiplying by max_score is a guess and it is
    # not clear how it should be used. The alternative would be
    #     scores = (scores - rsc.center_[0]) / rsc.scale_[0] * scale   # max_score as importance weight
    # For now only the per-group bias (the centre) is removed.
    scores = scores - rsc.center_[0]

    # add new scores to student records (row label -> student via index_to_student)
    for i, v in scores.items():
        students[index_to_student[i]].append(v)

    if show_plots:
        scores.hist(bins=50)
        plt.title("{}:  {}".format(qid, cid))
        print(qid, cid)
        #plt.xlim([-4,4])
        plt.show()



In [None]:
# Collapse each student's list of scores in `students` into a single mean value.
student_scores = {
    student: np.mean(values)
    for student, values in students.items()
}

In [None]:
# Distribution of the per-student values in student_scores, in 100 bins.
pd.Series(student_scores).hist(bins=100)

# Build a graph and a linear system

In [None]:
# Number the distinct students 0..N-1 in order of first appearance.
# NOTE: this rebinds `students`, which earlier in the notebook held per-student score lists.
ids = df.student_uuid.unique()
students = {uuid: number for number, uuid in enumerate(ids)}

# Student number for every row of df, aligned with df's index.
index_to_student_number = df.student_uuid.map(students)

In [None]:
import itertools

# Each entry is (student_a, student_b, score_b - score_a, max_score) for two students
# graded by the same censor on the same question.
equations = []
last_cid = -1
sample_rate = 0.01  # keep roughly 1% of all pairs; the draw is unseeded, so runs differ

# The keys are given as a LIST: pandas interprets a tuple passed as `by` as one
# single key rather than as a list of keys.
for (cid, qid), part in df.groupby(["author_censor_id", "question_id"]):

    # show progress: print each censor id once, when it changes
    if last_cid != cid:
        last_cid = cid
        print(cid)

    scores = part.score_value
    max_score = scores.max()
    # Attach each row's student number (column "student_uuid"), joined on the row index.
    scores = scores.to_frame().join(index_to_student_number.to_frame())

    # itertuples keeps every column's own dtype; iterrows builds a Series per row,
    # which upcasts the student number to the score's dtype, and is much slower.
    rows = scores.itertuples(index=False)
    for first, second in itertools.combinations(rows, 2):
        if np.random.rand() < sample_rate:
            val = second.score_value - first.score_value
            equations.append((first.student_uuid, second.student_uuid, val, max_score))


In [None]:
# Preview up to the first 20 sampled equations. Slicing instead of indexing
# 0..19 avoids an IndexError when fewer than 20 equations were sampled.
for equation in equations[:20]:
    print(equation)

In [None]:
# Display `scores` as left behind by the equation-building loop (its last group).
scores

In [None]:
# Size of the random student subsample; also used below as the number of
# columns of the design matrix X.
k = 4000  # number of students to consider

In [None]:
# Student numbers run from 0 to max() inclusive, so the pool needs max() + 1 entries;
# np.arange(max()) would leave the highest-numbered student out of every sample.
idx = np.arange(int(index_to_student_number.max()) + 1)
np.random.shuffle(idx)

# The first k shuffled numbers form the random subsample.
random_students = set(idx[:k])

# Give each sampled student a column position 0..len-1 for the design matrix.
random_student_index = {student: column for column, student in enumerate(random_students)}

In [None]:
# Keep only the equations whose two students are both in the random subsample.
random_equations = [
    equation
    for equation in equations
    if equation[0] in random_students and equation[1] in random_students
]

In [None]:
# Build the linear system X @ s = Y with one row per sampled equation:
#   w * s[B] - w * s[A] = gradeB - gradeA
n = len(random_equations)
print(n)
X = np.zeros((n, k))
Y = np.zeros(n)

for i, equation in enumerate(random_equations):
    s1, s2, val, w = equation
    column_b = random_student_index[s2]
    column_a = random_student_index[s1]
    X[i, column_b] = w    # student B
    X[i, column_a] = -w   # - student A
    Y[i] = val            # = gradeB - gradeA

In [None]:
# Least-squares solution of X @ scores = Y; only the solution vector (the first
# element of lstsq's result) is kept. The cell displays its range.
lstsq_result = np.linalg.lstsq(X, Y, rcond=0.0001)
scores = lstsq_result[0]
(scores.min(), scores.max())

In [None]:
# Display the solution vector computed by lstsq in the previous cell.
scores

In [None]:
# Distribution of the values in `scores`, in 100 bins.
pd.Series(scores).hist(bins=100)

In [None]:
# Reverse lookups: matrix column -> student number, and student number -> uuid.
column_to_number = {column: number for number, column in random_student_index.items()}
number_to_uuid = {number: uuid for uuid, number in students.items()}

# One row per matrix column, with the solved score scaled by 100 and truncated to int.
data = pd.DataFrame({"idx": range(k), "scores": [int(value * 100) for value in scores]})
data.idx = data.idx.map(column_to_number)
data["uuid"] = data.idx.map(number_to_uuid)
data = data.drop(["idx"], axis=1)
data.head(100)

In [None]:
# Re-solve the system for 13 log-spaced values, each passed as lstsq's `rcond`,
# and plot the distribution of the solution for every value.
# `scores` is overwritten on every iteration.
alpha_values = np.logspace(-7, 5, base=10, num=13)
for alpha in alpha_values:
    lstsq_result = np.linalg.lstsq(X, Y, rcond=alpha)
    scores = lstsq_result[0]
    pd.Series(scores).hist(bins=100)
    plt.title("alpha = {}".format(alpha))
    plt.show()

# todo
- true grades of students
- predict grades of students
- sort students by scores 
- compare bins
- compare with school
