In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import numpy as np

In [31]:
# import the datasets
question = pd.read_csv(r'C:\\Users\\kfps86\\Downloads\\dataset\\question_metadata.csv')
answers = pd.read_csv(r'C:\\Users\\kfps86\\Downloads\\dataset\\answers_metadata.csv')
student = pd.read_csv(r'C:\\Users\\kfps86\\Downloads\\dataset\\student_metadata.csv')
subject = pd.read_csv(r'C:\\Users\\kfps86\\Downloads\\dataset\\subject_metadata.csv')
training = pd.read_csv(r'C:\\Users\\kfps86\\Downloads\\dataset\\training.csv')

In [32]:
# create a test_train split
from sklearn.model_selection import train_test_split
# np.random.seed(42)
train_set, test_set = train_test_split(training, test_size=0.2, random_state=42)

In [33]:
answers = answers.dropna(subset=['AnswerId']) 
# 7 values in AnswerId are na (out of 19834820), hence we are droppping those 7 values
answers['AnswerId'] = answers['AnswerId'].astype(int)

In [34]:
# merge the datasets
train_set = train_set.merge(answers , how='inner', on='AnswerId')
train_set = train_set.merge(student, how='inner', on='UserId')
train_set = train_set.merge(question, how='inner', on='QuestionId')

In [35]:
# drop nans
train_set.dropna(inplace=True)

In [36]:
train_set.head()

Unnamed: 0,QuestionId,UserId,AnswerId,IsCorrect,CorrectAnswer,AnswerValue,DateAnswered,Confidence,Gender,DateOfBirth,PremiumPupil,SubjectId
6,27185,40527,10797164,1,1,1,2019-03-18 22:03:00.000,100.0,1,2004-09-01 00:00:00.000,0.0,"[3, 49, 61, 171]"
20,27185,102085,11623933,1,1,1,2019-11-21 20:08:00.000,50.0,1,2005-09-01 00:00:00.000,0.0,"[3, 49, 61, 171]"
24,27185,86279,11606301,0,1,2,2019-02-09 12:45:00.000,50.0,2,2004-07-01 00:00:00.000,1.0,"[3, 49, 61, 171]"
26,27185,55310,15582540,1,1,1,2019-04-22 18:45:00.000,100.0,2,2003-09-01 00:00:00.000,0.0,"[3, 49, 61, 171]"
33,27185,34815,7797864,1,1,1,2019-11-20 18:28:00.000,100.0,2,2006-02-01 00:00:00.000,0.0,"[3, 49, 61, 171]"


In [37]:
len(train_set)

537824

In [40]:
# Data Cleaning
# need to change DateAnswered and DateOfBirth columns to datetime format
train_set['DateAnswered'] = pd.to_datetime(train_set['DateAnswered'], format='%Y-%m-%d %H:%M:%S.%f')
train_set['DateOfBirth'] = pd.to_datetime(train_set['DateOfBirth'], format='%Y-%m-%d %H:%M:%S.%f')

# change SubjectId to list format
train_set['SubjectId'] = train_set['SubjectId'].str.strip('[]').str.split(',')

# https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list
from sklearn.preprocessing import MultiLabelBinarizer
# create a one hot encoding column for each category
# uses up a lot of RAM though
mlb = MultiLabelBinarizer(sparse_output=True)

train_set = train_set.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(train_set.pop('SubjectId')),
                index=train_set.index,
                columns=mlb.classes_))

In [158]:
# based on: https://github.com/ageron/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb
from sklearn.base import BaseEstimator, TransformerMixin
# # column index
# rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

# col_names = "total_rooms", "total_bedrooms", "population", "households"
# rooms_ix, bedrooms_ix, population_ix, households_ix = [
#     housing.columns.get_loc(c) for c in col_names] # get the column indices

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
#     def __init__(self): # no *args or **kargs
    def fit(self, train_set, y=None):
        return self  # nothing else to do
    def transform(self, train_set):
        
        # 'total_answered',
        train_set['total_answered'] = train_set.groupby(['UserId'])['IsCorrect'].transform('count')

        # 'prop_correct',
        train_set['total_correct'] = train_set.groupby(['UserId'])['IsCorrect'].transform('sum')
        train_set['prop_correct'] = train_set['total_correct'] / train_set['total_answered']
#         train_set.drop('total_correct', inplace=True)

        # CMA
        train_set.sort_values(['UserId', 'DateAnswered'], inplace=True)
        CMA = train_set.groupby(['UserId']).IsCorrect.expanding().mean()
        train_set['CMA'] = CMA.reset_index(level=0, drop=True)

        # 'total_q_answered',
        train_set['total_q_answered'] = train_set.groupby(['QuestionId'])['QuestionId'].transform('count')

        # lvl2 - needs SubjectId first
        train_set['lvl2'] = 0
        for i in [' 101', ' 1156', ' 119', ' 149', ' 151', ' 32', ' 49', ' 692', ' 71']:
            if i in train_set.columns.tolist():
                i_int = (int(i[1:]))
                train_set['lvl2'] = train_set['lvl2'] + (train_set[i] * i_int)

        # CMA_correct_subject - need lvl2 first
        CMA_correct_subject = train_set.groupby(['UserId', 'lvl2']).IsCorrect.expanding().mean()
        train_set['CMA_correct_subject'] = CMA_correct_subject.reset_index(level=[0,1], drop=True)

        # 'holiday',
        train_set['holiday'] = 1
        train_set.loc[((train_set['DateAnswered'] < '2018-10-20') & (train_set['DateAnswered'] > '2018-09-03')) |
              ((train_set['DateAnswered'] > '2018-10-28') & (train_set['DateAnswered'] < '2018-12-20')) |
              ((train_set['DateAnswered'] > '2019-01-02') & (train_set['DateAnswered'] < '2019-02-16')) |
              ((train_set['DateAnswered'] > '2019-02-24') & (train_set['DateAnswered'] < '2019-04-06')) |
              ((train_set['DateAnswered'] > '2019-04-22') & (train_set['DateAnswered'] < '2019-05-25')) |
              ((train_set['DateAnswered'] > '2019-06-02') & (train_set['DateAnswered'] < '2019-07-25')) |

              ((train_set['DateAnswered'] > '2019-09-01') & (train_set['DateAnswered'] < '2019-10-19')) |
              ((train_set['DateAnswered'] > '2019-10-27') & (train_set['DateAnswered'] < '2019-12-20')) |
              ((train_set['DateAnswered'] > '2020-01-05') & (train_set['DateAnswered'] < '2020-02-15')) |
              ((train_set['DateAnswered'] > '2020-02-23') & (train_set['DateAnswered'] < '2020-04-03')) |
              ((train_set['DateAnswered'] > '2020-04-19') & (train_set['DateAnswered'] < '2020-05-23')) |
              ((train_set['DateAnswered'] > '2020-05-31') & (train_set['DateAnswered'] < '2020-07-23')) 
              ,'holiday'] = 0

        train_set['help'] = 0
        print('1')
        # 'unique_day',
        unique_student_train = pd.DataFrame(data=train_set['UserId'].unique(), columns=['UserId'])
        print('2')
        unique_student_train['unique_day'] = 0
        print('3')
        for i in range(len(unique_student_train)):
                unique_student_train.iloc[i, 1] =  len(train_set.loc[train_set['UserId']==unique_student_train.iloc[i, 0]]['DateAnswered'].dt.normalize().unique())
        print('4')
        train_set = train_set.merge(unique_student_train, how='inner', on='UserId')
        print('5')
        del unique_student_train
        import gc
        gc.collect()

        # 'yr2',
        train_set['yr2'] = 1
        train_set.loc[(train_set['DateAnswered'] < '2019-09-01'), 'yr2'] = 0

        # 'age',
        train_set['age'] = train_set['DateAnswered'] - train_set['DateOfBirth'] 

        # 'term',
        train_set['term'] = 6

        train_set.loc[((train_set['DateAnswered'] >= '2018-09-04') & (train_set['DateAnswered'] < '2018-10-29')) |
                      ((train_set['DateAnswered'] >= '2019-09-02') & (train_set['DateAnswered'] < '2019-10-28')),
                      'term'] = 1

        train_set.loc[((train_set['DateAnswered'] >= '2018-10-29') & (train_set['DateAnswered'] < '2019-01-03')) |
                      ((train_set['DateAnswered'] >= '2019-10-28') & (train_set['DateAnswered'] < '2020-01-06')),
                      'term'] = 2

        train_set.loc[((train_set['DateAnswered'] >= '2019-01-03') & (train_set['DateAnswered'] < '2019-02-25')) |
                      ((train_set['DateAnswered'] >= '2020-01-06') & (train_set['DateAnswered'] < '2020-02-24')),
                      'term'] = 3

        train_set.loc[((train_set['DateAnswered'] >= '2019-02-25') & (train_set['DateAnswered'] < '2019-04-23')) |
                      ((train_set['DateAnswered'] >= '2020-02-24') & (train_set['DateAnswered'] < '2020-04-20')),
                      'term'] = 4

        train_set.loc[((train_set['DateAnswered'] >= '2019-04-23') & (train_set['DateAnswered'] < '2019-06-03')) |
                      ((train_set['DateAnswered'] >= '2020-04-20') & (train_set['DateAnswered'] < '2020-06-01')),
                      'term'] = 5

        # 'time',
        train_set['time'] = 4
        train_set.loc[(train_set['DateAnswered'].dt.strftime("%H:%M:%S") >= '08:00:00') &
                      (train_set['DateAnswered'].dt.strftime("%H:%M:%S") < '12:00:00')
                       , 'time'] = 1

        train_set.loc[(train_set['DateAnswered'].dt.strftime("%H:%M:%S") >= '12:00:00') &
                      (train_set['DateAnswered'].dt.strftime("%H:%M:%S") < '16:00:00')
                       , 'time'] = 2

        train_set.loc[(train_set['DateAnswered'].dt.strftime("%H:%M:%S") >= '16:00:00') &
                      (train_set['DateAnswered'].dt.strftime("%H:%M:%S") < '20:00:00')
                       , 'time'] = 3

        # 'is_weekend',
        train_set['is_weekend'] = 0
        train_set.loc[train_set['DateAnswered'].dt.dayofweek > 4, 'is_weekend'] = 1

        # 'last_answered', adds repeat as well
        train_set.sort_values(['UserId', 'DateAnswered'], inplace=True)
        train_set['last_answered'] = train_set['DateAnswered'] - datetime.datetime.strptime('2018-09-01 00:00:00', '%Y-%m-%d %H:%M:%S')
        train_set['repeat'] = (train_set['UserId']==train_set['UserId'].shift(1))
        train_set.loc[train_set['repeat'] == True, 'last_answered'] = train_set['DateAnswered'].diff()
        
        return train_set

attr_adder = CombinedAttributesAdder()
# training_extra_attribs = attr_adder.transform(train_set)


In [142]:
# prep data for ML algos
# they use strat_train_set - think should do this based on confidence value
# getting equal missing values
IsCorrect = train_set.drop('IsCorrect', axis=1)
IsCorrect_labels = train_set['IsCorrect'].copy()
AnswerValue = train_set.drop('AnswerValue', axis=1)
AnswerValue_labels = train_set['AnswerValue'].copy()

In [164]:
training_extra_attribs

Unnamed: 0,QuestionId,UserId,AnswerId,IsCorrect,CorrectAnswer,AnswerValue,DateAnswered,Confidence,Gender,DateOfBirth,PremiumPupil,100,101,102,103,104,105,1059,106,107,1077,1078,1079,108,1080,1081,1082,109,110,111,112,113,114,115,1156,1157,1158,1159,116,1160,1161,1162,1163,1164,1165,1167,1169,117,1171,1174,1175,1176,1179,118,1180,1181,1182,1184,1185,1186,1187,1188,119,1203,1208,1209,1210,1212,1213,1214,1215,1218,1263,1265,1266,141,144,146,149,152,153,154,156,157,158,159,160,163,1636,164,1642,1647,1648,1649,165,1650,1651,166,167,1676,168,171,172,173,174,175,1750,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,195,196,197,1975,198,1982,199,200,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,298,32,33,331,332,334,335,336,337,338,339,34,340,341,342,343,344,348,349,35,350,353,36,37,38,39,40,406,408,409,41,410,42,434,436,437,439,44,45,46,47,48,49,50,51,52,53,54,540,55,56,57,58,59,60,61,62,63,64,649,65,655,656,657,66,67,68,69,692,698,70,700,71,72,73,74,75,76,77,78,79,80,81,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,3,total_answered,total_correct,prop_correct,CMA,total_q_answered,lvl2,CMA_correct_subject,holiday,unique_day,yr2,age,term,time,is_weekend,last_answered,repeat
0,7890,1,4141928,1,1,1,2019-09-12 20:12:00,100.0,2,2006-04-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,53,43,0.811321,1.000000,132,32.0,1.000000,0,10,1,4912 days 20:12:00,1,4,0,376 days 20:12:00,False
1,12630,1,13244064,1,4,4,2019-09-12 20:12:00,100.0,2,2006-04-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,53,43,0.811321,1.000000,123,32.0,1.000000,0,10,1,4912 days 20:12:00,1,4,0,0 days 00:00:00,True
2,12341,1,3100319,1,3,3,2019-09-26 19:23:00,100.0,2,2006-04-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,53,43,0.811321,1.000000,97,32.0,1.000000,0,10,1,4926 days 19:23:00,1,3,0,13 days 23:11:00,True
3,13548,1,12298367,1,1,1,2019-09-26 19:24:00,100.0,2,2006-04-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,53,43,0.811321,1.000000,108,32.0,1.000000,0,10,1,4926 days 19:24:00,1,3,0,0 days 00:01:00,True
4,16957,1,2389063,0,3,2,2019-09-26 19:25:00,100.0,2,2006-04-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,53,43,0.811321,0.800000,87,32.0,0.800000,0,10,1,4926 days 19:25:00,1,3,0,0 days 00:01:00,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537819,27582,118970,8426653,0,1,3,2019-05-02 22:45:00,0.0,2,2004-07-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,26,12,0.461538,0.545455,15,49.0,0.600000,0,10,0,5418 days 22:45:00,5,4,0,6 days 23:57:00,True
537820,7124,118970,6972744,0,3,2,2019-05-02 22:55:00,100.0,2,2004-07-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,26,12,0.461538,0.521739,64,49.0,0.500000,0,10,0,5418 days 22:55:00,5,4,0,0 days 00:10:00,True
537821,14597,118970,6821260,0,1,4,2019-07-23 08:03:00,50.0,2,2004-07-01,0.0,0,101.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,26,12,0.461538,0.500000,222,10201.0,0.000000,0,10,0,5500 days 08:03:00,6,1,0,81 days 09:08:00,True
537822,5629,118970,2449328,0,1,2,2020-02-23 15:11:00,50.0,2,2004-07-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,26,12,0.461538,0.480000,11,71.0,0.461538,0,10,1,5715 days 15:11:00,3,2,1,215 days 07:08:00,True


In [169]:
# split columns into numerical and categorical
cat_attribs = ['QuestionId', 'UserId', 'AnswerId','CorrectAnswer', 'AnswerValue', 'Confidence', 'Gender', 'PremiumPupil', 'time', 'term']
training_num = training_extra_attribs.drop(cat_attribs, axis=1)
training_num['unique_day'] = 0
num_attribs = list(training_num)
all_attribs = list(train_set)

In [None]:
training_num

housing_num

In [None]:
integer_features = X.select_dtypes(exclude="object").columns


In [145]:
training_extra_attribs

Unnamed: 0,QuestionId,UserId,AnswerId,IsCorrect,CorrectAnswer,AnswerValue,DateAnswered,Confidence,Gender,DateOfBirth,PremiumPupil,100,101,102,103,104,105,1059,106,107,1077,1078,1079,108,1080,1081,1082,109,110,111,112,113,114,115,1156,1157,1158,1159,116,1160,1161,1162,1163,1164,1165,1167,1169,117,1171,1174,1175,1176,1179,118,1180,1181,1182,1184,1185,1186,1187,1188,119,1203,1208,1209,1210,1212,1213,1214,1215,1218,1263,1265,1266,141,144,146,149,152,153,154,156,157,158,159,160,163,1636,164,1642,1647,1648,1649,165,1650,1651,166,167,1676,168,171,172,173,174,175,1750,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,195,196,197,1975,198,1982,199,200,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,298,32,33,331,332,334,335,336,337,338,339,34,340,341,342,343,344,348,349,35,350,353,36,37,38,39,40,406,408,409,41,410,42,434,436,437,439,44,45,46,47,48,49,50,51,52,53,54,540,55,56,57,58,59,60,61,62,63,64,649,65,655,656,657,66,67,68,69,692,698,70,700,71,72,73,74,75,76,77,78,79,80,81,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,3,total_answered,total_correct,prop_correct,CMA,total_q_answered,lvl2,CMA_correct_subject,holiday,unique_day,yr2,age,term,time,is_weekend,last_answered,repeat
0,7890,1,4141928,1,1,1,2019-09-12 20:12:00,100.0,2,2006-04-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,53,43,0.811321,1.000000,132,32.0,1.000000,0,10,1,4912 days 20:12:00,1,4,0,376 days 20:12:00,False
1,12630,1,13244064,1,4,4,2019-09-12 20:12:00,100.0,2,2006-04-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,53,43,0.811321,1.000000,123,32.0,1.000000,0,10,1,4912 days 20:12:00,1,4,0,0 days 00:00:00,True
2,12341,1,3100319,1,3,3,2019-09-26 19:23:00,100.0,2,2006-04-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,53,43,0.811321,1.000000,97,32.0,1.000000,0,10,1,4926 days 19:23:00,1,3,0,13 days 23:11:00,True
3,13548,1,12298367,1,1,1,2019-09-26 19:24:00,100.0,2,2006-04-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,53,43,0.811321,1.000000,108,32.0,1.000000,0,10,1,4926 days 19:24:00,1,3,0,0 days 00:01:00,True
4,16957,1,2389063,0,3,2,2019-09-26 19:25:00,100.0,2,2006-04-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,53,43,0.811321,0.800000,87,32.0,0.800000,0,10,1,4926 days 19:25:00,1,3,0,0 days 00:01:00,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537819,27582,118970,8426653,0,1,3,2019-05-02 22:45:00,0.0,2,2004-07-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,26,12,0.461538,0.545455,15,49.0,0.600000,0,10,0,5418 days 22:45:00,5,4,0,6 days 23:57:00,True
537820,7124,118970,6972744,0,3,2,2019-05-02 22:55:00,100.0,2,2004-07-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,26,12,0.461538,0.521739,64,49.0,0.500000,0,10,0,5418 days 22:55:00,5,4,0,0 days 00:10:00,True
537821,14597,118970,6821260,0,1,4,2019-07-23 08:03:00,50.0,2,2004-07-01,0.0,0,101.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,26,12,0.461538,0.500000,222,10201.0,0.000000,0,10,0,5500 days 08:03:00,6,1,0,81 days 09:08:00,True
537822,5629,118970,2449328,0,1,2,2020-02-23 15:11:00,50.0,2,2004-07-01,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,26,12,0.461538,0.480000,11,71.0,0.461538,0,10,1,5715 days 15:11:00,3,2,1,215 days 07:08:00,True


In [160]:
unique_student_train = pd.DataFrame(data=train_set['UserId'].unique(), columns=['UserId'])
print('2')
unique_student_train['unique_day'] = 0

2


In [173]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
#         ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
#         ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
#         ('all', CombinedAttributesAdder(), all_attribs),
#         ('num', StandardScaler(), num_attribs),
        ('num', num_pipeline, num_attribs),
        ('cat', OneHotEncoder(), cat_attribs),
    ])

# add the columns to IsCorrect
IsCorrect['total_answered'] = ''
IsCorrect['prop_correct'] = ''
IsCorrect['CMA'] = ''
IsCorrect['total_q_answered'] = ''
IsCorrect['lvl2'] = ''
IsCorrect['CMA_correct_subject'] = ''
IsCorrect['holiday'] = ''
IsCorrect['unique_day'] = ''
IsCorrect['yr2'] = ''
IsCorrect['age'] = ''
IsCorrect['term'] = ''
IsCorrect['time'] = ''
IsCorrect['is_weekend'] = ''
IsCorrect['last_answered'] = ''
IsCorrect['repeat'] = ''

IsCorrect_prepared = full_pipeline.fit_transform(IsCorrect)

  IsCorrect['unique_day'] = ''
  IsCorrect['yr2'] = ''
  IsCorrect['age'] = ''
  IsCorrect['term'] = ''
  IsCorrect['time'] = ''
  IsCorrect['is_weekend'] = ''
  IsCorrect['last_answered'] = ''
  IsCorrect['repeat'] = ''


ValueError: A given column is not a column of the dataframe

# 20.01 Lucy
start again above
unsure what is happeneing with the key error
try to run book code https://github.com/ageron/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb
see if that helps
once pipeline works then we're looking a lot better

think we're gonna use random forest but maybe do some more research

In [125]:
num_attribs = list(train_set)
num_attribs.remove(['QuestionId', 'UserId', 'AnswerId', 'Confidence', 'Gender', 'PremiumPupil', 'time', 'term'])
num_attribs


['DateAnswered', 'DateOfBirth', ' 100', ' 101', ' 102', ' 103', ' 104', ' 105',
 ' 1059', ' 106', ' 107', ' 1077', ' 1078', ' 1079', ' 108', ' 1080', ' 1081',
 ' 1082', ' 109', ' 110', ' 111', ' 112', ' 113', ' 114', ' 115', ' 1156',
 ' 1157', ' 1158', ' 1159', ' 116', ' 1160', ' 1161', ' 1162', ' 1163', ' 1164',
 ' 1165', ' 1167', ' 1169', ' 117', ' 1171', ' 1174', ' 1175', ' 1176', ' 1179',
 ' 118', ' 1180', ' 1181', ' 1182', ' 1184', ' 1185', ' 1186', ' 1187', ' 1188',
 ' 119', ' 1203', ' 1208', ' 1209', ' 1210', ' 1212', ' 1213', ' 1214', ' 1215',
 ' 1218', ' 1263', ' 1265', ' 1266', ' 141', ' 144', ' 146', ' 149', ' 152',
 ' 153', ' 154', ' 156', ' 157', ' 158', ' 159', ' 160', ' 163', ' 1636',
 ' 164', ' 1642', ' 1647', ' 1648', ' 1649', ' 165', ' 1650', ' 1651', ' 166', ' 167',
 ' 1676',
 ' 168',
 ' 171',
 ' 172',
 ' 173',
 ' 174',
 ' 175',
 ' 1750',
 ' 176',
 ' 177',
 ' 178',
 ' 179',
 ' 180',
 ' 181',
 ' 182',
 ' 183',
 ' 184',
 ' 185',
 ' 186',
 ' 187',
 ' 188',
 ' 189',
 ' 190',
 ' 191',
 ' 192',
 ' 193',
 ' 195',
 ' 196',
 ' 197',
 ' 1975',
 ' 198',
 ' 1982',
 ' 199',
 ' 200',
 ' 202',
 ' 203',
 ' 204',
 ' 205',
 ' 206',
 ' 207',
 ' 208',
 ' 209',
 ' 210',
 ' 211',
 ' 212',
 ' 213',
 ' 214',
 ' 215',
 ' 216',
 ' 217',
 ' 218',
 ' 219',
 ' 220',
 ' 221',
 ' 222',
 ' 223',
 ' 224',
 ' 225',
 ' 226',
 ' 227',
 ' 228',
 ' 229',
 ' 230',
 ' 231',
 ' 232',
 ' 233',
 ' 234',
 ' 235',
 ' 236',
 ' 237',
 ' 238',
 ' 239',
 ' 240',
 ' 241',
 ' 242',
 ' 243',
 ' 244',
 ' 245',
 ' 246',
 ' 247',
 ' 248',
 ' 249',
 ' 250',
 ' 251',
 ' 252',
 ' 253',
 ' 254',
 ' 255',
 ' 256',
 ' 257',
 ' 258',
 ' 259',
 ' 260',
 ' 261',
 ' 262',
 ' 263',
 ' 264',
 ' 265',
 ' 266',
 ' 267',
 ' 268',
 ' 269',
 ' 270',
 ' 271',
 ' 272',
 ' 273',
 ' 274',
 ' 275',
 ' 276',
 ' 277',
 ' 278',
 ' 279',
 ' 280',
 ' 281',
 ' 282',
 ' 283',
 ' 284',
 ' 298',
 ' 32',
 ' 33',
 ' 331',
 ' 332',
 ' 334',
 ' 335',
 ' 336',
 ' 337',
 ' 338',
 ' 339',
 ' 34',
 ' 340',
 ' 341',
 ' 342',
 ' 343',
 ' 344',
 ' 348',
 ' 349',
 ' 35',
 ' 350',
 ' 353',
 ' 36',
 ' 37',
 ' 38',
 ' 39',
 ' 40',
 ' 406',
 ' 408',
 ' 409',
 ' 41',
 ' 410',
 ' 42',
 ' 434',
 ' 436',
 ' 437',
 ' 439',
 ' 44',
 ' 45',
 ' 46',
 ' 47',
 ' 48',
 ' 49',
 ' 50',
 ' 51',
 ' 52',
 ' 53',
 ' 54',
 ' 540',
 ' 55',
 ' 56',
 ' 57',
 ' 58',
 ' 59',
 ' 60',
 ' 61',
 ' 62',
 ' 63',
 ' 64',
 ' 649',
 ' 65',
 ' 655',
 ' 656',
 ' 657',
 ' 66',
 ' 67',
 ' 68',
 ' 69',
 ' 692',
 ' 698',
 ' 70',
 ' 700',
 ' 71',
 ' 72',
 ' 73',
 ' 74',
 ' 75',
 ' 76',
 ' 77',
 ' 78',
 ' 79',
 ' 80',
 ' 81',
 ' 83',
 ' 84',
 ' 85',
 ' 86',
 ' 87',
 ' 88',
 ' 89',
 ' 90',
 ' 91',
 ' 92',
 ' 93',
 ' 94',
 ' 95',
 ' 96',
 ' 97',
 ' 98',
 ' 99',
 '3',
 'total_answered',
 'total_correct',
 'prop_correct',
 'CMA',
 'total_q_answered',
 'lvl2',
 'CMA_correct_subject',
 'holiday']

ValueError: list.remove(x): x not in list

In [140]:
# 'total_answered',
#         train_set['total_answered'] = train_set.groupby(['UserId'])['IsCorrect'].transform('count')

#         # 'prop_correct',
#         train_set['total_correct'] = train_set.groupby(['UserId'])['IsCorrect'].transform('sum')
#         train_set['prop_correct'] = train_set['total_correct'] / train_set['total_answered']
# #         train_set.drop('total_correct', inplace=True)
                
#         # CMA
#         train_set.sort_values(['UserId', 'DateAnswered'], inplace=True)
#         CMA = train_set.groupby(['UserId']).IsCorrect.expanding().mean()
#         train_set['CMA'] = CMA.reset_index(level=0, drop=True)
        
#         # 'total_q_answered',
#         train_set['total_q_answered'] = train_set.groupby(['QuestionId'])['QuestionId'].transform('count')

# # lvl2 - needs SubjectId first
# train_set['lvl2'] = 0
# for i in [' 101', ' 1156', ' 119', ' 149', ' 151', ' 32', ' 49', ' 692', ' 71']:
#     if i in train_set.columns.tolist():
#         i_int = (int(i[1:]))
#         train_set['lvl2'] = train_set['lvl2'] + (train_set[i] * i_int)
        
# # CMA_correct_subject - need lvl2 first
# CMA_correct_subject = train_set.groupby(['UserId', 'lvl2']).IsCorrect.expanding().mean()
# train_set['CMA_correct_subject'] = CMA_correct_subject.reset_index(level=[0,1], drop=True)

# # 'holiday',
# train_set['holiday'] = 1
# train_set.loc[((train_set['DateAnswered'] < '2018-10-20') & (train_set['DateAnswered'] > '2018-09-03')) |
#       ((train_set['DateAnswered'] > '2018-10-28') & (train_set['DateAnswered'] < '2018-12-20')) |
#       ((train_set['DateAnswered'] > '2019-01-02') & (train_set['DateAnswered'] < '2019-02-16')) |
#       ((train_set['DateAnswered'] > '2019-02-24') & (train_set['DateAnswered'] < '2019-04-06')) |
#       ((train_set['DateAnswered'] > '2019-04-22') & (train_set['DateAnswered'] < '2019-05-25')) |
#       ((train_set['DateAnswered'] > '2019-06-02') & (train_set['DateAnswered'] < '2019-07-25')) |

#       ((train_set['DateAnswered'] > '2019-09-01') & (train_set['DateAnswered'] < '2019-10-19')) |
#       ((train_set['DateAnswered'] > '2019-10-27') & (train_set['DateAnswered'] < '2019-12-20')) |
#       ((train_set['DateAnswered'] > '2020-01-05') & (train_set['DateAnswered'] < '2020-02-15')) |
#       ((train_set['DateAnswered'] > '2020-02-23') & (train_set['DateAnswered'] < '2020-04-03')) |
#       ((train_set['DateAnswered'] > '2020-04-19') & (train_set['DateAnswered'] < '2020-05-23')) |
#       ((train_set['DateAnswered'] > '2020-05-31') & (train_set['DateAnswered'] < '2020-07-23')) 
#       ,'holiday'] = 0

# 'unique_day',
unique_student_train = pd.DataFrame(data=train_set['UserId'].unique(), columns=['UserId'])
unique_student_train['unique_day'] = 0
for i in range(len(unique_student_train)):
        unique_student_train.iloc[i, 1] =  len(train_set.loc[train_set['UserId']==unique_student_train.iloc[i, 0]]['DateAnswered'].dt.normalize().unique())
train_set = train_set.merge(unique_student_train, how='inner', on='UserId')
del unique_student_train
import gc
gc.collect()

# 'yr2',
train_set['yr2'] = 1
train_set.loc[(train_set['DateAnswered'] < '2019-09-01'), 'yr2'] = 0

# 'age',
train_set['age'] = train_set['DateAnswered'] - train_set['DateOfBirth'] 

# 'term',
train_set['term'] = 6

train_set.loc[((train_set['DateAnswered'] >= '2018-09-04') & (train_set['DateAnswered'] < '2018-10-29')) |
              ((train_set['DateAnswered'] >= '2019-09-02') & (train_set['DateAnswered'] < '2019-10-28')),
              'term'] = 1

train_set.loc[((train_set['DateAnswered'] >= '2018-10-29') & (train_set['DateAnswered'] < '2019-01-03')) |
              ((train_set['DateAnswered'] >= '2019-10-28') & (train_set['DateAnswered'] < '2020-01-06')),
              'term'] = 2

train_set.loc[((train_set['DateAnswered'] >= '2019-01-03') & (train_set['DateAnswered'] < '2019-02-25')) |
              ((train_set['DateAnswered'] >= '2020-01-06') & (train_set['DateAnswered'] < '2020-02-24')),
              'term'] = 3

train_set.loc[((train_set['DateAnswered'] >= '2019-02-25') & (train_set['DateAnswered'] < '2019-04-23')) |
              ((train_set['DateAnswered'] >= '2020-02-24') & (train_set['DateAnswered'] < '2020-04-20')),
              'term'] = 4

train_set.loc[((train_set['DateAnswered'] >= '2019-04-23') & (train_set['DateAnswered'] < '2019-06-03')) |
              ((train_set['DateAnswered'] >= '2020-04-20') & (train_set['DateAnswered'] < '2020-06-01')),
              'term'] = 5

# 'time',
train_set['time'] = 4
train_set.loc[(train_set['DateAnswered'].dt.strftime("%H:%M:%S") >= '08:00:00') &
              (train_set['DateAnswered'].dt.strftime("%H:%M:%S") < '12:00:00')
               , 'time'] = 1

train_set.loc[(train_set['DateAnswered'].dt.strftime("%H:%M:%S") >= '12:00:00') &
              (train_set['DateAnswered'].dt.strftime("%H:%M:%S") < '16:00:00')
               , 'time'] = 2

train_set.loc[(train_set['DateAnswered'].dt.strftime("%H:%M:%S") >= '16:00:00') &
              (train_set['DateAnswered'].dt.strftime("%H:%M:%S") < '20:00:00')
               , 'time'] = 3

# 'is_weekend',
train_set['is_weekend'] = 0
train_set.loc[train_set['DateAnswered'].dt.dayofweek > 4, 'is_weekend'] = 1

# 'last_answered', adds repeat as well
train_set.sort_values(['UserId', 'DateAnswered'], inplace=True)
train_set['last_answered'] = train_set['DateAnswered'] - datetime.datetime.strptime('2018-09-01 00:00:00', '%Y-%m-%d %H:%M:%S')
train_set['repeat'] = (train_set['UserId']==train_set['UserId'].shift(1))
train_set.loc[train_set['repeat'] == True, 'last_answered'] = train_set['DateAnswered'].diff() 

KeyboardInterrupt: 

In [None]:
# source: lvl 2 book pg 59
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(m_training, test_size=0.2, random_state=42)