In [1]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
from config import Config
from sklearn.model_selection import train_test_split
import gc
from time import *

In [5]:
mode = 'ns'

# Column dtypes used when reading the question table.
question_dtypes = {"problem_id": "int32", "tags": "str", "tags_num": "int16"}
# In "ns" mode `correct` is read as a float (it is rounded to 0/1 after loading);
# in every other mode it is read directly as int8.
correct_t = "float64" if mode == "ns" else "int8"
# Column dtypes used when reading the user interaction log.
u_dtypes = {'order_id': 'int64', 'user_id': 'int32',
            'problem_id': 'int32', "correct": correct_t,
            "ms_first_response": "int64",  'overlap_time': 'int64'}
print("loading csv.....")
question_df = pd.read_csv(Config.QUESTION_FILE, usecols=[1, 2, 4], dtype=question_dtypes)


def _encode_tags(tag_tokens, width=Config.MAX_CATS_PER):
    """Turn a list of tag-id strings into a fixed-width int32 vector.

    Args:
        tag_tokens: list of strings, each parseable by ``int``.
        width: length of the returned vector; unused trailing slots are 0.

    Returns:
        1-D ``int32`` array of length ``width``.

    Raises:
        ValueError: if a token is not an integer, or if there are more than
            ``width`` tokens (``np.pad`` rejects a negative pad width).
    """
    tag_ids = np.array(list(map(int, tag_tokens)), 'int32')
    return np.pad(tag_ids, (0, width - len(tag_ids)), 'constant', constant_values=(0))


# Convert the tags column: "a;b;c" -> zero-padded int32 array.
# The whole column is rebuilt and assigned once. Writing element by element
# through `question_df.tags.at[i] = ...` is a chained assignment, which does
# not update `question_df` when pandas Copy-on-Write is active.
question_df["tags"] = question_df["tags"].str.split(';').map(_encode_tags)
print("question file ready")
# Load the user interaction log; the first row of the file is the header.
users_df = pd.read_csv(Config.TRAIN_FILE, dtype=u_dtypes, header=0)
print("user file loaded")
print("shape of dataframe :", users_df.shape)
# AS09 has no elapsed_time column, so ms_first_response is used in its place.
# Clean it in one expression and assign the result back: fill missing values
# with 0, take the absolute value, divide by 3600 and truncate to int64.
# Calling `fillna(..., inplace=True)` on `users_df.ms_first_response` is a
# chained in-place update, which is not written back to `users_df` when
# pandas Copy-on-Write is active.
# NOTE(review): the divisor 3600 is kept from the original code; the intended
# unit of the resulting value is not evident here - confirm.
users_df["ms_first_response"] = (
    users_df["ms_first_response"].fillna(0).abs() / 3600
).astype(np.int64)
if mode == "ns":
    # `correct` was read as float64 in this mode; round it to a 0/1 int8 label.
    users_df["correct"] = users_df["correct"].round().astype(np.int8)
# Attach the question columns to every interaction. A left join keeps all
# interaction rows, including any whose problem_id is absent from question_df.
users_df = users_df.merge(question_df, on='problem_id', how="left")
del question_df
gc.collect()
print("user-question file merged")
# Order the interactions chronologically (order_id stands in for a timestamp).
users_df = users_df.sort_values(["order_id"], ascending=True).reset_index(drop=True)
print("Computing question difficulty")
# Group the correctness labels by problem so that per-problem statistics can
# be broadcast back onto every interaction row.
correct_by_problem = users_df.groupby("problem_id")["correct"]
# popularity: number of interactions recorded for the row's problem.
users_df["popularity"] = correct_by_problem.transform("size")
# difficulty: share of those interactions that were answered correctly
# (sum of `correct` divided by the interaction count).
users_df["difficulty"] = correct_by_problem.transform("sum") / users_df["popularity"]
print("Popularity max", users_df["popularity"].max(), ",Difficulty max", users_df["difficulty"].max())
del correct_by_problem
gc.collect()

n_questions = users_df["problem_id"].nunique()
print("no. of questions :", n_questions)
print("shape after exlusion:", users_df.shape)
# AS09 has no timestamp column, so order_id is used in its place.
print("Calculating lag time")
# Per user: (reference order_id, last problem_id, last lag value).
time_dict = {}
lag_time_col = np.zeros(len(users_df), dtype=np.int64)
for ind, row in enumerate(tqdm(users_df[["user_id", "order_id", "problem_id"]].values)):
    uid, oid, pid = row
    previous = time_dict.get(uid)
    if previous is None:
        # First interaction of this user: lag stays 0.
        time_dict[uid] = (oid, pid, 0)
    elif pid == previous[1]:
        # Same problem as the user's previous entry: reuse the previous lag and
        # leave the stored reference order_id untouched.
        lag_time_col[ind] = previous[2]
    else:
        lag_time_col[ind] = oid - previous[0]
        time_dict[uid] = (oid, pid, lag_time_col[ind])
    # Rows are expected to arrive in ascending order_id, so a negative lag
    # signals unsorted input.
    if lag_time_col[ind] < 0:
        raise RuntimeError("Has lag_time smaller than 0.")
# For AS09 the lag is bucketed with divisors 10 / 100 / 1000 and each bucket is
# clipped to its own upper bound. The values are computed on the array and
# assigned once, rather than calling `clip(..., inplace=True)` on a column
# selection, which is not written back under pandas Copy-on-Write.
# `int` replaces the `np.int` alias, which was removed in NumPy 1.24; both
# denote the builtin int, so the resulting dtype is unchanged.
users_df["lag_time_s"] = (lag_time_col // 10).clip(0, 5000).astype(int)
users_df["lag_time_m"] = (lag_time_col // 100).clip(0, 500).astype(int)
users_df["lag_time_d"] = (lag_time_col // 1000).clip(0, 50).astype(int)
del lag_time_col
gc.collect()

# Group the interactions by user_id: each user is mapped to a tuple of
# per-feature arrays, in the row order of users_df.
print("Grouping users...")
feature_cols = ["problem_id", "correct", "ms_first_response",
                "tags", "tags_num", "popularity", "difficulty",
                "lag_time_s", "lag_time_m", "lag_time_d"]
# The feature columns are selected after `groupby`, so the function passed to
# `apply` never receives the grouping column; applying over a frame that still
# contains the grouping column is deprecated in recent pandas releases.
# The resulting Series is indexed by user_id; each value is a 10-tuple whose
# order follows `feature_cols`.
group = users_df.groupby("user_id")[feature_cols] \
    .apply(lambda r: (r.problem_id.values, r.correct.values,
                      r.ms_first_response.values, r.tags.values,
                      r.tags_num.values, r.popularity.values, r.difficulty.values,
                      r.lag_time_s.values, r.lag_time_m.values, r.lag_time_d.values))
n_users = users_df.user_id.nunique()
print("no. of users :", n_users)
# The merged frame is no longer needed once the per-user tuples exist.
del users_df
gc.collect()

loading csv.....
question file ready
user file loaded
shape of dataframe : (603128, 31)
user-question file merged
Computing question difficulty
Popularity max 1187 ,Difficulty max 1.0
no. of questions : 6907
shape after exlusion: (603128, 35)
Calculating lag time


100%|██████████| 603128/603128 [00:00<00:00, 836801.57it/s]
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  users_df.lag_time_s = users_df.lag_time_s.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  users_df.lag_time_m = users_df.lag_time_m.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  users_df.lag_time_d = users_df.lag_time_d.astype(np.int)


Grouping users...
no. of users : 8096


0

In [6]:
# Preview the per-user feature tuples of the first ten users.
print(group.iloc[:10])

user_id
21825    ([34263, 57731], [1, 1], [5, 8], [[62, 0, 0, 0...
23098    ([833, 831, 832, 4514], [0, 1, 1, 0], [3, 7, 1...
26787    ([84889, 84890, 84891, 84892, 84893, 89821, 89...
33121    ([2343, 9033, 4572, 988, 1002, 912, 1516, 997,...
34037    ([24140, 24291, 24050, 24290, 24289, 24270, 24...
36659    ([2289, 2285, 2286, 2287, 2288, 2201, 2197, 21...
51933    ([102067, 2225, 2219, 2225, 839, 14536, 110558...
51950    ([34105, 482, 1597, 1570, 9375, 4673], [0, 0, ...
52574    ([174, 172, 173, 1022, 1018, 1019], [0, 0, 1, ...
53102    ([39708, 39711, 39705], [0, 0, 0], [1, 0, 0], ...
dtype: object


In [9]:
# Print the complete feature tuple stored for the first user in `group`.
print(group.iloc[0])
# The bare string below serves as a legend for the columns that went into the
# grouping. Note that "user_id" is the index of `group`, not a tuple element:
# the printed tuple holds ten arrays, starting with problem_id.
'''["user_id", "problem_id", "correct", "ms_first_response",
                    "tags", "tags_num", "popularity", "difficulty",
                    "lag_time_s", "lag_time_m", "lag_time_d",]'''

(array([34263, 57731]), array([1, 1], dtype=int8), array([5, 8], dtype=int64), array([array([62,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
       array([ 9, 12,  0,  0,  0,  0,  0,  0,  0,  0])], dtype=object), array([1, 2], dtype=int16), array([169,  24], dtype=int64), array([0.72189349, 0.58333333]), array([ 0, 13]), array([0, 1]), array([0, 0]))
