# SuperMemo Data Converter

[![open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/open-spaced-repetition/fsrs-vs-sm18/blob/main/convert.ipynb)

This notebook converts the SM-18 repetition history to the same format as FSRS.

In [1]:
import pandas as pd
import csv

filename = 'Repetition History-Leee-2023-08-09'

# Function to extract keys and values from a line
def extract_keys_values(line):
    parts = line.strip().split(" ")
    keys, values = [], []
    for part in parts:
        key, value = part.split("=")
        keys.append(key)
        values.append(value)
    return keys, values

# Path to the input TXT file
txt_file_path = f'{filename}.txt'

# Extracting all unique keys from the data to form the headers
unique_keys = set()
with open(txt_file_path, 'r', encoding='utf-8') as txt_file:
    for line in txt_file:
        if line.startswith("ElNo"):
            keys, _ = extract_keys_values(line)
            unique_keys.update(keys)

# Sorting the keys to maintain a consistent order
headers = sorted(list(unique_keys))

# Path to the output CSV file
csv_file_path = f'{filename}.csv'

# Writing the data to CSV with the identified headers and considering missing values
with open(txt_file_path, 'r', encoding='utf-8') as txt_file, open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers)  # Write the headers

    for line in txt_file:
        if line.startswith("ElNo"):
            keys, values = extract_keys_values(line)

            # Consider missing values by aligning with headers
            row = [None] * len(headers)
            for key, value in zip(keys, values):
                index = headers.index(key)
                row[index] = value

            writer.writerow(row)

# Reading the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Converting the "Date" column to standard date format
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True).dt.strftime('%Y-%m-%d')

# Sorting the DataFrame by "ElNo" and "Date" (ascending for both)
df = df.sort_values(by=['ElNo', 'Date'], ascending=[True, True])

In [2]:
print(df.shape)
print(df['ElNo'].unique().shape)
df.head()

(63191, 11)
(13684,)


Unnamed: 0,Date,Difficulty,ElNo,Grade,Hour,Int,Laps,Postpones,Priority,Rep,expFI
9,2022-04-22,,6,8,8.187,0,0,,0.2998,1,99.0
8,2022-04-23,0.0,6,4,7.869,1,0,,0.2954,2,14.0
7,2022-04-25,0.705,6,4,12.311,2,0,,5.1183,3,1.0
6,2022-05-23,0.668,6,3,8.816,28,0,,5.3459,4,7.0
5,2022-12-29,1.0,6,1,15.371,220,1,,1.2276,1,9.0


In [3]:
df.drop(["Difficulty", "Hour", "Postpones", "Priority"], axis=1, inplace=True, errors="ignore")
df.head()

Unnamed: 0,Date,ElNo,Grade,Int,Laps,Rep,expFI
9,2022-04-22,6,8,0,0,1,99.0
8,2022-04-23,6,4,1,0,2,14.0
7,2022-04-25,6,4,2,0,3,1.0
6,2022-05-23,6,3,28,0,4,7.0
5,2022-12-29,6,1,220,1,1,9.0


In [4]:
df = df.groupby('ElNo').filter(lambda group: (group['Rep'].iloc[0] == 1) and (group['Laps'].iloc[0] == 0))
print(df.shape)
df.head()

(62239, 7)


Unnamed: 0,Date,ElNo,Grade,Int,Laps,Rep,expFI
9,2022-04-22,6,8,0,0,1,99.0
8,2022-04-23,6,4,1,0,2,14.0
7,2022-04-25,6,4,2,0,3,1.0
6,2022-05-23,6,3,28,0,4,7.0
5,2022-12-29,6,1,220,1,1,9.0


In [5]:
df.drop(df[df['expFI'] == 99].index, inplace=True)
print(df.shape)
df.head()

(47977, 7)


Unnamed: 0,Date,ElNo,Grade,Int,Laps,Rep,expFI
8,2022-04-23,6,4,1,0,2,14.0
7,2022-04-25,6,4,2,0,3,1.0
6,2022-05-23,6,3,28,0,4,7.0
5,2022-12-29,6,1,220,1,1,9.0
4,2022-12-30,6,4,1,1,2,8.0


In [6]:
df = df.groupby('ElNo').filter(lambda group: (group['Grade'] > 5).sum() == 0)
print(df.shape)
df.head()

(40820, 7)


Unnamed: 0,Date,ElNo,Grade,Int,Laps,Rep,expFI
8,2022-04-23,6,4,1,0,2,14.0
7,2022-04-25,6,4,2,0,3,1.0
6,2022-05-23,6,3,28,0,4,7.0
5,2022-12-29,6,1,220,1,1,9.0
4,2022-12-30,6,4,1,1,2,8.0


In [7]:
df.drop_duplicates(subset=['Date', 'ElNo'], keep='first', inplace=True)
print(df.shape)
df.head()

(40819, 7)


Unnamed: 0,Date,ElNo,Grade,Int,Laps,Rep,expFI
8,2022-04-23,6,4,1,0,2,14.0
7,2022-04-25,6,4,2,0,3,1.0
6,2022-05-23,6,3,28,0,4,7.0
5,2022-12-29,6,1,220,1,1,9.0
4,2022-12-30,6,4,1,1,2,8.0


In [8]:
df['i'] = df.groupby('ElNo').cumcount() + 1
df.head()

Unnamed: 0,Date,ElNo,Grade,Int,Laps,Rep,expFI,i
8,2022-04-23,6,4,1,0,2,14.0,1
7,2022-04-25,6,4,2,0,3,1.0,2
6,2022-05-23,6,3,28,0,4,7.0,3
5,2022-12-29,6,1,220,1,1,9.0,4
4,2022-12-30,6,4,1,1,2,8.0,5


In [9]:
df['Date'] = pd.to_datetime(df['Date'])
df['delta_t'] = df['Date'].diff().dt.days
df['delta_t'] = df['delta_t'].fillna(0)
df.loc[df['i'] == 1, 'delta_t'] = 0
df['delta_t'] = df['delta_t'].astype(int)
df.head()

Unnamed: 0,Date,ElNo,Grade,Int,Laps,Rep,expFI,i,delta_t
8,2022-04-23,6,4,1,0,2,14.0,1,0
7,2022-04-25,6,4,2,0,3,1.0,2,2
6,2022-05-23,6,3,28,0,4,7.0,3,28
5,2022-12-29,6,1,220,1,1,9.0,4,220
4,2022-12-30,6,4,1,1,2,8.0,5,1


In [10]:
df.rename(columns={'ElNo': 'card_id', 'Date': 'review_date'}, inplace=True)
df['review_rating'] = df['Grade'].map({1: 1, 2: 1, 3: 2, 4: 3, 5: 4})
df['review_time'] = 0
df.head()

Unnamed: 0,review_date,card_id,Grade,Int,Laps,Rep,expFI,i,delta_t,review_rating,review_time
8,2022-04-23,6,4,1,0,2,14.0,1,0,3,0
7,2022-04-25,6,4,2,0,3,1.0,2,2,3,0
6,2022-05-23,6,3,28,0,4,7.0,3,28,2,0
5,2022-12-29,6,1,220,1,1,9.0,4,220,1,0
4,2022-12-30,6,4,1,1,2,8.0,5,1,3,0


In [11]:
from itertools import accumulate

def cum_concat(x):
    return list(accumulate(x))

t_history = df.groupby('card_id', group_keys=False)['delta_t'].apply(lambda x: cum_concat([[int(i)] for i in x]))
df['t_history']=[','.join(map(str, item[:-1])) for sublist in t_history for item in sublist]
r_history = df.groupby('card_id', group_keys=False)['review_rating'].apply(lambda x: cum_concat([[i] for i in x]))
df['r_history']=[','.join(map(str, item[:-1])) for sublist in r_history for item in sublist]
df.head()

Unnamed: 0,review_date,card_id,Grade,Int,Laps,Rep,expFI,i,delta_t,review_rating,review_time,t_history,r_history
8,2022-04-23,6,4,1,0,2,14.0,1,0,3,0,,
7,2022-04-25,6,4,2,0,3,1.0,2,2,3,0,0.0,3.0
6,2022-05-23,6,3,28,0,4,7.0,3,28,2,0,2.0,33.0
5,2022-12-29,6,1,220,1,1,9.0,4,220,1,0,228.0,332.0
4,2022-12-30,6,4,1,1,2,8.0,5,1,3,0,228220.0,3321.0


In [12]:
df['y'] = df['review_rating'].map(lambda x: {1: 0, 2: 1, 3: 1, 4: 1}[x])

def remove_outliers(group: pd.DataFrame) -> pd.DataFrame:
    # threshold = np.mean(group['delta_t']) * 1.5
    # threshold = group['delta_t'].quantile(0.95)
    Q1 = group['delta_t'].quantile(0.25)
    Q3 = group['delta_t'].quantile(0.75)
    IQR = Q3 - Q1
    threshold = Q3 + 1.5 * IQR
    group = group[group['delta_t'] <= threshold]
    return group

df[df['i'] == 2] = df[df['i'] == 2].groupby(by=['r_history', 't_history'], as_index=False, group_keys=False).apply(remove_outliers)
df.dropna(inplace=True)
print(df.shape)

def remove_non_continuous_rows(group):
    discontinuity = group['i'].diff().fillna(1).ne(1)
    if not discontinuity.any():
        return group
    else:
        first_non_continuous_index = discontinuity.idxmax()
        return group.loc[:first_non_continuous_index-1]

df = df.groupby('card_id', as_index=False, group_keys=False).apply(remove_non_continuous_rows)
print(df.shape)
df.head()

(39423, 14)
(38641, 14)


Unnamed: 0,review_date,card_id,Grade,Int,Laps,Rep,expFI,i,delta_t,review_rating,review_time,t_history,r_history,y
8,2022-04-23,6.0,4.0,1.0,0.0,2.0,14.0,1.0,0.0,3.0,0.0,,,1.0
7,2022-04-25,6.0,4.0,2.0,0.0,3.0,1.0,2.0,2.0,3.0,0.0,0.0,3.0,1.0
6,2022-05-23,6.0,3.0,28.0,0.0,4.0,7.0,3.0,28.0,2.0,0.0,2.0,33.0,1.0
5,2022-12-29,6.0,1.0,220.0,1.0,1.0,9.0,4.0,220.0,1.0,0.0,228.0,332.0,0.0
4,2022-12-30,6.0,4.0,1.0,1.0,2.0,8.0,5.0,1.0,3.0,0.0,228220.0,3321.0,1.0


In [13]:
df.to_csv('revlog_history.tsv', sep="\t", index=False)