In [1]:
import ast
import json
import os
import warnings

import numpy as np
import pandas as pd
import regex as re
from dotenv import load_dotenv
# Silence every warning for the rest of the notebook session.
warnings.filterwarnings(action="ignore")

# Read the local .env file, then look up the two TSCC locations it defines.
load_dotenv()
tscc_path_head = os.environ.get('tscc_path_head')
tscc_sample_path = os.environ.get('tscc_sample_path')


In [2]:

# Locate the metadata CSV that sits next to the chat files.
# os.listdir returns entries in arbitrary order, so this assumes the
# directory holds exactly one .csv file.
csv_files = [i for i in os.listdir(tscc_path_head) if i.endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(f"No .csv metadata file found in {tscc_path_head!r}")
total_csv = csv_files[0]

# One sample chat, read only so its columns can be reused later.
data = pd.read_csv(tscc_sample_path, sep = '\t')

# os.path.join works whether or not tscc_path_head ends with a path separator.
meta_data = pd.read_csv(os.path.join(tscc_path_head, total_csv))
meta_data = meta_data[['filename', 'teacher', 'student', 'n.turns','n.words', 'student.cefr.level', 'student.L1']]


In [3]:
chat_filenames = [i for i in os.listdir(tscc_path_head) if i.endswith(".tsv")]
doc_cols =data.columns

# Collect every chat first and concatenate once: calling pd.concat inside the
# loop re-copies all previously loaded rows on each iteration.
# The empty frame keeps the sample file's columns in the result, as before.
chat_frames = [pd.DataFrame(columns=doc_cols)]
for chat_file in chat_filenames:
    # os.path.join works whether or not tscc_path_head ends with a separator.
    temp = pd.read_csv(os.path.join(tscc_path_head, chat_file), sep = '\t')
    temp['filename'] = chat_file  # remember which chat each row came from
    chat_frames.append(temp)
total_tscc_data = pd.concat(chat_frames, ignore_index=True)

# Attach the per-chat metadata (teacher, student, CEFR level, L1, ...) by file name.
total_tscc_data = total_tscc_data[['timestamp','role','anonymised', 'seq.type', 'focus', 'filename']].merge(meta_data, on='filename')
# Conversation id = last three characters of the file name before the first '.'.
total_tscc_data['conv.id'] = total_tscc_data['filename'].apply(lambda x: x.split('.')[0][-3:])
total_tscc_data

Unnamed: 0,timestamp,role,anonymised,seq.type,focus,filename,teacher,student,n.turns,n.words,student.cefr.level,student.L1,conv.id
0,2020-10-22T08:31:24+00:00+00:00+00:00,student,hi,opening,,teacherstudentchat00125.tsv,teacher008,student009,222,1171,B2,Ukrainian,125
1,2020-10-22T08:31:39+00:00+00:00+00:00,teacher,Hi <STUDENT>!,,,teacherstudentchat00125.tsv,teacher008,student009,222,1171,B2,Ukrainian,125
2,2020-10-22T08:31:45+00:00+00:00+00:00,teacher,How are you?,topic opening,,teacherstudentchat00125.tsv,teacher008,student009,222,1171,B2,Ukrainian,125
3,2020-10-22T08:31:54+00:00+00:00+00:00,student,I am fine,,,teacherstudentchat00125.tsv,teacher008,student009,222,1171,B2,Ukrainian,125
4,2020-10-22T08:31:59+00:00+00:00+00:00,student,and you?,,,teacherstudentchat00125.tsv,teacher008,student009,222,1171,B2,Ukrainian,125
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41467,2020-11-03 14:32:29,teacher,"Anyway, I'll let you go, have a relaxing after...",closing,,teacherstudentchat00128.tsv,teacher008,student009,200,1356,B2,Ukrainian,128
41468,2020-11-03 14:32:45,teacher,Speak to you on Friday,,,teacherstudentchat00128.tsv,teacher008,student009,200,1356,B2,Ukrainian,128
41469,2020-11-03 14:33:02,student,thank you!,,,teacherstudentchat00128.tsv,teacher008,student009,200,1356,B2,Ukrainian,128
41470,2020-11-03 14:33:08,teacher,Take care :),,,teacherstudentchat00128.tsv,teacher008,student009,200,1356,B2,Ukrainian,128


In [4]:
# `df` is an alias, not a copy: the in-place changes below (new columns,
# sorting) are also visible through `total_tscc_data`.
df = total_tscc_data

# Missing sequence-type labels become '' so they can be string-joined in the aggregation below.
df['seq.type'] = df['seq.type'].fillna('')


# Order the rows by conversation and then by timestamp.
# NOTE(review): the displayed timestamps mix two formats and look like strings,
# so this ordering is presumably lexicographic — TODO confirm.
df.sort_values(by=['conv.id','timestamp'], inplace=True)

# Number the runs of consecutive rows with the same role: within each
# conversation the counter increases every time the role differs from the previous row.
df['turn_id'] = df.groupby('conv.id')['role'].transform(lambda x: (x != x.shift()).cumsum())

# Keep only the rows whose role is 'teacher' or 'student'
teacher_student_df = df[df['role'].isin(['teacher', 'student'])]

# Collapse each run (conv.id, turn_id) into a single row: the messages are
# joined with a newline plus four spaces, the seq.type labels with ', ',
# and every other column keeps the value of the first row in the run.
grouped_df = teacher_student_df.groupby(['conv.id', 'turn_id']).agg({
    'timestamp' : 'first',
    'role' : 'first',
    'anonymised': '\n    '.join,
    'n.turns' : 'first',
    'n.words' : 'first',
    'student.cefr.level' : 'first',
    'student.L1' : 'first',
    'seq.type' : ', '.join,
    'focus' : 'first',

}).reset_index()

# The run counter is only needed for the grouping above
grouped_df.drop(columns=['turn_id'], inplace=True)

# Count the rows separately per conversation and role (0-based), so the k-th
# teacher row and the k-th student row of a conversation share the same turn_number.
grouped_df['turn_number'] = grouped_df.groupby(['conv.id','role']).cumcount() 

# Reset the index again (it was already reset right after the aggregation)
grouped_df.reset_index(drop=True, inplace=True)

# Store the conversation id as an integer, then drop the columns that are no longer needed
grouped_df['conversation_id'] = grouped_df['conv.id'].apply(int)
grouped_df= grouped_df.drop(columns=['timestamp', 'conv.id'])
grouped_df

Unnamed: 0,role,anonymised,n.turns,n.words,student.cefr.level,student.L1,seq.type,focus,turn_number,conversation_id
0,teacher,"Hi <STUDENT>, hope I didn't get you up too early!",92,1067,B1,Japanese,opening,,0,2
1,student,"Don't worry, my exam is on next Saturday, so I...",92,1067,B1,Japanese,,,0,2
2,teacher,"Ah OK, so good practice then...is that an IELT...",92,1067,B1,Japanese,topic opening,,1,2
3,student,Exactly.,92,1067,B1,Japanese,,,1,2
4,teacher,I've lost track of how many you've done,92,1067,B1,Japanese,,,2,2
...,...,...,...,...,...,...,...,...,...,...
25835,teacher,"You're very welcome, and thank you!",183,1318,B2,"Russian,Ukrainian",,,54,261
25836,student,xx,183,1318,B2,"Russian,Ukrainian",,,55,261
25837,teacher,"Take care, and speak to you later! xx",183,1318,B2,"Russian,Ukrainian",,,55,261
25838,student,you to!! have a good day!\n bye bye xx,183,1318,B2,"Russian,Ukrainian",",",,56,261


In [5]:
## combining the turns
# Prefix each row's text with its upper-cased role followed by ':' and an indented new line.
speaker_tag = grouped_df['role'].str.upper()
grouped_df['text'] = speaker_tag + ':\n    ' + grouped_df['anonymised']

turn_keys = ['conversation_id', 'turn_number']


def _is_full_exchange(turn_rows):
    """Return True when a (conversation_id, turn_number) group has exactly two rows."""
    return len(turn_rows) == 2


def _join_distinct(labels):
    """Join the distinct values of a group with ', ', in order of first appearance."""
    return ', '.join(labels.unique())


# Keep only the groups that contain exactly two rows.
grouped_df1 = grouped_df.groupby(turn_keys)
filtered_df = grouped_df1.filter(_is_full_exchange)

# Merge the two rows of every kept group into one record.
result_df = (
    filtered_df
    .groupby(turn_keys)
    .agg({
        'text': '\n\n'.join,
        'seq.type': _join_distinct,
        'student.cefr.level': 'first',
        'student.L1': 'first',
    })
    .reset_index()
)

result_df

Unnamed: 0,conversation_id,turn_number,text,seq.type,student.cefr.level,student.L1
0,2,0,"TEACHER:\n Hi <STUDENT>, hope I didn't get ...","opening,",B1,Japanese
1,2,1,"TEACHER:\n Ah OK, so good practice then...i...","topic opening,",B1,Japanese
2,2,2,TEACHER:\n I've lost track of how many you'...,", topic development",B1,Japanese
3,2,3,TEACHER:\n Do you mean get a score you get ...,,B1,Japanese
4,2,4,TEACHER:\n Wow that's really good - well do...,,B1,Japanese
...,...,...,...,...,...,...
12868,261,52,STUDENT:\n if ou don't mind I'll ask you an...,,B2,"Russian,Ukrainian"
12869,261,53,STUDENT:\n thank you\n I really enjoyed ...,", closing, ,",B2,"Russian,Ukrainian"
12870,261,54,STUDENT:\n thank you very very much!\n\nTEA...,,B2,"Russian,Ukrainian"
12871,261,55,"STUDENT:\n xx\n\nTEACHER:\n Take care, a...",,B2,"Russian,Ukrainian"


In [20]:
## doccano data
# The variable is expected to hold a Python list literal of project folder
# names (it used to be passed to eval()).
raw_project_list = os.getenv('doccono_project_list', "")
if not raw_project_list.strip():
    raise ValueError("Environment variable 'doccono_project_list' is not set or is empty")

# ast.literal_eval only accepts literals (lists, strings, numbers, ...), so a
# malformed or tampered .env value cannot execute arbitrary code as eval() would.
projects_to_include = ast.literal_eval(raw_project_list)

len(projects_to_include)

32

In [15]:
# Look up both doccano locations in the environment (None when a variable is unset).
doc_path_head, doc_sample_path = (
    os.getenv(env_name) for env_name in ('doc_path_head', 'doc_sample_path')
)


In [20]:
# Location of the participant -> CEFR JSON file. It can be overridden with the
# 'proficiency_json_path' environment variable; the default is the original
# hard-coded location, so existing setups keep working.
proficiency_path = os.getenv(
    'proficiency_json_path',
    '/Users/mahathi/Dissertation/log/final_user_proficiency.json',
)

# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(proficiency_path, 'r', encoding='utf-8') as file:
    proficiency_by_participant = json.load(file)

# One row per participant: the JSON key goes to 'Participant', its value to 'CEFR'.
prof = pd.DataFrame(list(proficiency_by_participant.items()), columns=['Participant', 'CEFR'])


In [90]:
# to_excel does not create missing folders, so make sure 'data/' exists first.
os.makedirs('data', exist_ok=True)
prof.to_excel('data/proficiency.xlsx', index=False)

In [14]:
## Xingwei project list
# Project folder names, one per line, in the original order.
projects_to_include = [
    'Project_3',
    'Project_4',
    'Project 4 sp1 part2',
    'Project_9',
    'Project_7',
    'Project_0',
    'project_10_sp2_part2',
    'project6_sp1_p1',
    'Project_7_sp1_part1',
    'project0_sp1_part1',
    'project_9_sp1_part1',
    'Project_5_sp1_comparison',
    'Project_2_sp1_part1',
    'Project_6',
    'Project_2_sp1_part2',
    'Project_1',
    'project_10_sp2_part1',
    'Project_8',
    'Project_7_sp1_part2',
    'project3_sp1_part2',
    'Project 1 sp1 part1',
    'project_10_sp1_part2',
    'Project 4 sp1 p1 b',
    'project0_sp1_part2',
    'Project_8_sp1_comparison',
    'project_9_sp1_part2',
    'Project_2',
    'project_10_sp1_part1',
    'Project_5',
    'project3_sp1_part1',
    'Project 1 sp1 part2',
    'project6_sp1_p2',
]

In [17]:
# Local doccano export folder and one sample annotation file inside it.
doc_path_head, doc_sample_path = (
    "/Users/mahathi/Downloads/doccano_output/",
    "/Users/mahathi/Downloads/doccano_output/Project_1/5eea650223990c15b6063ea8.json",
)
# Every entry found in the export folder.
tot_projs = os.listdir(path=doc_path_head)

In [18]:
# Entries of the export folder that are NOT in the list of projects to load.
# The original referenced `projs`, which is never defined in this notebook
# (NameError on a fresh kernel); `projects_to_include` is the list being compared.
set(tot_projs) - set(projects_to_include)

{'.DS_Store',
 'Project 0 sp1 part 1 no comp',
 'Project 0 sp1 part 2 no comp',
 'Project 1 sp1 part 1 no comp',
 'Project 1 sp1 part 2 no comp',
 'Project 3 sp1 part 1 no comp',
 'Project 3 sp1 part 2 no comp',
 'Project 4 sp1 p2 i',
 'Project 4 sp1 part1',
 'project 4 sp1 part1 no comp',
 'project 4 sp1 part2 no comp'}

In [71]:
# Requires `prof` (columns 'Participant' and 'CEFR'); it can also be reloaded
# with: prof = pd.read_excel('data/proficiency.xlsx')
data_path1 = '/Users/mahathi/Downloads/doccano_output/Project_0/5de585e4040f535067ca331e.json'
doc_cols = pd.read_json(doc_sample_path).columns

# Collect all per-participant frames and concatenate once after the loops:
# calling pd.concat inside the loop re-copies all previously loaded rows each time.
# The empty frame keeps the sample file's columns in the result, as before.
annotation_frames = [pd.DataFrame(columns=doc_cols)]
for proj in projects_to_include:
    # os.path.join works whether or not doc_path_head ends with a separator.
    proj_dir = os.path.join(doc_path_head, proj)
    json_files = [i for i in os.listdir(proj_dir) if i.endswith('.json')]
    for json_file in json_files:
        # The file name without its extension is the participant id.
        participant_id = json_file.split('.')[0]
        # CEFR entries recorded for this participant (empty Series when unknown).
        level = prof[prof['Participant']==participant_id]['CEFR']
        temp = pd.read_json(os.path.join(proj_dir, json_file))
        value_counts = temp['conversation_id'].value_counts()
        # Conversations '190' and '777' are dropped unless this file holds
        # at least 12 rows of them.
        # NOTE(review): the meaning of these two ids and of the threshold is not documented here.
        for conv in ('190', '777'):
            if value_counts.get(conv, 0) < 12:
                temp = temp[temp['conversation_id']!=conv]
        temp['participant'] = [participant_id]*len(temp)
        # Every row stores the participant's CEFR values as an array.
        temp['level'] = [level.values]*len(temp)
        annotation_frames.append(temp)
total_annotated_data = pd.concat(annotation_frames, ignore_index=True)

In [72]:
def get_int_val(label_data):
    """Return the last character of the second label, or "None" if there is no second label.

    For example, ["EXP INT 0", "INT 3"] gives "3", while a sequence with
    fewer than two labels gives the string "None".
    """
    has_second_label = len(label_data) > 1
    return label_data[1][-1] if has_second_label else "None"

def remove_extra_space(text):
    """Replace each run of four newlines with two, then drop leading newlines."""
    quadruple_break, double_break = '\n\n\n\n', '\n\n'
    return re.sub(quadruple_break, double_break, text).lstrip('\n')

def pre_process(conv_string, label_string):
    """Remove the annotation-tool banner lines from a conversation text.

    Args:
        conv_string: Text of one annotated item.
        label_string: Not used; kept so existing callers keep working.

    Returns:
        ``conv_string`` with every occurrence of the known banners removed.
    """
    # All banners are fixed text, so plain string replacement is enough. The
    # original wrote them as regex patterns with "\(" and "\?" inside non-raw
    # strings, which are invalid escape sequences in Python.
    # The removal order is the same as in the original implementation.
    banners = (
        '===========================Separation Line=============================',
        "========Rate if this student finds the teacher interesting (please don't use your own preferences)========",
        "========Rate if this teacher finds the student interesting (please don't use your own preferences)========",
        "========You are the student, please rate the teacher========",
        "========You are the teacher, please rate the student========",
        "\n\n================= No alternative available, please choose a comparison label randomly =================",
        "========You are assigned as the student, please rate the teacher========",
        "========You are assigned as the teacher, please rate the student========",
    )

    res = conv_string
    for banner in banners:
        res = res.replace(banner, "")
    return res

def pre_process_alt(conv_string, label_string):
    """Cut a conversation text at the "alternative response" banner.

    Args:
        conv_string: Text of one annotated item.
        label_string: Not used; kept so existing callers keep working.

    Returns:
        The text before the first banner with leading and trailing newlines
        removed, or ``conv_string`` unchanged when the banner is absent.
    """
    # Raw string: "\?" is a regex escape, not a Python string escape.
    alternative_interesting = r"================= Is the following alternative response more interesting\? ================="

    # Split with the same case-insensitive matching that detects the banner.
    # The original searched case-insensitively but split case-sensitively, so
    # a banner written in a different case was detected and then never cut.
    parts = re.split(alternative_interesting, conv_string, maxsplit=1, flags=re.IGNORECASE)
    if len(parts) == 1:
        # No banner: return the text untouched.
        return conv_string
    return parts[0].strip('\n')

# Drop the rows whose conversation id is the string 'None'. The explicit
# .copy() makes the result an independent frame, so the column assignments
# below are not chained assignments onto a filtered slice.
total_annotated_data = total_annotated_data[total_annotated_data['conversation_id']!='None'].copy()
# get_int_val returns the string "None" for rows with fewer than two labels;
# astype(int) raises ValueError on such rows.
total_annotated_data['human_int'] = total_annotated_data['label'].apply(get_int_val).astype(int)
total_annotated_data['conversation_id'] = total_annotated_data['conversation_id'].astype(int)

In [73]:
# Keep only the columns used from here on.
kept_columns = ['id', 'conversation_id', 'text', 'participant','level','label','human_int']
total_annotated_data = total_annotated_data[kept_columns]

# Preserve the untouched text next to the cleaned version.
total_annotated_data['text0'] = total_annotated_data['text']


def _strip_banners(row):
    """Row-wise wrapper around pre_process."""
    return pre_process(row.text, row.label)


def _cut_alternative(row):
    """Row-wise wrapper around pre_process_alt."""
    return pre_process_alt(row.text, row.label)


# Clean the text in three passes: banners, alternative part, extra blank lines.
total_annotated_data['text'] = total_annotated_data.apply(_strip_banners, axis=1)
text_without_alternative = total_annotated_data.apply(_cut_alternative, axis=1)
total_annotated_data['text'] = text_without_alternative.apply(remove_extra_space)

total_annotated_data['conversation_id'] = total_annotated_data['conversation_id'].apply(int)

# First stored level of the annotator, or "Unknown" when none is stored.
annotator_levels = total_annotated_data['level'].apply(lambda x: x[0] if len(x)>0 else "Unknown")
total_annotated_data['annotator_level'] = annotator_levels
total_annotated_data = total_annotated_data.drop(columns=['level'])
total_annotated_data


Unnamed: 0,id,conversation_id,text,participant,label,human_int,text0,annotator_level
0,36289,212,"STUDENT:\n Sorry, it's my name as usual )\n...",6101885b0f3e7fdafc8fd5d1,"[EXP INT 0, INT 0]",0,"STUDENT:\n Sorry, it's my name as usual )\n...",Unknown
1,36290,212,"STUDENT:\n Fine, but quite (razbitaya) )\n\...",6101885b0f3e7fdafc8fd5d1,"[EXP INT 1, INT 1]",1,"STUDENT:\n Fine, but quite (razbitaya) )\n=...",Unknown
2,36291,212,STUDENT:\n I don't know if it's the weather...,6101885b0f3e7fdafc8fd5d1,"[EXP INT 3, INT 3]",3,STUDENT:\n I don't know if it's the weather...,Unknown
3,36292,212,"STUDENT:\n I mean I'm OK, but I fell not 10...",6101885b0f3e7fdafc8fd5d1,"[EXP INT 1, INT 1]",1,"STUDENT:\n I mean I'm OK, but I fell not 10...",Unknown
4,36293,212,STUDENT:\n I see! It's so hard to be at hom...,6101885b0f3e7fdafc8fd5d1,"[EXP INT 2, INT 2]",2,STUDENT:\n I see! It's so hard to be at hom...,Unknown
...,...,...,...,...,...,...,...,...
20471,65706,166,"TEACHER:\n Yes, historical is about history...",60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 1, The alternative is worse]",1,"TEACHER:\n Yes, historical is about history...",C1
20472,65707,166,"TEACHER:\n If we say a city is historic, a ...",60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 2, The alternative is better]",2,"TEACHER:\n If we say a city is historic, a ...",C1
20473,65708,166,"TEACHER:\n Yes, absolutely!\n Is your ho...",60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 2, The alternative is better]",2,"TEACHER:\n Yes, absolutely!\n Is your ho...",C1
20474,65709,166,"TEACHER:\n I see!\n Ok, lovely, I'll let...",60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 2, The alternative is better]",2,"TEACHER:\n I see!\n Ok, lovely, I'll let...",C1


In [74]:
# One row per participant, then the number of participants per level.
one_row_per_participant = total_annotated_data.drop_duplicates(subset=['participant'])
one_row_per_participant['annotator_level'].value_counts()

annotator_level
C1         36
B2         23
Unknown    21
C2         13
A2          3
Name: count, dtype: int64

In [76]:
# Restrict the combined turns to conversations that appear in the annotations.
conv_ids = total_annotated_data['conversation_id'].unique()
is_annotated = result_df['conversation_id'].isin(conv_ids)
result = result_df.loc[is_annotated]
result

Unnamed: 0,conversation_id,turn_number,text,seq.type,student.cefr.level,student.L1
190,7,0,"TEACHER:\n Hi there <STUDENT>, all OK?\n\nS...","opening,",C1,Spanish
191,7,1,"TEACHER:\n Yeah I'm good thanks, just been ...","topic opening, , repair",C1,Spanish
192,7,2,TEACHER:\n Yeah? what did you do? I'll come...,", scaffolding, topic development,",C1,Spanish
193,7,3,TEACHER:\n OK I see ...too bad about the ru...,", , repair",C1,Spanish
194,7,4,TEACHER:\n Yes I realise it's not easy real...,", enquiry",C1,Spanish
...,...,...,...,...,...,...
12711,258,22,STUDENT:\n (I hope I'm not making up an exp...,", scaffolding, scaffolding",C2,Italian
12712,258,23,STUDENT:\n So my reasoning was wrong...ok.....,", enquiry, closing",C2,Italian
12713,258,24,STUDENT:\n Or maybe with money/institutions...,,C2,Italian
12714,258,25,STUDENT:\n 13:59 on my laptop\n\nTEACHER:\n...,", scaffolding, ,",C2,Italian


In [77]:
# Right join: every annotation row is kept; `_merge` records whether a
# matching row (same text and conversation id) was found in `result`.
merged = pd.merge(
    result,
    total_annotated_data,
    how='right',
    on=['text', 'conversation_id'],
    indicator=True,
)
merged

Unnamed: 0,conversation_id,turn_number,text,seq.type,student.cefr.level,student.L1,id,participant,label,human_int,text0,annotator_level,_merge
0,212,0.0,"STUDENT:\n Sorry, it's my name as usual )\n...","opening, , , , , topic opening",B2,"Russian,Ukrainian",36289,6101885b0f3e7fdafc8fd5d1,"[EXP INT 0, INT 0]",0,"STUDENT:\n Sorry, it's my name as usual )\n...",Unknown,both
1,212,1.0,"STUDENT:\n Fine, but quite (razbitaya) )\n\...","non-English, scaffolding",B2,"Russian,Ukrainian",36290,6101885b0f3e7fdafc8fd5d1,"[EXP INT 1, INT 1]",1,"STUDENT:\n Fine, but quite (razbitaya) )\n=...",Unknown,both
2,212,2.0,STUDENT:\n I don't know if it's the weather...,", topic development, ,",B2,"Russian,Ukrainian",36291,6101885b0f3e7fdafc8fd5d1,"[EXP INT 3, INT 3]",3,STUDENT:\n I don't know if it's the weather...,Unknown,both
3,212,3.0,"STUDENT:\n I mean I'm OK, but I fell not 10...",", eliciting",B2,"Russian,Ukrainian",36292,6101885b0f3e7fdafc8fd5d1,"[EXP INT 1, INT 1]",1,"STUDENT:\n I mean I'm OK, but I fell not 10...",Unknown,both
4,212,4.0,STUDENT:\n I see! It's so hard to be at hom...,", , scaffolding,",B2,"Russian,Ukrainian",36293,6101885b0f3e7fdafc8fd5d1,"[EXP INT 2, INT 2]",2,STUDENT:\n I see! It's so hard to be at hom...,Unknown,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19963,166,,"TEACHER:\n Yes, historical is about history...",,,,65706,60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 1, The alternative is worse]",1,"TEACHER:\n Yes, historical is about history...",C1,right_only
19964,166,,"TEACHER:\n If we say a city is historic, a ...",,,,65707,60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 2, The alternative is better]",2,"TEACHER:\n If we say a city is historic, a ...",C1,right_only
19965,166,,"TEACHER:\n Yes, absolutely!\n Is your ho...",,,,65708,60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 2, The alternative is better]",2,"TEACHER:\n Yes, absolutely!\n Is your ho...",C1,right_only
19966,166,,"TEACHER:\n I see!\n Ok, lovely, I'll let...",,,,65709,60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 2, The alternative is better]",2,"TEACHER:\n I see!\n Ok, lovely, I'll let...",C1,right_only


In [88]:
def _fill_within_group(values):
    """Forward-fill, then backward-fill, the missing values of one group."""
    return values.ffill().bfill()


# Fill missing student attributes from other rows of the same conversation.
for student_col in ('student.cefr.level', 'student.L1'):
    merged[student_col] = (
        merged.groupby('conversation_id')[student_col].transform(_fill_within_group)
    )

# Turn seq.type into strings; values equal to 'nan' become ''.
merged['seq.type'] = merged['seq.type'].astype(str).replace('nan', '')
def process_seqtype(seq_string):
    """Split a comma-separated label string into a list of distinct labels.

    Args:
        seq_string: Labels separated by commas, e.g. "opening, , topic opening".

    Returns:
        The non-empty labels, stripped of surrounding whitespace, each kept
        once and in order of first appearance. An empty input gives ``[]``.
    """
    if not seq_string:
        return []
    labels = (item.strip() for item in seq_string.split(","))
    # dict.fromkeys de-duplicates while keeping insertion order. The original
    # list(set(...)) returned the labels in a different order from one
    # kernel run to the next, because string hashing is randomised.
    return list(dict.fromkeys(label for label in labels if label))

# Convert every seq.type string into its list of labels.
seq_type_lists = merged['seq.type'].apply(process_seqtype)
merged['seq.type'] = seq_type_lists
merged

Unnamed: 0,conversation_id,turn_number,text,seq.type,student.cefr.level,student.L1,id,participant,label,human_int,text0,annotator_level,_merge
0,212,0.0,"STUDENT:\n Sorry, it's my name as usual )\n...","[topic opening, opening]",B2,"Russian,Ukrainian",36289,6101885b0f3e7fdafc8fd5d1,"[EXP INT 0, INT 0]",0,"STUDENT:\n Sorry, it's my name as usual )\n...",Unknown,both
1,212,1.0,"STUDENT:\n Fine, but quite (razbitaya) )\n\...","[non-English, scaffolding]",B2,"Russian,Ukrainian",36290,6101885b0f3e7fdafc8fd5d1,"[EXP INT 1, INT 1]",1,"STUDENT:\n Fine, but quite (razbitaya) )\n=...",Unknown,both
2,212,2.0,STUDENT:\n I don't know if it's the weather...,[topic development],B2,"Russian,Ukrainian",36291,6101885b0f3e7fdafc8fd5d1,"[EXP INT 3, INT 3]",3,STUDENT:\n I don't know if it's the weather...,Unknown,both
3,212,3.0,"STUDENT:\n I mean I'm OK, but I fell not 10...",[eliciting],B2,"Russian,Ukrainian",36292,6101885b0f3e7fdafc8fd5d1,"[EXP INT 1, INT 1]",1,"STUDENT:\n I mean I'm OK, but I fell not 10...",Unknown,both
4,212,4.0,STUDENT:\n I see! It's so hard to be at hom...,[scaffolding],B2,"Russian,Ukrainian",36293,6101885b0f3e7fdafc8fd5d1,"[EXP INT 2, INT 2]",2,STUDENT:\n I see! It's so hard to be at hom...,Unknown,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19963,166,,"TEACHER:\n Yes, historical is about history...",[],B2,Mandarin Chinese,65706,60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 1, The alternative is worse]",1,"TEACHER:\n Yes, historical is about history...",C1,right_only
19964,166,,"TEACHER:\n If we say a city is historic, a ...",[],B2,Mandarin Chinese,65707,60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 2, The alternative is better]",2,"TEACHER:\n If we say a city is historic, a ...",C1,right_only
19965,166,,"TEACHER:\n Yes, absolutely!\n Is your ho...",[],B2,Mandarin Chinese,65708,60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 2, The alternative is better]",2,"TEACHER:\n Yes, absolutely!\n Is your ho...",C1,right_only
19966,166,,"TEACHER:\n I see!\n Ok, lovely, I'll let...",[],B2,Mandarin Chinese,65709,60317fc2c8f6320ecde4bcb3,"[EXP INT 2, INT 2, The alternative is better]",2,"TEACHER:\n I see!\n Ok, lovely, I'll let...",C1,right_only


In [91]:
# to_excel does not create missing folders, so make sure 'data/' exists first.
os.makedirs('data', exist_ok=True)
merged.to_excel('data/tscc_with_doccono_annotations.xlsx', index=False)

In [55]:
# NOTE(review): leftover scratch arithmetic — the notebook does not say what
# these two numbers stand for; document the intent or delete this cell.
19968/3

6656.0