## Processing input datasets
* https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
* https://nlp.cs.washington.edu/xorqa/ 

Sections 1-3 each convert one source dataset (Cornell Movie Dialogs, XOR QA, DailyDialog) into a common JSON Lines format, so that we retain
metadata about the language and source dataset of every example in our combined intermediate dataset.

The last section processes the combined json dataset into prefixes + next-chars and 
saves them as txt files, with each example on a single line. THIS IS WHAT WE WILL USE
FOR TRAINING + PREDICTIONS IN `src/myprogram` 

In [1]:
import os
import pandas as pd
import json

### 1. Cornell Movie Dialogs Corpus
(English-only)

In [5]:
import kagglehub

# Fetch the two corpus files we need (latest dataset version) from Kaggle.
kaggle_dataset = "Cornell-University/movie-dialog-corpus"
metadata_path, movie_lines_path = (
    kagglehub.dataset_download(kaggle_dataset, path=file_name)
    for file_name in ("movie_titles_metadata.tsv", "movie_lines.tsv")
)
# Show where each downloaded file landed on disk.
print(metadata_path)
print(movie_lines_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/Cornell-University/movie-dialog-corpus?dataset_version_number=1&file_name=movie_titles_metadata.tsv...


100%|██████████| 41.0k/41.0k [00:00<00:00, 563kB/s]


Downloading from https://www.kaggle.com/api/v1/datasets/download/Cornell-University/movie-dialog-corpus?dataset_version_number=1&file_name=movie_lines.tsv...


100%|██████████| 7.95M/7.95M [00:00<00:00, 14.8MB/s]

Extracting zip of movie_lines.tsv...





/Users/molly/.cache/kagglehub/datasets/Cornell-University/movie-dialog-corpus/versions/1/movie_titles_metadata.tsv
/Users/molly/.cache/kagglehub/datasets/Cornell-University/movie-dialog-corpus/versions/1/movie_lines.tsv


In [4]:
# kagglehub saves downloads to its local cache by default, so the files were copied into the project's raw_data directory:
# cp /Users/molly/.cache/kagglehub/datasets/Cornell-University/movie-dialog-corpus/versions/1/movie_lines.tsv ../raw_data/movie_lines.tsv
# cp /Users/molly/.cache/kagglehub/datasets/Cornell-University/movie-dialog-corpus/versions/1/movie_titles_metadata.tsv ../raw_data/movie_titles_metadata.tsv

'/Users/molly/.cache/kagglehub/datasets/Cornell-University/movie-dialog-corpus/versions/1/movie_lines.tsv'

In [10]:
# Load the per-movie metadata table (tab-separated, no header row) and name its columns.
metadata_columns = ["movie_id", "movie_title", "movie_year", "imdb_rating", "imdb_votes", "genres"]
movie_metadata = pd.read_csv("../raw_data/movie_titles_metadata.tsv", sep='\t', header=None)
movie_metadata.columns = metadata_columns

# Keep only movies whose genre string mentions 'sci-fi'.
# Non-string genre values (e.g. missing entries) never pass the filter.
scifi_filter = movie_metadata['genres'].apply(
    lambda genres: isinstance(genres, str) and 'sci-fi' in genres
)
scifi_df = movie_metadata.loc[scifi_filter]
print(scifi_df.shape[0])
scifi_df.head()

112


Unnamed: 0,movie_id,movie_title,movie_year,imdb_rating,imdb_votes,genres
3,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
5,m5,the fifth element,1997,7.5,133756.0,['action' 'adventure' 'romance' 'sci-fi' 'thri...
9,m9,the atomic submarine,1959,4.9,513.0,['sci-fi' 'thriller']
12,m12,airplane ii: the sequel,1982,5.8,15210.0,['comedy' 'romance' 'sci-fi']
14,m14,alien nation,1988,6.1,5590.0,['crime' 'drama' 'sci-fi' 'thriller']


In [11]:
# Collect the dialogue lines of the sci-fi movies selected above, keeping only
# lines whose text is a string longer than 10 characters.
line_columns = ["line_id", "character_id", "movie_id", "character_name", "text"]
movie_lines = pd.read_csv("../raw_data/movie_lines.tsv", sep='\t', on_bad_lines='skip', header=None)
movie_lines.columns = line_columns

# Inner join on movie_id restricts the lines to the sci-fi movies.
scifi_movie_lines = scifi_df[['movie_id']].merge(movie_lines, on='movie_id', how='inner')
length_filter = scifi_movie_lines['text'].apply(
    lambda text: isinstance(text, str) and len(text) > 10
)
scifi_movie_lines = scifi_movie_lines.loc[length_filter]

n_lines = len(scifi_movie_lines)
n_movies = scifi_movie_lines['movie_id'].unique().size
print(f"{n_lines} lines, {n_movies} movies")
scifi_movie_lines.head()

38025 lines, 111 movies


Unnamed: 0,movie_id,line_id,character_id,character_name,text
0,m3,L3778,u55,FLOYD,We're trying to get there. I hope we can.
2,m3,L3771,u55,FLOYD,I'm sorry Dr. Smyslov but I'm really not at li...
4,m3,L3757,u55,FLOYD,How did they manage to do that without any com...
6,m3,L3750,u55,FLOYD,Well I suppose they've been having a bit of tr...
8,m3,L3729,u55,FLOYD,She's wonderful.


In [16]:
# currently in use: dump as json
# One record per dialogue line; the whole corpus is English.
json_data = [
  {"text": text, "lang": "en", "source": "cornell_movie_dialogs"}
  for text in scifi_movie_lines["text"]
]

json_data[0:10]

[{'text': "We're trying to get there. I hope we can.",
  'lang': 'en',
  'source': 'cornell_movie_dialogs'},
 {'text': "I'm sorry Dr. Smyslov but I'm really not at liberty to discuss this.",
  'lang': 'en',
  'source': 'cornell_movie_dialogs'},
 {'text': 'How did they manage to do that without any communication?',
  'lang': 'en',
  'source': 'cornell_movie_dialogs'},
 {'text': "Well I suppose they've been having a bit of trouble with some of the equipment.",
  'lang': 'en',
  'source': 'cornell_movie_dialogs'},
 {'text': "She's wonderful.", 'lang': 'en', 'source': 'cornell_movie_dialogs'},
 {'text': "I'm afraid I've only got a few minutes but I'd love to.",
  'lang': 'en',
  'source': 'cornell_movie_dialogs'},
 {'text': "Well how nice to see you again Elena. You're looking wonderful.",
  'lang': 'en',
  'source': 'cornell_movie_dialogs'},
 {'text': 'SPACE STATTION 5 - LOUNGE',
  'lang': 'en',
  'source': 'cornell_movie_dialogs'},
 {'text': "I think I'll have to go out and burn them off

In [20]:
# Write the Cornell sci-fi lines as JSON Lines (one JSON object per line).
# The path is the one the "Combine everything" section reads back in.
out_data_path = "../json_data/scifi_movie_lines.jsonl"
overwrite = False  # set to True to regenerate a file that already exists
if not os.path.exists(out_data_path) or overwrite:
  # Create the output directory first so open() works on a fresh checkout.
  os.makedirs(os.path.dirname(out_data_path), exist_ok=True)
  with open(out_data_path, "w", encoding="utf-8") as f:
    for item in json_data:
        f.write(json.dumps(item) + "\n")
else:
  print("data already loaded, no files changed.")

### 2. XOR QA 
(Arabic, Bengali, Finnish, Japanese, Korean, Russian, and Telugu)

In [27]:
import random

In [21]:
in_data_path = "../raw_data/xor_train_full.jsonl"

# Keep only the question text and language code of each XOR QA record,
# tagged with the dataset it came from.
with open(in_data_path, "r", encoding="utf-8") as f:
   json_data = [
      {"text": record["question"], "lang": record["lang"], "source": "xorqa"}
      for record in map(json.loads, f)
   ]
print(len(json_data))
json_data[0:10]

61360


[{'text': 'উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ্য প্রদর্শন করে ?',
  'lang': 'bn',
  'source': 'xorqa'},
 {'text': 'দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?',
  'lang': 'bn',
  'source': 'xorqa'},
 {'text': 'মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন যুক্তরাষ্ট্রে পুরুষ সমকামী বিবাহ কি আইনত বৈধ ?',
  'lang': 'bn',
  'source': 'xorqa'},
 {'text': 'আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃত্যু হয়েছিল ?',
  'lang': 'bn',
  'source': 'xorqa'},
 {'text': 'বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?',
  'lang': 'bn',
  'source': 'xorqa'},
 {'text': 'প্রথম বিশ্বযুদ্ধের আনুষ্ঠানিক সূচনা কবে হয় ?',
  'lang': 'bn',
  'source': 'xorqa'},
 {'text': 'মানুষের উদ্ভবের ও বিকাশের বিবর্তন তত্ত্বটির প্রথম বিরোধিতা কে করেন ?',
  'lang': 'bn',
  'source': 'xorqa'},
 {'text': 'মানুষের উদ্ভবের ও বিকাশের বিবর্তন তত্ত্বটির উদ্ভাবক কে ?',
  'lang': 'bn',
  'source': 'xorqa'},
 {'text': 'মার্কিন যুক্তরাষ্ট্রের রিপাবলিকান পার্টির প্রথম প্রেসিডেন্ট কে ছিলেন ?',
  'lang': 'bn',
  'source': 'xorqa'}

In [29]:
# Draw a random subset of at most 10000 XOR QA records and write it as JSON Lines
# to the path the "Combine everything" section reads back in.
out_data_path = "../json_data/xor_qa.jsonl"
sample_size = 10000
sample_seed = 0  # fixed seed so the same subset is drawn on every run
# A private RNG leaves the global random state untouched. The size is capped
# because random.sample raises ValueError when asked for more items than exist.
sample_json_data = random.Random(sample_seed).sample(
  json_data, k=min(sample_size, len(json_data))
)
overwrite = True
if not os.path.exists(out_data_path) or overwrite:
  # Create the output directory first so open() works on a fresh checkout.
  os.makedirs(os.path.dirname(out_data_path), exist_ok=True)
  with open(out_data_path, "w", encoding="utf-8") as f:
    for item in sample_json_data:
        f.write(json.dumps(item) + "\n")
else:
  print("data already loaded, no files changed.")

### 3. DailyDialog
(English, Italian, German, and Chinese)

In [None]:
def process_text_to_json(input_file, language, source):
    """Split a dialogue text file into one JSON-ready record per sentence.

    The file content is split on the ``__eou__`` end-of-utterance marker and on
    runs of newlines / periods. Every piece that is non-empty after stripping
    whitespace becomes one record.

    Args:
        input_file: Path of the text file to read (decoded as UTF-8).
        language: Language code stored under the ``'lang'`` key of every record.
        source: Dataset name stored under the ``'source'`` key of every record.

    Returns:
        A list of dicts with the keys ``'text'``, ``'lang'`` and ``'source'``.
    """
    # Imported locally so the function does not rely on a notebook-level `import re`.
    import re

    # Explicit UTF-8: the platform default encoding may fail on non-English text.
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Match the marker together with any surrounding whitespace. Requiring a
    # space on both sides would miss a marker directly followed by a newline
    # and leave a literal "__eou__" inside the extracted text.
    sentences = re.split(r'\s*__eou__\s*|[\n.]+', content)

    json_data = []
    for piece in sentences:
        text = piece.strip()
        if text:
            json_data.append({'text': text, 'lang': language, 'source': source})

    return json_data

In [None]:
# Convert every available DailyDialog language file and write the records as
# JSON Lines. The .jsonl file name is the one the "Combine everything" section reads.
out_data_path = '../json_data/dailydialog.jsonl'
input_files = ['dialogues_text_En.txt', 'dialogues_text_It.txt', 'dialogues_text_Zh.txt', 'dialogues_text_De.txt']
languages = ['en', 'it', 'zh', 'de']
source = 'dailydialog'
# Set here so this cell does not depend on `overwrite` leaking from an earlier cell.
overwrite = True

combined_json = []

for input_file, language in zip(input_files, languages):
  if os.path.exists(input_file):
    combined_json.extend(process_text_to_json(input_file, language, source))
  else:
    # Missing language files are reported and skipped, not treated as an error.
    print('input file ', input_file, 'does not exist')

if not os.path.exists(out_data_path) or overwrite:
  # Create the output directory first so open() works on a fresh checkout.
  os.makedirs(os.path.dirname(out_data_path), exist_ok=True)
  with open(out_data_path, 'w', encoding='utf-8') as f:
    for item in combined_json:
        f.write(json.dumps(item) + "\n")
else:
  print("data already loaded, no files changed.")

### Combine everything into a single json dataset with metadata

In [45]:
# Read the per-source JSON Lines files into one list of records.
source_files = [
   "../json_data/scifi_movie_lines.jsonl",
   "../json_data/xor_qa.jsonl",
   "../json_data/dailydialog.jsonl",
]
all_json_data = []
for source_file in source_files:
   with open(source_file, "r", encoding="utf-8") as f:
      # Skip blank lines so a stray empty line does not crash json.loads.
      all_json_data.extend(json.loads(line) for line in f if line.strip())

# Shuffle with a fixed seed so the train/val/test split is identical on every run.
split_seed = 0
random.Random(split_seed).shuffle(all_json_data)

# 80% train / 10% validation / 10% test.
val_start = int(0.8 * len(all_json_data))
test_start = int(0.9 * len(all_json_data))
train_data = all_json_data[:val_start]
val_data = all_json_data[val_start:test_start]
test_data = all_json_data[test_start:]

for split,data in zip(["train", "val", "test"], [train_data, val_data, test_data]):
   with open(f"../json_data/{split}_data.jsonl", "w", encoding="utf-8") as f:
      for item_to_write in data:
         f.write(json.dumps(item_to_write) + "\n")

## Convert to prefix-suffix form for final dataset


In [46]:
import string
import random

def clean_text(text):
    """Normalize a string: lowercase, drop punctuation, collapse whitespace.

    Every character in ``string.punctuation`` except the period is removed.
    Newlines become spaces, carriage returns and tabs are deleted, and the
    result has no leading/trailing whitespace and only single spaces inside.
    """
    removable = string.punctuation.replace(".", "")
    cleaned = text.lower().translate(str.maketrans("", "", removable))
    for old, new in (("\n", " "), ("\r", ""), ("\t", "")):
        cleaned = cleaned.replace(old, new)
    # split() without arguments discards leading/trailing whitespace and
    # treats any run of whitespace as a single separator.
    return " ".join(cleaned.split())

def make_dataset(json_data_path: str, n_samples: int = 10240, seed=None):
  """Build a random sample of (prefix, next-char) pairs from a JSON Lines file.

  For every text, each prefix of at least 4 characters that is shorter than
  the text is paired with the character that follows it. Every returned
  string ends with a newline so it can be written out one example per line.

  Args:
    json_data_path: JSON Lines file; each line is an object with a "text" key.
    n_samples: Maximum number of pairs to return. If fewer pairs exist, all
      of them are returned (in random order).
    seed: Optional seed for a private random generator. When None, the
      module-level ``random`` state is used.

  Returns:
    A dict with "inputs" (prefixes) and "labels" (next characters); the two
    lists are index-aligned.
  """
  all_lines = []
  with open(json_data_path, "r", encoding="utf-8") as f:
    for line in f:
      item = json.loads(line)
      all_lines.append(item["text"])

  # break lines into prefixes + next-char
  min_len = 3
  all_prefixes = []
  all_next_chars = []
  for line in all_lines:
    # line[:i] is the prefix and line[i] the character that follows it.
    for i in range(min_len + 1, len(line)):
      all_prefixes.append(line[:i] + "\n")
      all_next_chars.append(line[i] + "\n")

  # shuffling and sampling to save training time
  rng = random if seed is None else random.Random(seed)
  # Sample from index 0 (starting at 1 would make the first pair unreachable)
  # and cap the size so small datasets do not raise ValueError.
  sample_size = min(n_samples, len(all_prefixes))
  indices = rng.sample(range(len(all_prefixes)), sample_size)

  return {
    "inputs": [all_prefixes[i] for i in indices],
    "labels": [all_next_chars[i] for i in indices],
  }

def make_fixed_len_dataset(json_data_path: str, n=20, n_samples=10240, seed=None):
    """Build a random sample of (n-char window, next-char) pairs from a JSON Lines file.

    For every text, each window of exactly ``n`` consecutive characters is
    paired with the character that follows it; texts of length ``n`` or less
    contribute nothing. Every returned string ends with a newline so it can be
    written out one example per line.

    Args:
        json_data_path: JSON Lines file; each line is an object with a "text" key.
        n: Number of characters in each input window.
        n_samples: Maximum number of pairs to return. If fewer pairs exist,
            all of them are returned (in random order).
        seed: Optional seed for a private random generator. When None, the
            module-level ``random`` state is used.

    Returns:
        A dict with "inputs" (windows) and "labels" (next characters); the two
        lists are index-aligned.
    """
    all_lines = []
    with open(json_data_path, "r", encoding="utf-8") as f:
        for line in f:
            item = json.loads(line)
            all_lines.append(item["text"])

    all_prefixes = []
    all_next_chars = []
    for line in all_lines:
        # line[i-n:i] is the window and line[i] the character that follows it.
        for i in range(n, len(line)):
            all_prefixes.append(line[i-n:i] + "\n")
            all_next_chars.append(line[i] + "\n")

    # shuffling and sampling to save training time
    rng = random if seed is None else random.Random(seed)
    # Sample from index 0 (starting at 1 would make the first pair unreachable)
    # and cap the size so small datasets do not raise ValueError.
    sample_size = min(n_samples, len(all_prefixes))
    indices = rng.sample(range(len(all_prefixes)), sample_size)

    return {
        "inputs": [all_prefixes[i] for i in indices],
        "labels": [all_next_chars[i] for i in indices],
    }

In [None]:
# JSON Lines file of each split.
train_data_path = "../json_data/train_data.jsonl"
val_data_path = "../json_data/val_data.jsonl"
test_data_path = "../json_data/test_data.jsonl"

# Number of (window, next-char) examples to sample per split.
n_samples = {
  "train": 10240,
  "val": 1024,
  "test": 1024,
}
n = 20  # window length in characters; also part of the output directory name
output_dir = f"../data_prev_{n}"
split_paths = {"train": train_data_path, "val": val_data_path, "test": test_data_path}
for split, json_data_path in split_paths.items():
  split_dir = f"{output_dir}/{split}"
  os.makedirs(split_dir, exist_ok=True)
  data_dict = make_fixed_len_dataset(json_data_path, n=n, n_samples=n_samples[split])
  # Inputs and labels go to separate files; each entry already ends with "\n".
  for kind in ("inputs", "labels"):
    with open(f"{split_dir}/{split}_{kind}.txt", "w", encoding="utf-8") as out_f:
      out_f.writelines(data_dict[kind])