# Script for preprocessing the corpus of the CSL dataset

In [1]:
# Read the ISL dictionary file. Each element of `words` is one raw line of the
# file (trailing newline included). The `with` block closes the handle as soon
# as the lines have been read.
with open('dictionary.txt',encoding='utf-8') as f:
    words = f.readlines()

## Build isl dictionary

In [40]:
# Map every ISL token to its integer index.
# Each line of `words` is split on whitespace: field 0 is parsed as the integer
# index and field 1 is used as the token; any further fields are ignored.
isl_dictionary = {}
for word in words:
    # str.split() without arguments already discards the trailing newline.
    data = word.split()
    if not data:
        # Blank line (e.g. an empty last line in the file): nothing to parse.
        continue
    index = int(data[0])
    token = data[1]
    isl_dictionary[token] = index
def reverse_dictionary(dictionary):
    """Return a new dict that maps each value of ``dictionary`` back to its key.

    When several keys share the same value, the key that comes last in the
    iteration order of ``dictionary`` wins.
    """
    return {value: key for key, value in dictionary.items()}
# Bare expression: has no effect in the middle of a cell; kept as in the notebook.
isl_dictionary
# Print every index in [0, 500) that no token of the ISL dictionary maps to.
# The values are collected into a set once so that each membership test is
# O(1) instead of rescanning all dictionary values on every iteration.
used_indices = set(isl_dictionary.values())
for k in range(500):
    if k not in used_indices:
        print(k)

## Build csl dictionary and reverse dictionary

In [31]:
# Hand-written replacements for individual tokens. Most entries split a
# compound into smaller tokens; a few ('他人', '星星', '小孩子') substitute a
# single different token instead.
_MANUAL_CUTS = {
    '女朋友': ('女', '朋友'),
    '现实情况': ('现实', '情况'),
    '自由恋爱': ('自由', '恋爱'),
    '扭转局面': ('扭转', '局面'),
    '事业成功': ('事业', '成功'),
    '经验丰富': ('经验', '丰富'),
    '有雨': ('有', '雨'),
    '他人': ('别人',),
    '圆满成功': ('圆满', '成功'),
    '针线': ('针', '线'),
    '星星': ('星',),
    '小孩子': ('小孩儿（儿童、少年）',),
}


def manual_cut(word):
    """Return the list of tokens that should replace ``word``.

    Words listed in the override table are replaced by their hand-written
    tokens; any other word is returned unchanged as a one-element list.
    A fresh list is returned on every call.
    """
    return list(_MANUAL_CUTS.get(word, (word,)))

In [32]:
import jieba
from collections import Counter

def build_csl_dictionary():
    """Build the CSL token-to-index dictionary from ``corpus.txt``.

    Every line of the corpus is split on whitespace and its second field is
    taken as the sentence. Sentences are segmented with ``jieba.cut``; each
    resulting word is passed through ``manual_cut`` and the resulting tokens
    are counted.

    Returns:
        dict: ``'<pad>'``, ``'<bos>'`` and ``'<eos>'`` mapped to 0, 1 and 2,
        followed by every token mapped to ``3 + rank``, where rank 0 is the
        most frequent token. Tokens with equal counts keep the order in which
        they were first seen.

    Raises:
        IndexError: if a corpus line has fewer than two whitespace-separated
            fields.
    """
    # Opened as UTF-8 explicitly (like dictionary.txt) rather than relying on
    # the platform default; the context manager closes the handle.
    with open("corpus.txt", 'r', encoding='utf-8') as annotation_file:
        corpus = [sentence.rstrip('\n').split()[1] for sentence in annotation_file]

    # Count token frequencies over all sentences.
    freq_list = Counter()
    # Byte-order mark that may survive at the start of the decoded text.
    punctuation = ['\ufeff']
    for sentence in corpus:
        tmp_sentence = []
        for word in jieba.cut(sentence):
            if word not in punctuation:
                tmp_sentence.extend(manual_cut(word))
        freq_list.update(tmp_sentence)

    # Build the vocabulary by frequency: the more frequent a word is, the
    # smaller its index. Indices 0-2 are reserved for the special tokens.
    dictionary = {'<pad>': 0, '<bos>': 1, '<eos>': 2}
    for i, (token, _count) in enumerate(freq_list.most_common()):
        dictionary[token] = i + 3
    print("Build CSL dictionary successfully!")
    return dictionary
# Token -> index mapping for the CSL corpus.
csl_dictionary = build_csl_dictionary()
# Index -> token mapping. It is built from csl_dictionary: the previously used
# name `dictionary` only exists as a local variable inside functions, so
# referencing it here raised NameError.
reverse_dict = reverse_dictionary(csl_dictionary)

Build CSL dictionary successfully!


## Update isl_dictionary to handle words with the same meaning

In [38]:
def isIndict(k):
    """Look up ``k`` among the keys of the module-level ``isl_dictionary``.

    Keys are visited in dictionary order and the first one that matches wins:

    * a key equal to ``k`` is returned as is;
    * a key that merely contains ``k`` is also accepted, unless ``k`` is
      '的' or '有'. In that case the pair is printed and ``k`` is added to
      ``isl_dictionary`` with the same index as the containing key.

    Returns:
        The matching key, or ``-1`` when no key matches.
    """
    # Only item assignment is performed on isl_dictionary, so no `global`
    # declaration is needed to reach the module-level dict.
    skip_partial = k in ('的', '有')
    for entry in isl_dictionary:
        if entry == k:
            return entry
        if skip_partial or k not in entry:
            continue
        print(k,' ',entry)
        # Returning right after the insertion avoids resuming iteration over
        # a dictionary whose size has changed.
        isl_dictionary[k] = isl_dictionary[entry]
        return entry
    return -1
# For every CSL token that isIndict can match to an ISL dictionary key, record
# the ISL index of that key under the CSL token.
isl_in_csl_dictionary = {}
for k in csl_dictionary:
    word = isIndict(k)
    if word == -1:
        # No ISL entry equals or contains this token.
        continue
    isl_in_csl_dictionary[k] = isl_dictionary[word]
# isl_in_csl_dictionary = sorted(isl_in_csl_dictionary.items(),key=lambda item:item[1])

他   他（她、它）
外祖父   外祖父（外公）
祖父   祖父（爷爷）
基础   基础（根据）
成功   成效（成功）
她   他（她、它）
儿子   儿子（男孩）
改善   改善（改良）
好   好转
女   女儿（女孩）
医生   医生（大夫）
外祖母   外祖母（外婆）
祖母   祖母（奶奶）
歪   歪（倾向）
颜色   颜色（彩色）
锋利   尖（锋利、尖锐）
放弃   放弃（放）
牙刷   牙刷（刷牙）
没有   没有（无）
去   去（出）
平等   平（平等）


In [None]:
# Generate subset file for validation
# isl_in_csl_dictionary is a dict (token -> ISL index). Iterating it yields the
# token strings, so indexing each item with [1] picked a character of the
# token instead of the index. Take the dict values directly; duplicates are
# removed because several tokens can share one ISL index.
subset_index_list = sorted(set(isl_in_csl_dictionary.values()))

import os
import os.path as osp

def create_path(path):
    """Create the directory ``path`` (including missing parents).

    Nothing is done when ``path`` already exists.
    """
    if osp.exists(path):
        return
    os.makedirs(path)

# Write ../input/subset_val_list.txt: one tab-separated record
#   <video dir>\t<skeleton file>\t<number of frames>\t<class dir>
# for every colour-video directory whose class index is in subset_index_list
# and whose skeleton file exists under skeleton_root.
num_class = 500
color_video_root = "/home/liweijie/SLR_dataset/S500_color_video"
skeleton_root = "/home/liweijie/SLR_dataset/xf500_body_color_txt"

# A set gives O(1) membership tests inside the loop.
subset_indices = set(subset_index_list)

# Class directories are named after their integer label.
color_video_path_list = sorted(os.listdir(color_video_root))
n = len(color_video_path_list)

# The context manager guarantees that the list file is flushed and closed,
# even if an error interrupts the scan.
with open("../input/subset_val_list.txt","w") as val_list:
    for i,color_video_path in enumerate(color_video_path_list):
        print("%d/%d"%(i,n))
        label = color_video_path
        index = int(label)
        if index not in subset_indices:
            # Skip before listing the directory: its content would be discarded.
            continue
        abs_color_video_path = osp.join(color_video_root,color_video_path)
        color_video_list = sorted(os.listdir(abs_color_video_path))
        for color_video in color_video_list:
            abs_color_video = osp.join(abs_color_video_path,color_video)
            if not osp.isdir(abs_color_video):
                continue
            # The first '_'-separated field is 'P' followed by a number; the
            # parsed value is not used further in this cell.
            p = color_video.split('_')
            person = int(p[0].lstrip('P'))
            # Number of entries in the video directory (presumably one file
            # per frame).
            num_frames = len(os.listdir(abs_color_video))
            path = osp.join(color_video_path,color_video)
            if '(' in path:
                continue
            # Replace the trailing "color" with "body.txt". str.rstrip("color")
            # would strip any trailing run of the characters c/o/l/r rather
            # than the suffix itself, so the suffix is removed explicitly.
            suffix = "color"
            stem = path[:-len(suffix)] if path.endswith(suffix) else path
            path_skeleton = stem+"body.txt"
            abs_path_skeleton = osp.join(skeleton_root,path_skeleton)
            if osp.exists(abs_path_skeleton):
                record = "\t".join([path,path_skeleton,str(num_frames),color_video_path])+"\n"
                val_list.write(record)
            else:
                print("The skeleton path %s don't exist"%abs_path_skeleton)

In [None]:
import jieba
# Rebuild csl_dictionary directly from the raw corpus lines: every new token is
# given the next free index in order of first appearance.
# Note that this rebinds the module-level names `words` and `csl_dictionary`.
# Opened as UTF-8 explicitly (like dictionary.txt); the context manager closes
# the handle.
with open('corpus.txt','r',encoding='utf-8') as f:
    sentences = f.readlines()
count = 0
csl_dictionary = {}
punctuation = [' ','\n','\ufeff']
for sentence in sentences:
    words = jieba.cut(sentence.rstrip('\n'))
    for word in words:
        # Membership must be tested against the keys (tokens). Testing against
        # .values() compared a string with the integer indices, was always
        # true, and re-numbered a token every time it occurred again.
        # Tokens containing '0' are skipped (presumably the numeric sentence
        # id at the start of each line; verify against corpus.txt).
        if word in csl_dictionary or word in punctuation or '0' in word:
            continue
        csl_dictionary[word] = count
        count += 1

In [41]:
import os
import spacy
import time
import jieba
import json
import sys
import numpy
# Add the parent directory to the module search path so that the `utils`
# package can be imported.
sys.path.append('..')
from utils.textUtils import *
# NOTE(review): this call passes only the text. The TypeError recorded for this
# cell shows that convert_chinese_to_indices also requires the positional
# arguments `dictionary` and `add_two_end`; which values are intended cannot
# be determined from this notebook.
convert_chinese_to_indices('结果圆满成功﻿')

TypeError: convert_chinese_to_indices() missing 2 required positional arguments: 'dictionary' and 'add_two_end'

In [None]:
# Bare expression as the last statement of the cell: the notebook displays the
# current content of isl_dictionary.
isl_dictionary