Skip to content
Permalink
Browse files

Initial 4CC Automation Script Commit

  • Loading branch information
Jason Kopp
Jason Kopp committed May 30, 2019
1 parent 7206caa commit 78a0f6841fc2f6ba573bd081c26e9fbccfd12bd8
Showing with 277 additions and 0 deletions.
  1. +204 −0 4CC_Automation/4CCAutomationScript.py
  2. +41 −0 4CC_Automation/README.md
  3. +26 −0 CSV/textualcontent.csv
  4. +3 −0 CSV/unlisted.csv
  5. +3 −0 README.md
@@ -0,0 +1,204 @@
import csv, re, os
from datetime import datetime

# Quote characters that may delimit a 4CC in a specification: the straight
# apostrophe plus the left/right curly single quotes.
_QUOTE_CHARS = "'\u2018\u2019"
_CURLY_CHARS = "\u2018\u2019"
# Exactly four characters drawn from letters, digits, space, '+' and '-'.
_CODE_BODY = "[A-Za-z0-9 +-]{4}"
_QUOTED_CODE_RE = re.compile(
    "[" + _QUOTE_CHARS + "]" + _CODE_BODY + "[" + _QUOTE_CHARS + "]")
_CURLY_CODE_RE = re.compile(
    "[" + _CURLY_CHARS + "]" + _CODE_BODY + "[" + _CURLY_CHARS + "]")
# A run of whitespace followed by a hyphen; replaced by one space per line.
_HYPHEN_RE = re.compile(r"\s+-")

_SPECIAL_UNLISTED = "unlisted.csv"
_SPECIAL_TEXTUAL = "textualcontent.csv"


def _extractCodes(pattern, line):
    """Return the codes matched by pattern in line, spaces written as '$20' and quotes removed."""
    return [match.replace(' ', '$20').strip(_QUOTE_CHARS)
            for match in pattern.findall(line)]


def _writeSection(csvWriter, title, rows, fileColumn="CSV FILE", spacer=True):
    """Write one titled report section (optional two blank rows, title, header, rows); no-op if rows is empty."""
    if not rows:
        return
    if spacer:
        csvWriter.writerow(["", "", "", "", "", ""])
        csvWriter.writerow(["", "", "", "", "", ""])
    csvWriter.writerow(["", "", "", title, "", ""])
    csvWriter.writerow(["CODE", "INPUT SPEC", "REG SPEC", fileColumn, "CSV ROW", "SENTENCES"])
    csvWriter.writerows(rows)


def check4CCs(codecFile, newFileName, userSpec, csvDirectory=None):
    """Compare the quoted 4CCs found in a specification text file with the CSV files.

    Every four-character string between single quotes (straight or curly) in
    codecFile is looked up in the ``code`` column of each ``*.csv`` file in
    the CSV directory, and a report is written to newFileName with up to six
    sections: registered to this spec, registered to another spec, listed in
    unlisted.csv for this spec, listed in textualcontent.csv for this spec,
    missing, and codes written with curly quotes that are not listed as
    textual content for this spec.  Summary counts are printed to stdout.

    Args:
        codecFile: Path of the specification text file (read as UTF-8).
        newFileName: Path of the CSV report to create (overwritten).
        userSpec: Specification short name; compared verbatim against the
            lower-cased ``specification`` column, so pass it lower-cased.
        csvDirectory: Directory holding the CSV files.  Defaults to the
            module-level ``CSVFileDirectory``.

    Returns:
        None.
    """
    if csvDirectory is None:
        csvDirectory = CSVFileDirectory

    print("-----------------------------------------------------------")

    # Find 4CCs in Codec File----------------------------------------------------------
    codesStripped = []    # every quoted code occurrence, duplicates included
    sentenceList = []     # (code, line) pairs, one per occurrence
    codesCurly = []       # occurrences delimited by curly quotes only
    sentencesCurly = []   # [code, line] for each curly-quoted occurrence

    # 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM if present.
    with open(codecFile, 'r', encoding='utf-8-sig') as file:
        for line in file:
            # Replace each whitespace-plus-hyphen run with a single space.
            strippedLine = _HYPHEN_RE.sub(' ', line)

            for code in _extractCodes(_QUOTED_CODE_RE, strippedLine):
                codesStripped.append(code)
                sentenceList.append((code, strippedLine))

            for code in _extractCodes(_CURLY_CODE_RE, strippedLine):
                codesCurly.append(code)
                sentencesCurly.append([code, strippedLine])

    fileCodes = sorted(set(codesStripped))

    # Group the source lines by code.  Keeping (code, line) pairs instead of a
    # joined "code - line" string avoids a failed split when the line itself
    # still contains " - ".
    sentencesByCode = {}
    for code, sentence in sentenceList:
        sentencesByCode.setdefault(code, []).append(sentence)

    # Find Identifiers CSV Files----------------------------------------------------------
    csvCodes = set()
    csvRowCount = 0
    entriesByCode = {}    # code -> [(spec, csv file name, printable row), ...]
    # Sorted so the report row order does not depend on directory listing order.
    for fileName in sorted(os.listdir(csvDirectory)):
        if not fileName.endswith(".csv"):
            continue
        csvPath = os.path.join(csvDirectory, fileName)
        with open(csvPath, 'r', encoding='utf-8-sig', newline='') as csvfile:
            csvReader = csv.DictReader(csvfile)
            headers = csvReader.fieldnames
            # fieldnames is None for an empty file.
            if not headers or 'code' not in headers:
                continue
            hasSpec = 'specification' in headers
            csvFile = fileName.lower()
            for row in csvReader:
                csvCode = row['code']
                csvLine = str(list(row.values()))
                if hasSpec:
                    # Short rows yield None for the missing column.
                    csvSpec = (row['specification'] or "").lower()
                else:
                    csvSpec = "No spec"

                csvCodes.add(csvCode)
                csvRowCount += 1
                entriesByCode.setdefault(csvCode, []).append((csvSpec, csvFile, csvLine))

    # Comparing CSV codes and codec codes-------------------------------------------------------------------------------
    registeredSame = []
    registeredDiff = []
    unlistedSpecs = []
    textualSpecs = []
    missingSpecs = []

    for filecode in fileCodes:
        smallSentenceList = sentencesByCode[filecode]
        sentences = str(len(smallSentenceList)) + ": " + str(smallSentenceList).strip("[]")

        reported = False
        for csvSpec, csvFile, csvLine in entriesByCode.get(filecode, []):
            record = [filecode, userSpec, csvSpec, csvFile, csvLine.strip("[]"), sentences]
            sameSpec = userSpec == csvSpec
            if csvFile == _SPECIAL_UNLISTED:
                # Entries in the special files only count for the spec they name.
                if sameSpec:
                    unlistedSpecs.append(record)
                    reported = True
            elif csvFile == _SPECIAL_TEXTUAL:
                if sameSpec:
                    textualSpecs.append(record)
                    reported = True
            else:
                if sameSpec:
                    registeredSame.append(record)
                else:
                    registeredDiff.append(record)
                reported = True

        # Missing: not in any CSV, or only present in the special files for
        # other specs.  Deciding once per code keeps a code from being both
        # listed and missing, and from vanishing when it has several
        # non-matching special-file rows.
        if not reported:
            missingSpecs.append([filecode, userSpec, sentences])

    # Curly Quotes minus the 4ccs we put in textual.csv
    setOfTextCodes = {entry[0] for entry in textualSpecs}
    curlySpecs = set(codesCurly) - setOfTextCodes
    curlySpecsList = [[code, userSpec, sentence]
                      for code, sentence in sentencesCurly
                      if code in curlySpecs]

    # Creating New File-------------------------------------------------------------------------------
    # newline='' is required by the csv module so rows are not written with
    # doubled line endings on Windows.
    with open(newFileName, 'w', encoding='utf-8', newline='') as newFile:
        csvWriter = csv.writer(newFile)

        _writeSection(csvWriter, "REG To This Spec", registeredSame, spacer=False)
        _writeSection(csvWriter, "Reg To Other Spec", registeredDiff)
        _writeSection(csvWriter, "Listed as Unreg", unlistedSpecs)
        _writeSection(csvWriter, "Listed as Text", textualSpecs)
        _writeSection(csvWriter, "Missing",
                      [[code, spec, "", "", "", text] for code, spec, text in missingSpecs])
        _writeSection(csvWriter, "Uses Curly Quotes",
                      [[code, spec, "", "", "", text] for code, spec, text in curlySpecsList],
                      fileColumn="CSV FILE NAME")

    # Test-------------------------------------------------------------------------------
    # "Missing" here counts codes absent from every CSV file, which can be
    # smaller than the Missing section of the report.
    missingCodes = set(fileCodes) - csvCodes
    print("Codes in file: %d" % len(codesStripped))
    print("Sentences in file: %d" % len(sentenceList))
    print("Unique codes in file: %d" % len(fileCodes))
    print("Curly quotes: %d" % len(curlySpecs))
    print("In CSV files: %d" % csvRowCount)
    print("Missing: %d" % len(missingCodes))

    print("-----------------------------------------------------------")
    return
# End check4CCs Function----------------------------------------------------------


# Folder of specification text files to scan, relative to the working directory.
CodecFileDirectory = "specs/"
# Folder of registration CSV files; check4CCs reads this module-level name.
CSVFileDirectory = "../CSV/"
# Each run writes its reports into a fresh, timestamped sub-folder of output/.
outputDirectory = "output/" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S/")
# makedirs also creates the parent "output/" folder on a first run.
os.makedirs(outputDirectory, exist_ok=True)

# Start Loop to check the entire CodecFileDirectory------------------------------------------
# Sorted so the prompts appear in a predictable order.
for fileName1 in sorted(os.listdir(CodecFileDirectory)):
    if fileName1.startswith(("~", ".")):
        # Temporary/hidden files (e.g. editor lock files) are reported and skipped.
        print("%s is not an acceptable file format: " % fileName1)
    elif fileName1.endswith(".txt"):
        codecFile = CodecFileDirectory + fileName1
        # splitext removes only the extension; str.strip(".txt") would also
        # eat leading/trailing '.', 't' and 'x' characters of the name itself.
        newFileName = outputDirectory + os.path.splitext(fileName1)[0] + "-missing.csv"
        while True:
            print(fileName1)
            # Specs are compared in lower case; surrounding blanks are ignored.
            userSpec = input("Please enter a specification: ").strip().lower()
            if userSpec == "":
                print("You didn't type anything.")
            else:
                break
        check4CCs(codecFile, newFileName, userSpec)
# End Loop to check the entire CodecFileDirectory------------------------------------------


# Comment out the above loop and use this code to run only one specification from the specs/ folder at a time:
# specification_to_test = "ISO-IEC_FDIS_14496-12-2018.txt"
#
# codecFile = CodecFileDirectory + specification_to_test
# newFileName = outputDirectory + os.path.splitext(specification_to_test)[0] + "-missing.csv"
# print(specification_to_test)
# userSpec = input("Please enter a specification: ").strip().lower()
# check4CCs(codecFile, newFileName, userSpec)
@@ -0,0 +1,41 @@
# 4CC Automation script
Created by Jason Kopp for David Singer of Apple Inc.
First uploaded to Github: 5/30/2019
Last updated: 5/30/2019

## Description
The 4CCAutomationScript.py was created to automate finding unregistered or mistakenly registered four character codes (4CCs) on the MP4RA website. The script takes a folder of specification files (4CC_Automation/specs/) and the folder of CSV files from the MP4RA (CSV/), finds all the 4CCs in each, and compares what is in the specification files to what is registered in the MP4RA.

## unlisted.csv and textualcontent.csv
We created two new MP4RA CSV files: CSV/unlisted.csv and CSV/textualcontent.csv.
- unlisted.csv should store 4CCs that are purposely unlisted/unregistered. Meaning, we know they are missing from the MP4RA and they should stay that way, at least for now.
- textualcontent.csv should store four character long strings that are mistakenly found by this script but are not 4CCs. This script finds any four character long strings that are between single quotes (The regex being used is: `[\'\‘\’][A-Za-z0-9 +-]{4}[\'\‘\’]`) so mistakes happen (i.e. " to ", "also", etc.).

## Output
The script outputs a CSV file (into 4CC_Automation/output/) for each specification file that is in the specs/ folder with:

1. Code-points that are in an RA CSV file and registered to this spec. (good)
2. Code-points that are in an RA CSV file and registered to a different spec. (probably a citation)
3. Code-points that are in a special CSV file (unlisted.csv), listed as for this spec., that documents deliberately unregistered 4CCs (so they don’t repeatedly show up as 'missing')
4. 4-character patterns that are actually not 4CCs at all, that are in a special CSV file (textualcontent.csv), listed as for this spec., that documents text that looks like a 4CC but is not (e.g. like ’this’).
5. Code-points that seem to be unregistered/missing\*
6. 4CCs that use curly quotes, not straight ones, and are not in the textualcontent.csv file listed against this spec.\*

\* Ideally, there is nothing in section 5 or 6. If there is, it should be easy to take these rows, and make pull requests as needed against the CSV files to register the codes reported in (5). To fix (6), either list them as textual content, or if they are 4CCs in curly quotes, fix the text.

## How to Run
To run this script on your own specifications:
- Export your specifications from Microsoft Word to the "specs" folder as text files, encoded as unicode (UTF-8).
- Run "4CCAutomationScript.py" in the terminal
- For each spec in the “specs” folder, it will prompt you for the short code of that specification (as listed at mp4ra.org). For example:
```
$ python3 4CCAutomationScript.py
-
heif-w18310_23008-12_Ed2_FDIS+COR1_R1.txt
Please enter a specification: heif
-
miaf-wXXXXX-FDIS-MIAF-RB-3.txt
Please enter a specification: miaf
```
- Note: You can also comment-out a loop at the bottom of the script and un-comment the code below it to test one specification at a time.
- The resulting csv files will be saved in “output/[Date_Time]/”
@@ -0,0 +1,26 @@
code,sentence,specification
file,are indicated as being at ‘file’ level,partial
heif,MIME subtype name SHALL be 'heif',heif
$20or$20,may be indexed in a ‘hierarchical’ or ‘daisy-chain’,iso
$20to$20,change from ‘incomplete’ to ‘complete’,iso
also,Y is also required because the ‘also’ has,iso
atom,entry Called ‘atom’ in some specifications,iso
file,are indicated as being at ‘file’ level,iso
font,type 'font' may be used to indicate,iso
name,arbitrary integer ‘name’ for this resource,iso
walk,It must be possible to ‘walk’ the,iso
true,must be ‘true’ for the entire movie,iso
type,speak of a single ‘type’ or ‘brand’ for the file,iso
wrap,in this format do not ‘wrap’ to 0,iso
hint,Protocol ‘hint’ tracks 19,iso
warp,ask the server to ‘warp’ the actual transmission,iso
sync,hat constitutes a valid ‘sync’,iso
1111,tbit(4) reserved = '1111'b,nalu video
size,The ‘size’ field of the container box,nalu video
$20or$20,indexed in a 'hierarchical' or 'daisy-chain',3gpp
3gpp,File extension(s):\t'3gp' and '3gpp' are both declared,3gpp
Name,'Name' should express a high-level location,3gpp
PG13,the corresponding rating entity e.g. 'PG13',3gpp
star,user 'star' rating of the media,3gpp
VXYZ,four character code of the manufacturer of the codec e.g. 'VXYZ',3gpp
subs,sub-sample index “j” in the ‘subs’,iso-text
@@ -0,0 +1,3 @@
code,description,specification
3gLZ,For files with brand '3gLZ' where L is a letter and Z a digit and conforming to version Z.x.y of this specification,3GPP
isoX,The 'isoX' brand (where “X” represents a number or letter) specified in each new edition,CMAF
@@ -71,3 +71,6 @@ respective JavaScript files and then injects them in the Vue router as
Vue components with an associated `path`, i.e. the relative URL of the given
page.

## 4CC_Automation/

See the 4CC_Automation/README.md file for more details about this addition.

0 comments on commit 78a0f68

Please sign in to comment.
You can’t perform that action at this time.