# Apache Tika a content analysis toolkit

The Apache Tika™ toolkit detects and extracts metadata and text from over a thousand different file types (such as PPT, XLS, and PDF). All of these file types can be parsed through a single interface, making Tika useful for search engine indexing, content analysis, translation, and much more. 

# Install
- https://cwiki.apache.org/confluence/display/TIKA/TikaServer#TikaServer-InstallationofTikaServer
- https://github.com/apache/tika-docker  (docker pull apache/tika:latest-full)
- https://pypi.org/project/tika/
- https://tika.apache.org/1.10/formats.html
  
# Documentation 
https://cwiki.apache.org/confluence/display/tika



In [40]:
#! pip install langdetect

In [6]:
import tika
from tika import parser
from tika import language 

In [7]:
string_parsed = parser.from_buffer('hola como estas', 'http://localhost:9998/tika')

In [8]:
string_parsed

{'metadata': {'X-TIKA:Parsed-By': ['org.apache.tika.parser.DefaultParser',
   'org.apache.tika.parser.csv.TextAndCSVParser'],
  'X-TIKA:Parsed-By-Full-Set': ['org.apache.tika.parser.DefaultParser',
   'org.apache.tika.parser.csv.TextAndCSVParser'],
  'X-TIKA:content_handler': 'ToTextContentHandler',
  'Content-Encoding': 'ISO-8859-1',
  'X-TIKA:parse_time_millis': '2',
  'X-TIKA:embedded_depth': '0',
  'X-TIKA:detectedEncoding': 'ISO-8859-1',
  'Content-Length': '15',
  'X-TIKA:encodingDetector': 'UniversalEncodingDetector',
  'Content-Type': 'text/plain; charset=ISO-8859-1'},
 'content': '\n\n\n\n\n\n\n\n\n\n\nhola como estas\n',
 'status': 200}

In [9]:
text = string_parsed["content"]
detected_lang = language.from_buffer(text)
detected_lang

'es'

In [10]:
from __future__ import annotations
from enum import Enum

In [11]:
class Langs(Enum):
    Language_English : str = "The quick brown fox jumps over the lazy dog."
    Language_Arabic : str = "صِف خَلقَ خَودِ كَمِثلِ الشَمسِ إِذ بَزَغَت — يَحظى الضَجيعُ بِها نَجلاءَ مِعطارِ"
    Language_Bulgarian : str = "Ах чудна българска земьо полюшвай цъфтящи жита."
    Language_Catalan : str = "Jove xef porti whisky amb quinze glaçons d’hidrogen coi!"
    Language_Croatian : str = "Gojazni đačić s biciklom drži hmelj i finu vatu u džepu nošnje."
    Language_Czech : str = "Nechť již hříšné saxofony ďáblů rozezvučí síň úděsnými tóny waltzu tanga a quickstepu."
    Language_Danish : str = "Quizdeltagerne spiste jordbær med fløde mens cirkusklovnen Walther spillede på xylofon."
    Language_Esperanto : str = "Laŭ Ludoviko Zamenhof bongustas freŝa ĉeĥa manĝaĵo kun spicoj."
    Language_Estonian : str = "Põdur Zagrebi tšellomängija-följetonist Ciqo külmetas kehvas garaažis"
    Language_Finnish : str = "Hyvän lorun sangen pieneksi hyödyksi jäi suomen kirjaimet."
    Language_French : str = "Portez ce vieux whisky au juge blond qui fume"
    Language_German : str = "Franz jagt im komplett verwahrlosten Taxi quer durch Bayern"
    Language_Greek_Modern : str = "Ταχίστη αλώπηξ βαφής ψημένη γη δρασκελίζει υπέρ νωθρού κυνός"
    Language_Hebrew : str = "דג סקרן שט בים מאוכזב ולפתע מצא חברה dg sqrn šṭ bjM mʾwkzb wlptʿ mṣʾ ḥbrh"
    Language_Hindi : str = "ऋषियों को सताने वाले दुष्ट राक्षसों के राजा रावण का सर्वनाश करने वाले विष्णुवतार भगवान श्रीराम अयोध्या के महाराज दशरथ के बड़े सपुत्र थे।"
    Language_Hungarian : str = "Jó foxim és don Quijote húszwattos lámpánál ülve egy pár bűvös cipőt készít"
    Language_Icelandic : str = "Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa."
    Language_Indonesian : str = "Muharjo seorang xenofobia universal yang takut pada warga jazirah contohnya Qatar."
    Language_Irish : str = "D’fhuascail Íosa Úrmhac na hÓighe Beannaithe pór Éava agus Ádhaimh"
    Language_Italian : str = "Quel vituperabile xenofobo zelante assaggia il whisky ed esclama: alleluja!"
    Language_Japanese : str = "いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす（ん）"
    Language_Javanese : str = "꧋ ꦲꦤꦕꦫꦏ꧈ ꦢꦠꦱꦮꦭ꧈ ꦥꦝꦗꦪꦚ꧈ ꦩꦒꦧꦛꦔ꧉ Hanacaraka datasawala padhajayanya magabathanga."
    Language_Korean : str = "키스의 고유조건은 입술끼리 만나야 하고 특별한 기술은 필요치 않다."
    Language_Latvian : str = "Muļķa hipiji mēģina brīvi nogaršot celofāna žņaudzējčūsku."
    Language_Lithuanian : str = "Įlinkdama fechtuotojo špaga sublykčiojusi pragręžė apvalų arbūzą"
    Language_Macedonian : str = "Ѕидарски пејзаж: шугав билмез со чудење џвака ќофте и кељ на туѓ цех."
    Language_Malay : str = "അജവും ആനയും ഐരാവതവും ഗരുഡനും കഠോര സ്വരം പൊഴിക്കെ ഹാരവും ഒഢ്യാണവും ഫാലത്തില്‍ മഞ്ഞളും ഈറന്‍ കേശത്തില്‍ ഔഷധ എണ്ണയുമായി ഋതുമതിയും അനഘയും ഭൂനാഥയുമായ ഉമ ദുഃഖഛവിയോടെ ഇടതു പാദം ഏന്തി ങ്യേയാദൃശം നിര്‍ഝരിയിലെ ചിറ്റലകളെ ഓമനിക്കുമ്പോള്‍ ബാ‍ലയുടെ കണ്‍കളില്‍ നീര്‍ ഊര്‍ന്നു വിങ്ങി."
    Language_Mongolian : str = "Щётканы фермд пийшин цувъя. Бөгж зогсч хэльюү."
    Language_Norwegian : str = "Vår sære Zulu fra badeøya spilte jo whist og quickstep i min taxi."
    Language_Polish : str = "Jeżu klątw spłódź Finom część gry hańb!"
    Language_Portuguese : str = "Um pequeno jabuti xereta viu dez cegonhas felizes."
    Language_Romanian : str = "Muzicologă în bej vând whisky și tequila preț fix."
    Language_Russian : str = "Эх чужак общий съём цен шляп (юфть) – вдрызг!"
    Language_Serbian : str = "Gojazni đačić s biciklom drži hmelj i finu vatu u džepu nošnje."
    Language_Slovak : str = "Kŕdeľ šťastných ďatľov učí pri ústí Váhu mĺkveho koňa obhrýzať kôru a žrať čerstvé mäso."
    Language_Slovenian : str = "Besni dirkač iz formule žuga cehu poštarjev."
    Language_Spanish : str = "José compró una vieja zampoña en Perú. Excusándose Sofía tiró su whisky al desagüe de la banqueta. esto no va a quedar asi, le dijo el bombero"
    Language_Swedish : str = "Flygande bäckasiner söka hwila på mjuka tuvor."
    Language_Thai : str = "เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอยฯ"
    Language_Turkish : str = "Pijamalı hasta yağız şoföre çabucak güvendi."
    Language_Ukrainian : str = "Жебракують філософи при ґанку церкви в Гадячі ще й шатро їхнє п’яне знаємо."
    Language_Urdu : str = "ٹھنڈ میں، ایک قحط زدہ گاؤں سے گذرتے وقت ایک چڑچڑے، باأثر و فارغ شخص کو بعض جل پری نما اژدہے نظر آئے۔"
    Language_Yoruba : str = "Ìwò̩fà ń yò̩ séji tó gbojúmó̩ ó hàn pákànpò̩ gan-an nis̩é̩ rè̩ bó dò̩la."
    Language_Welsh : str = "Parciais fy jac codi baw hud llawn dŵr ger tŷ Mabon."
    

In [12]:
from langdetect import detect

In [13]:
for lang in Langs:
    string_parsed = parser.from_buffer(lang.value, 'http://localhost:9998/tika')
    text = string_parsed["content"]
    detected_lang = language.from_buffer(text)
    detection = detect(lang.value)
    print(f"Language Original: {lang}, Language detected detect: {detection}, tika language detection {detected_lang}")

Language Original: Langs.Language_English, Language detected detect: en, tika language detection en
Language Original: Langs.Language_Arabic, Language detected detect: ar, tika language detection ar
Language Original: Langs.Language_Bulgarian, Language detected detect: bg, tika language detection bg
Language Original: Langs.Language_Catalan, Language detected detect: ca, tika language detection en
Language Original: Langs.Language_Croatian, Language detected detect: hr, tika language detection hr
Language Original: Langs.Language_Czech, Language detected detect: cs, tika language detection cs
Language Original: Langs.Language_Danish, Language detected detect: da, tika language detection da
Language Original: Langs.Language_Esperanto, Language detected detect: sl, tika language detection sl
Language Original: Langs.Language_Estonian, Language detected detect: et, tika language detection et
Language Original: Langs.Language_Finnish, Language detected detect: fi, tika language detection f

In [14]:
pdf = parser.from_file("Parser Source 2.pdf", 'http://localhost:9998/tika')
text = pdf["content"]
detected_lang = language.from_buffer(text)
detected_lang

'en'

In [15]:
pdf = parser.from_file("Parser Source 1.csv", 'http://localhost:9998/tika')
text = pdf["content"]
detected_lang = language.from_buffer(text)
detected_lang

'en'

In [16]:
path= "contribution_form_Juan_23082023.pdf"

In [19]:
headers = {
    "X-Tika-OCRLanguage": "eng+nor"
}
parsed = parser.from_file(path, headers=headers)

In [20]:
parsed.keys()

dict_keys(['metadata', 'content', 'status'])

In [21]:
print(parsed['metadata'])

{'pdf:PDFVersion': '1.7', 'pdf:docinfo:title': '_Contribution_Form.pdf', 'pdf:hasXFA': 'false', 'X-TIKA:Parsed-By-Full-Set': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.parser.pdf.PDFParser', 'org.apache.tika.parser.ocr.TesseractOCRParser'], 'X-TIKA:content_handler': 'ToTextContentHandler', 'pdf:num3DAnnotations': '0', 'dc:format': 'application/pdf; version=1.7', 'access_permission:fill_in_form': 'true', 'pdf:hasCollection': 'false', 'pdf:encrypted': 'false', 'dc:title': '_Contribution_Form.pdf', 'pdf:containsNonEmbeddedFont': 'false', 'pdf:hasMarkedContent': 'false', 'pdf:ocrPageCount': '8', 'access_permission:can_print_faithful': 'true', 'pdf:docinfo:creator': 'User', 'access_permission:extract_for_accessibility': 'true', 'resourceName': "b'contribution_form_Juan_23082023.pdf'", 'pdf:hasAcroFormFields': 'true', 'signature:name': '1014:Client Cert', 'signature:date': '2023-08-23T06:58:29Z', 'X-TIKA:Parsed-By': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.par

In [22]:
print(parsed['content'])







































_Contribution_Form.pdf


ii SIPP

Contribution form

ce) . .]
HT | interactive
[ l 1of 8

nvestor

How to complete this form

Use this form to make a single personal contribution by bank transfer, set up a regular monthly personal
contribution by direct debit or to arrange contributions from your employer or another person (third party).

| Important: All contributions to your SIPP, together with contributions to any other pensions you may have
count towards your Annual Allowance or, where applicable, the Money Purchase Annual Allowance.

o Once completed, please save this form, and send via secure message from your online account. Having sent
the completed form to us, single payments should be made as detailed in Section 5.

Section 1 - Account holder’s details

Title First name(s) Surname
Mr Juan Salvador Huertas Romero
SIPP Account Number National Insurance Number
5 5 8 3 6 7 5 S Z 1 6 2 6 3 8 C
Are you registered for any form of Lifetime Allowa

In [23]:
path= "data/1a697bdT.tif"

In [24]:
headers = {
    "X-Tika-OCRLanguage": "eng+nor"
}
parsed = parser.from_file(path, headers=headers)

In [25]:
print(parsed['metadata'])

{'Exif Image:Page Number': '5 6', 'Exif IFD0:Fill Order': 'Normal', 'X-TIKA:Parsed-By-Full-Set': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.parser.image.TiffParser', 'org.apache.tika.parser.ocr.TesseractOCRParser'], 'X-TIKA:content_handler': 'ToTextContentHandler', 'Exif IFD0:Unknown tag (0x0124)': '4', 'Exif IFD0:X Resolution': '204 dots per inch', 'tiff:ResolutionUnit': 'Inch', 'Exif Image:New Subfile Type': 'Single page of multi-page image', 'Exif Image:Unknown tag (0x0147)': '0', 'Exif IFD0:Strip Byte Counts': '23687 bytes', 'Exif IFD0:New Subfile Type': 'Single page of multi-page image', 'Exif IFD0:Bits Per Sample': '1 bits/component/pixel', 'tiff:BitsPerSample': '1', 'tiff:SamplesPerPixel': '1', 'Exif IFD0:Unknown tag (0x0147)': '0', 'exif:PageCount': '6', 'resourceName': "b'1a697bdT.tif'", 'Exif Image:Unknown tag (0x0124)': '4', 'Exif Image:Rows Per Strip': '1073 rows/strip', 'Exif Image:X Resolution': '204 dots per inch', 'X-TIKA:Parsed-By': ['org.apache.tika.par

In [26]:
print(parsed['content'])



























































- O67“29700.. 12:09 FAX 212 223 3852

MG METAL & COMMODITY

> MCC All Dept a oo1/”006
EPAsM

#5690 B.001/006

suUN.29'2000 11;14 215 569 8912

Egorov, Puginsky, Afanasiev & Marks, LLC

Moscow Philadelphia St. Petersburg
Krasnapresnemskaya, © 1600 Market Street, 34th Floor Nevsky Pr, 30, Sits Ft
Moscow, Russia Philadelphia, PA LOLOF FSA

Yelephone 7-812-325-S234

Telephone 7-O95-8010 Fras 7-822-219-9243

Telephone 1-2IS— 5G9-B9OL
Fax 7-O95-935-B8022

Fax 2-215-569-8922
Please reply to Philadelphia office

FACSIMILE TRANSDMIITIAL SHEET

TO:

Company: MG Metal & Commodity Corporation

Fex No. (212) 223-3882 Date: Tune 29, 2000

FROM: Ely Goldin, Esquire
RE: Base Metats v. NKAZ ‘Total Pages Inctuding This Cover: 6
O Urgent © For Review U Please Reply i Please Comment 0 Original by Man

Please see attached.

CONFIPENTIALITY NOTICE: This facsimile message otay contain privilered and confidential information
intended only for the use o

In [27]:
from tika import detector

In [28]:
path= "contribution_form_Juan_23082023.pdf"
detector.from_file(path)

'application/pdf'

In [29]:
path= "data/1a697bdT.tif"
detector.from_file(path)

'image/tiff'

In [63]:
from tika import config
print(config.getParsers())


{
  "children" : [ {
    "composite" : false,
    "name" : "org.apache.tika.parser.apple.AppleSingleFileParser",
    "supportedTypes" : [ "application/applefile" ],
    "decorated" : false
  }, {
    "composite" : false,
    "name" : "org.apache.tika.parser.apple.PListParser",
    "supportedTypes" : [ "application/x-plist", "application/x-bplist-itunes", "application/x-bplist", "application/x-bplist-memgraph", "application/x-bplist-webarchive" ],
    "decorated" : false
  }, {
    "composite" : false,
    "name" : "org.apache.tika.parser.asm.ClassParser",
    "supportedTypes" : [ "application/java-vm" ],
    "decorated" : false
  }, {
    "composite" : false,
    "name" : "org.apache.tika.parser.audio.AudioParser",
    "supportedTypes" : [ "audio/vnd.wave", "audio/x-wav", "audio/basic", "audio/x-aiff" ],
    "decorated" : false
  }, {
    "composite" : false,
    "name" : "org.apache.tika.parser.audio.MidiParser",
    "supportedTypes" : [ "application/x-midi", "audio/midi" ],
    "deco

In [64]:
print(config.getMimeTypes())


{
  "application/vnd.ms-package.3dmanufacturing-3dmodel+xml" : {
    "supertype" : "application/xml",
    "alias" : [ ]
  },
  "application/vnd.motorola.flexsuite.fis" : {
    "alias" : [ ]
  },
  "application/vnd.osgi.bundle" : {
    "alias" : [ ]
  },
  "application/vnd.ms-htmlhelp" : {
    "parser" : "org.apache.tika.parser.microsoft.chm.ChmParser",
    "alias" : [ ]
  },
  "application/x-font-snf" : {
    "alias" : [ ]
  },
  "application/x-fat-diskimage" : {
    "alias" : [ ]
  },
  "text/x-log" : {
    "supertype" : "text/plain",
    "alias" : [ ]
  },
  "application/vnd.kidspiration" : {
    "alias" : [ ]
  },
  "application/x-iso9660-image" : {
    "alias" : [ ]
  },
  "audio/l8" : {
    "alias" : [ ]
  },
  "model/vnd.gs.gdl" : {
    "alias" : [ ]
  },
  "application/vnd.iptc.g2.knowledgeitem+xml" : {
    "supertype" : "application/xml",
    "alias" : [ ]
  },
  "audio/x-flac" : {
    "parser" : "org.gagravarr.tika.FlacParser",
    "alias" : [ "audio/flac" ]
  },
  "text/x-vbd

In [65]:
print(config.getDetectors())

{"children":[{"composite":false,"name":"org.gagravarr.tika.OggDetector"},{"composite":false,"name":"org.apache.tika.detect.apple.BPListDetector"},{"composite":false,"name":"org.apache.tika.detect.gzip.GZipSpecializationDetector"},{"composite":false,"name":"org.apache.tika.detect.microsoft.POIFSContainerDetector"},{"composite":false,"name":"org.apache.tika.detect.ole.MiscOLEDetector"},{"composite":false,"name":"org.apache.tika.detect.zip.DefaultZipContainerDetector"},{"composite":false,"name":"org.apache.tika.mime.MimeTypes"}],"composite":true,"name":"org.apache.tika.detect.DefaultDetector"}


In [66]:
# #!/usr/bin/env python
# from tika import translate
# print(translate.from_buffer('Hola Buenos dias, estoy en Londres y hoy es Domingo', 'es', 'en'))

In [30]:
path = "data/HERMANAS (Certificado Nacionalidad).jpg"
headers = {
    "X-Tika-OCRLanguage": "eng+esp+nor"
}
parsed = parser.from_file(path, headers=headers)

In [31]:
print(parsed['content'])




































6

Snstitulo Nacional de Cine y Artes Audiovisales

CERTIFICO: que de las constancias del Expediente N° 1059/02/INCAA
caratulado “HERMANAS" surge que la pelicula de largometraje, paso 35mm,
color, titulada “HERMANAS"”, dirigida por Julia SOLOMONOFF, realizada en
caracter de coproduccién por la Productora de la Republica Argentina ZONA
AUDIOVISUAL SRL , inscripta en el Registro de Empresas Productoras del
Organismo bajo el N° 200-653, conjuntamente con la Productora
TORNASOL FILMS S.A. de Espafia; es en virtud de la coproduccién de
nacionalidad argentino-espafiola.

Se extiende el presente, a requerimiento del interesado y para ser presentado
ante quien corresponda, en Buenos Aires a los 16 dias del mes de enero
de 2006.






In [32]:
path = "data/invitation.gif"
headers = {
    "X-Tika-OCRLanguage": "eng+esp+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])





























You're Invited

toour

Costume Party!
ho: rud, Kris
Whe Ant Thy Moen

Where: Amy § Steve Almrud’s Place
08 Lester, Houston Texas 77007

From I-10, Exit Shepherd/ Durham, Ge South on Durham, Turn
Right on Blossom, Turn Left on Lester.

From b10, Exit Memorial, Go East on Memorial, Turn Left on
‘Westcott, Turn Right on Blossom, Turn Right on Lester.

When: October Z7, L001 at &:20 PM

Rsvp to: almrud@swhell.net
or TIP.869. F188






In [33]:
path = "data/Parser Source 1.xlsx"
headers = {
    "X-Tika-OCRLanguage": "eng+esp+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])




























Entity Details
	Entity Code	Entity Name (required)	Legal Company Type Type	Status	Registration Number / Tax ID	Incorporation Date	Country	Region / State	Dissolved Date	Historical?	Registered Office Address 	Main Address Line 1
	Text (12)	Text (160)	Text (60)	Text (30)	Text (30)	yyyy-mm-dd			yyyy-mm-dd	TRUE or FALSE	Text (60)	Text (60)
		IRC Holdings	Audit Committee	Active	123456789	2020-08-08	RU	Moscow Oblast		FALSE	15 Red Square, Moscow, 101000, Russia	10 Arbat Street, Moscow, 119019, Russia
		ABC Company	BCA Company	Prior	65545646	2018-07-31	MX	Jalisco	2022-07-20	TRUE	789 Avenida Revolución, Mexico City, CDMX 03840, Mexico	456 Calle Insurgentes, Guadalajara, Jalisco 44100, Mexico
		Yuga Studios	Committee	Active	54646111	2017-04-09	RU	Saint Petersburg		FALSE	23 Nevsky Prospekt, Saint Petersburg, 191186, Russia	78 Bolshaya Morskaya Street, Saint Petersburg, 190000, Russia
		Partner Markets	Committee	Active		2022-08-13	US	California		FALSE	1600 Pennsylvania Av

In [34]:
path = "data/male.wav"
headers = {
    "X-Tika-OCRLanguage": "eng+esp+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])

None


In [35]:
print(parsed['metadata'])

{'X-TIKA:Parsed-By-Full-Set': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.parser.audio.AudioParser'], 'bits': '16', 'resourceName': "b'male.wav'", 'encoding': 'PCM_SIGNED', 'xmpDM:audioSampleRate': '8000', 'channels': '1', 'X-TIKA:Parsed-By': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.parser.audio.AudioParser'], 'X-TIKA:parse_time_millis': '4', 'X-TIKA:embedded_depth': '0', 'Content-Length': '816496', 'xmpDM:audioSampleType': '16Int', 'Content-Type': 'audio/vnd.wave', 'samplerate': '8000.0'}


In [36]:
path = "data/Welcome back to Planet Earth.mp4"
headers = {
    "X-Tika-OCRLanguage": "eng+esp+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])

None


In [37]:
print(parsed['metadata'])

{'X-TIKA:EXCEPTION:warn': 'End of data reached.', 'X-TIKA:Parsed-By-Full-Set': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.parser.mp4.MP4Parser'], 'tiff:ImageLength': '360', 'resourceName': "b'Welcome back to Planet Earth.mp4'", 'dcterms:created': '2024-02-01T06:07:52Z', 'dcterms:modified': '2024-02-01T06:07:52Z', 'xmpDM:audioChannelType': 'Stereo', 'xmpDM:audioSampleRate': '44100', 'X-TIKA:Parsed-By': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.parser.mp4.MP4Parser'], 'X-TIKA:parse_time_millis': '68', 'X-TIKA:embedded_depth': '0', 'Content-Length': '4578531', 'tiff:ImageWidth': '640', 'xmpDM:duration': '89.61', 'Content-Type': 'video/mp4'}


In [38]:
path = "data/cj.pdf"
headers = {
    "X-Tika-OCRLanguage": "eng+esp+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])



































11/14/23, 8:35 PM Clouded Judgement 11.10.23 - by Jamin Ball

https://cloudedjudgement.substack.com/p/clouded-judgement-111023 1/21

Clouded Judgement 11.10.23 - OpenAI
Updates + Datadog Gives the All-Clear?

JAMIN BALL

NOV 10, 2023

2 Share

Every week I’ll provide updates on the latest trends in cloud so�ware companies. Follow along to
stay up to date!

OpenAI Updates

OpenAI had their big developer day this week, and I wanted to call out two key announcements
(and trends): increasing context windows and decreasing costs.

When I think about the monetization of AI (and which “layers” monetize �rst) I’ve always
thought it would follow the below order, with each layer lagging the one that comes before it.

1. Raw silicon (chips like Nvidia bought in large quantities to build out infra to service

upcoming demand).

2. Model providers (OpenAI, Anthropic, etc as companies start building out AI).

35

Type your email... Subscribe

https://substack.com/@c

In [39]:
path = "data/senior_python_developer_nlplogix2_sm.bmp"
headers = {
    "X-Tika-OCRLanguage": "eng+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])































e Excellent analytical skills and strong written and verbal communication skills
Requirements (Nice-to-Haves):

e Experience or exposure to OCR technologies

e Experience or exposure to image processing algorithms

e Experience or exposure to numpy, scipy, opencv and skimage

e Experience or exposure to scikit-learn

e Experience with C#, Java or JavaScript

e Developing, deploying and scaling Docker components

e General exposure and familiarity with Machine Learning technologies

Education:

Bachelor’s degree from four-year college or university, preferred; and five years’
experience in general software development or comparable related experience.

e Experience working with one or more major relational database technology (Oracle,
SQL

Server, MySQL, PostgreSQL)

e Comfortable working in both Windows and Linux based environments

e Ability to work effectively with colleagues

© Ability to work with minimal supervision

¢ Excellent analytical skills and 

In [40]:
path = "data/test.pptx"
headers = {
    "X-Tika-OCRLanguage": "eng+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])























PowerPoint Presentation










image1.jpeg

image2.jpeg

image3.jpeg







































Senior Python Developer
NLP Logix Jacksonville, FL, USA

Benefits Offered

401K, Dental, Life, Medical, Vision
Employment Type
Full-Time

Why Work Here?
“Working for NLP LOGIX will give you a unique opportunity to learn and play an integral role building new and

exciting products.”

Seeking Senior Software Developer for a cutting-edge computer vision product company,
Scribe Fusion (www.scribefusion.com), a division of NLP Logix. The ideal candidate will be
comfortable working with multiple technology stacks and operating systems. Responsibilities
include developing both back-end and front-end components, however the focus will be
primarily on back-end components. Much of the development activities will be focused on new
features and components, however existing product installations need to be maintained as
well. We encourage a collaborative, ego-free en

In [41]:
path = "data/test2.odt"
headers = {
    "X-Tika-OCRLanguage": "eng+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])





































Education:

Bachelor's degree from four-year college or university, preferred; and five years’
experience in generalsoftware development or comparable related experience.

NO H1-b VISAS PLEASE!

About NLP Logix:
NLP LOGIX is a small but fast-growing Data Science firm located in Jacksonville, FL. We build
products and services that leverage the latest in machine learning and statistical technologies.







































e Excellent analytical skills and strong written and verbal communication skills
Requirements (Nice-to-Haves):

e Experience or exposure to OCR technologies

e Experience or exposure to image processing algorithms

e Experience or exposure to numpy, scipy, opencv and skimage

e Experience or exposure to scikit-learn

e Experience with C#, Java or JavaScript

e Developing, deploying and scaling Docker components

e General exposure and familiarity with Machine Learning technologies
Education:

Bachelor's degree from f

In [42]:
path = "data/test.rtf"
headers = {
    "X-Tika-OCRLanguage": "eng+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])













file_0.jpg

file_1.wmf




file_2.jpg

file_3.wmf



file_4.jpg

file_5.wmf



Normal;heading 1;heading 2;heading 3;heading 4;heading 5;heading 6;heading 7;heading 8;heading 9;caption;Title;Subtitle;Strong;Emphasis;Placeholder Text;No Spacing;Light Shading;Light List;Light Grid;Medium Shading 1;Medium Shading 2;Medium List 1;Medium List 2;Medium Grid 1;Medium Grid 2;Medium Grid 3;Dark List;Colorful Shading;Colorful List;Colorful Grid;Light Shading Accent 1;Light List Accent 1;Light Grid Accent 1;Medium Shading 1 Accent 1;Medium Shading 2 Accent 1;Medium List 1 Accent 1;Revision;List Paragraph;Quote;Intense Quote;Medium List 2 Accent 1;Medium Grid 1 Accent 1;Medium Grid 2 Accent 1;Medium Grid 3 Accent 1;Dark List Accent 1;Colorful Shading Accent 1;Colorful List Accent 1;Colorful Grid Accent 1;Light Shading Accent 2;Light List Accent 2;Light Grid Accent 2;Medium Shading 1 Accent 2;Medium Shading 2 Accent 2;Medium List 1 Accent 2;Medium List 2 Accent 2;Medium Grid 1 Accent 2;M

In [44]:
path = "data/test_images.docx"
headers = {
    "X-Tika-OCRLanguage": "eng+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])




































image1.jpeg

image2.jpeg

image3.jpeg






































Senior Python Developer
NLP Logix Jacksonville, FL, USA

Benefits Offered

401K, Dental, Life, Medical, Vision
Employment Type
Full-Time

Why Work Here?
“Working for NLP LOGIX will give you a unique opportunity to learn and play an integral role building new and

exciting products.”

Seeking Senior Software Developer for a cutting-edge computer vision product company,
Scribe Fusion (www.scribefusion.com), a division of NLP Logix. The ideal candidate will be
comfortable working with multiple technology stacks and operating systems. Responsibilities
include developing both back-end and front-end components, however the focus will be
primarily on back-end components. Much of the development activities will be focused on new
features and components, however existing product installations need to be maintained as
well. We encourage a collaborative, ego-free environment with a focus