# Apache Tika: a content analysis toolkit

The Apache Tika™ toolkit detects and extracts metadata and text from over a thousand different file types (such as PPT, XLS, and PDF). All of these file types can be parsed through a single interface, making Tika useful for search engine indexing, content analysis, translation, and much more. 

# Install
- https://cwiki.apache.org/confluence/display/TIKA/TikaServer#TikaServer-InstallationofTikaServer
- https://github.com/apache/tika-docker  (docker pull apache/tika:latest-full)
- https://pypi.org/project/tika/
- https://tika.apache.org/1.10/formats.html
  
# Documentation 
https://cwiki.apache.org/confluence/display/tika



In [None]:
#! pip install langdetect

In [None]:
import tika
from tika import parser
from tika import language 

In [None]:
# Send a short literal string to the Tika endpoint passed as the second
# argument and keep the returned result for the following cells.
string_parsed = parser.from_buffer('hola como estas', 'http://localhost:9998/tika')

In [None]:
# Bare expression: the notebook echoes the parse result for inspection.
string_parsed

In [None]:
# Take the "content" entry of the parse result and pass it to tika's
# language.from_buffer; the bare name on the last line echoes the outcome.
text = string_parsed["content"]
detected_lang = language.from_buffer(text)
detected_lang

In [None]:
from __future__ import annotations
from enum import Enum

In [None]:
class Langs(Enum):
    """Sample sentences, one per language, used to exercise language detection.

    Each member name is ``Language_<Name>`` and each value is a sentence
    written in that language.

    Values must stay unique: ``Enum`` turns a member whose value duplicates an
    earlier one into an alias, and aliases are skipped when iterating over the
    class. Members are deliberately not annotated; the typing specification
    asks that enum members carry no explicit type annotation.
    """

    Language_English = "The quick brown fox jumps over the lazy dog."
    Language_Arabic = "صِف خَلقَ خَودِ كَمِثلِ الشَمسِ إِذ بَزَغَت — يَحظى الضَجيعُ بِها نَجلاءَ مِعطارِ"
    Language_Bulgarian = "Ах чудна българска земьо полюшвай цъфтящи жита."
    Language_Catalan = "Jove xef porti whisky amb quinze glaçons d’hidrogen coi!"
    Language_Croatian = "Gojazni đačić s biciklom drži hmelj i finu vatu u džepu nošnje."
    Language_Czech = "Nechť již hříšné saxofony ďáblů rozezvučí síň úděsnými tóny waltzu tanga a quickstepu."
    Language_Danish = "Quizdeltagerne spiste jordbær med fløde mens cirkusklovnen Walther spillede på xylofon."
    Language_Esperanto = "Laŭ Ludoviko Zamenhof bongustas freŝa ĉeĥa manĝaĵo kun spicoj."
    Language_Estonian = "Põdur Zagrebi tšellomängija-följetonist Ciqo külmetas kehvas garaažis"
    Language_Finnish = "Hyvän lorun sangen pieneksi hyödyksi jäi suomen kirjaimet."
    Language_French = "Portez ce vieux whisky au juge blond qui fume"
    Language_German = "Franz jagt im komplett verwahrlosten Taxi quer durch Bayern"
    Language_Greek_Modern = "Ταχίστη αλώπηξ βαφής ψημένη γη δρασκελίζει υπέρ νωθρού κυνός"
    # The Hebrew sample also carries a Latin transliteration after the Hebrew text.
    Language_Hebrew = "דג סקרן שט בים מאוכזב ולפתע מצא חברה dg sqrn šṭ bjM mʾwkzb wlptʿ mṣʾ ḥbrh"
    Language_Hindi = "ऋषियों को सताने वाले दुष्ट राक्षसों के राजा रावण का सर्वनाश करने वाले विष्णुवतार भगवान श्रीराम अयोध्या के महाराज दशरथ के बड़े सपुत्र थे।"
    Language_Hungarian = "Jó foxim és don Quijote húszwattos lámpánál ülve egy pár bűvös cipőt készít"
    Language_Icelandic = "Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa."
    Language_Indonesian = "Muharjo seorang xenofobia universal yang takut pada warga jazirah contohnya Qatar."
    Language_Irish = "D’fhuascail Íosa Úrmhac na hÓighe Beannaithe pór Éava agus Ádhaimh"
    Language_Italian = "Quel vituperabile xenofobo zelante assaggia il whisky ed esclama: alleluja!"
    Language_Japanese = "いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす（ん）"
    # The Javanese sample holds the sentence in Javanese script followed by Latin script.
    Language_Javanese = "꧋ ꦲꦤꦕꦫꦏ꧈ ꦢꦠꦱꦮꦭ꧈ ꦥꦝꦗꦪꦚ꧈ ꦩꦒꦧꦛꦔ꧉ Hanacaraka datasawala padhajayanya magabathanga."
    Language_Korean = "키스의 고유조건은 입술끼리 만나야 하고 특별한 기술은 필요치 않다."
    Language_Latvian = "Muļķa hipiji mēģina brīvi nogaršot celofāna žņaudzējčūsku."
    Language_Lithuanian = "Įlinkdama fechtuotojo špaga sublykčiojusi pragręžė apvalų arbūzą"
    Language_Macedonian = "Ѕидарски пејзаж: шугав билмез со чудење џвака ќофте и кељ на туѓ цех."
    # Malay sample: first article of the Universal Declaration of Human Rights.
    # The Malayalam-script sentence that used to sit under this name now lives
    # in Language_Malayalam, where its label matches its language.
    Language_Malay = "Semua manusia dilahirkan bebas dan samarata dari segi kemuliaan dan hak-hak. Mereka mempunyai pemikiran dan perasaan hati dan hendaklah bertindak di antara satu sama lain dengan semangat persaudaraan."
    Language_Malayalam = "അജവും ആനയും ഐരാവതവും ഗരുഡനും കഠോര സ്വരം പൊഴിക്കെ ഹാരവും ഒഢ്യാണവും ഫാലത്തില്‍ മഞ്ഞളും ഈറന്‍ കേശത്തില്‍ ഔഷധ എണ്ണയുമായി ഋതുമതിയും അനഘയും ഭൂനാഥയുമായ ഉമ ദുഃഖഛവിയോടെ ഇടതു പാദം ഏന്തി ങ്യേയാദൃശം നിര്‍ഝരിയിലെ ചിറ്റലകളെ ഓമനിക്കുമ്പോള്‍ ബാ‍ലയുടെ കണ്‍കളില്‍ നീര്‍ ഊര്‍ന്നു വിങ്ങി."
    Language_Mongolian = "Щётканы фермд пийшин цувъя. Бөгж зогсч хэльюү."
    Language_Norwegian = "Vår sære Zulu fra badeøya spilte jo whist og quickstep i min taxi."
    Language_Polish = "Jeżu klątw spłódź Finom część gry hańb!"
    Language_Portuguese = "Um pequeno jabuti xereta viu dez cegonhas felizes."
    Language_Romanian = "Muzicologă în bej vând whisky și tequila preț fix."
    Language_Russian = "Эх чужак общий съём цен шляп (юфть) – вдрызг!"
    # Serbian uses its own (Cyrillic) pangram. Reusing the Croatian sentence
    # would make this member an alias of Language_Croatian and drop it from
    # iteration.
    Language_Serbian = "Љубазни фењерџија чађавог лица хоће да ми покаже штос."
    Language_Slovak = "Kŕdeľ šťastných ďatľov učí pri ústí Váhu mĺkveho koňa obhrýzať kôru a žrať čerstvé mäso."
    Language_Slovenian = "Besni dirkač iz formule žuga cehu poštarjev."
    Language_Spanish = "José compró una vieja zampoña en Perú. Excusándose Sofía tiró su whisky al desagüe de la banqueta. esto no va a quedar asi, le dijo el bombero"
    Language_Swedish = "Flygande bäckasiner söka hwila på mjuka tuvor."
    Language_Thai = "เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอยฯ"
    Language_Turkish = "Pijamalı hasta yağız şoföre çabucak güvendi."
    Language_Ukrainian = "Жебракують філософи при ґанку церкви в Гадячі ще й шатро їхнє п’яне знаємо."
    Language_Urdu = "ٹھنڈ میں، ایک قحط زدہ گاؤں سے گذرتے وقت ایک چڑچڑے، باأثر و فارغ شخص کو بعض جل پری نما اژدہے نظر آئے۔"
    Language_Yoruba = "Ìwò̩fà ń yò̩ séji tó gbojúmó̩ ó hàn pákànpò̩ gan-an nis̩é̩ rè̩ bó dò̩la."
    Language_Welsh = "Parciais fy jac codi baw hud llawn dŵr ger tŷ Mabon."
    

In [None]:
from langdetect import detect

In [None]:
# For every sample sentence in Langs:
#   1. send the sentence to the Tika endpoint and take the returned "content",
#   2. run tika's language.from_buffer on that content,
#   3. run langdetect's detect on the raw sentence (not on the Tika content),
#   4. print the enum member next to both results for comparison by eye.
# NOTE: iterating an Enum yields only canonical members; a member whose value
# duplicates an earlier member's value is an alias and is not visited.
for lang in Langs:
    string_parsed = parser.from_buffer(lang.value, 'http://localhost:9998/tika')
    text = string_parsed["content"]
    detected_lang = language.from_buffer(text)
    detection = detect(lang.value)
    print(f"Language Original: {lang}, Language detected detect: {detection}, tika language detection {detected_lang}")

In [None]:
# Parse a PDF file through the explicit Tika endpoint, then detect the
# language of its "content" entry; the last line echoes the result.
pdf = parser.from_file("Parser Source 2.pdf", 'http://localhost:9998/tika')
text = pdf["content"]
detected_lang = language.from_buffer(text)
detected_lang

In [None]:
# Same steps as for the PDF, applied to a CSV file.
# NOTE(review): the result is bound to `pdf` although the input is a .csv, and
# the path has no "data/" prefix unlike the .xlsx parsed later — confirm the
# file location.
pdf = parser.from_file("Parser Source 1.csv", 'http://localhost:9998/tika')
text = pdf["content"]
detected_lang = language.from_buffer(text)
detected_lang

In [None]:
# File to parse in the next cell (relative path).
path= "contribution_form_Juan_23082023.pdf"

In [None]:
# Parse `path` with an extra X-Tika-OCRLanguage request header
# (presumably Tesseract language codes: English + Norwegian — confirm).
# No endpoint is passed here, unlike the earlier from_buffer/from_file calls.
headers = {
    "X-Tika-OCRLanguage": "eng+nor"
}
parsed = parser.from_file(path, headers=headers)

In [None]:
# Echo the keys available in the parse result.
parsed.keys()

In [None]:
# Show the "metadata" entry of the most recent parse result.
print(parsed['metadata'])

In [None]:
#print(parsed['content'])

In [None]:
# Switch to a TIFF image under data/ for the next parse.
path= "data/1a697bdT.tif"

In [None]:
# Parse the file at `path` with the X-Tika-OCRLanguage header set to
# "eng+nor" (presumably English + Norwegian OCR — confirm).
headers = {
    "X-Tika-OCRLanguage": "eng+nor"
}
parsed = parser.from_file(path, headers=headers)

In [None]:
# Show the "metadata" entry of the most recent parse result.
print(parsed['metadata'])

In [None]:
# Show the "content" entry of the most recent parse result.
print(parsed['content'])

In [None]:
from tika import detector

In [None]:
# Run tika's detector on the PDF; the call is the last expression, so the
# notebook echoes whatever detector.from_file returns
# (presumably the detected MIME type — confirm).
path= "contribution_form_Juan_23082023.pdf"
detector.from_file(path)

In [None]:
# Run tika's detector on the TIFF image and echo the returned value
# (presumably the detected MIME type — confirm).
path= "data/1a697bdT.tif"
detector.from_file(path)

In [None]:
# Print what tika's config.getParsers() returns
# (presumably the parsers known to the Tika server — confirm).
from tika import config
print(config.getParsers())


In [None]:
# Print what tika's config.getMimeTypes() returns
# (presumably the MIME types the server recognises — confirm).
print(config.getMimeTypes())


In [None]:
# Print what tika's config.getDetectors() returns
# (presumably the detectors configured on the server — confirm).
print(config.getDetectors())

In [None]:
# #!/usr/bin/env python
# from tika import translate
# print(translate.from_buffer('Hola Buenos dias, estoy en Londres y hoy es Domingo', 'es', 'en'))

In [None]:
# Parse a JPEG image with OCR.
# X-Tika-OCRLanguage carries Tesseract language codes joined with "+".
# Tesseract's code for Spanish is "spa" ("esp" is not a Tesseract code), so
# the header requests English + Spanish + Norwegian as "eng+spa+nor".
path = "data/HERMANAS (Certificado Nacionalidad).jpg"
headers = {
    "X-Tika-OCRLanguage": "eng+spa+nor"
}
parsed = parser.from_file(path, headers=headers)

In [None]:
# Show the "content" entry of the most recent parse result.
print(parsed['content'])

In [None]:
# Parse a GIF image with OCR and print the "content" entry of the result.
# X-Tika-OCRLanguage carries Tesseract language codes joined with "+".
# Tesseract's code for Spanish is "spa" ("esp" is not a Tesseract code).
path = "data/invitation.gif"
headers = {
    "X-Tika-OCRLanguage": "eng+spa+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])

In [None]:
# Parse an Excel workbook and print the "content" entry of the result.
# The OCR language header is sent as well; it carries Tesseract language
# codes joined with "+". Tesseract's code for Spanish is "spa" ("esp" is not
# a Tesseract code).
path = "data/Parser Source 1.xlsx"
headers = {
    "X-Tika-OCRLanguage": "eng+spa+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])

In [None]:
# Parse a WAV audio file and print the "content" entry of the result.
# The OCR language header is kept for parity with the other cells
# (presumably irrelevant for audio — confirm). It carries Tesseract language
# codes joined with "+"; Spanish is "spa" ("esp" is not a Tesseract code).
path = "data/male.wav"
headers = {
    "X-Tika-OCRLanguage": "eng+spa+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])

In [None]:
# Show the "metadata" entry of the most recent parse result.
print(parsed['metadata'])

In [None]:
# Parse an MP4 video file and print the "content" entry of the result.
# The OCR language header is kept for parity with the other cells
# (presumably irrelevant for video — confirm). It carries Tesseract language
# codes joined with "+"; Spanish is "spa" ("esp" is not a Tesseract code).
path = "data/Welcome back to Planet Earth.mp4"
headers = {
    "X-Tika-OCRLanguage": "eng+spa+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])

In [None]:
# Show the "metadata" entry of the most recent parse result.
print(parsed['metadata'])

In [None]:
# Parse a PDF with OCR enabled for three languages and print the "content"
# entry of the result.
# X-Tika-OCRLanguage carries Tesseract language codes joined with "+".
# Tesseract's code for Spanish is "spa" ("esp" is not a Tesseract code).
path = "data/cj.pdf"
headers = {
    "X-Tika-OCRLanguage": "eng+spa+nor"
}
parsed = parser.from_file(path, headers=headers)
print(parsed['content'])