In [7]:
import sys
from PyQt5 import QtWidgets, QtCore, QtGui
from PyQt5.QtWidgets import (QSystemTrayIcon, QApplication, QLabel, QMainWindow, QPushButton, QWidget, 
                             QVBoxLayout, QTextEdit, QDialog, QComboBox, QAction, QLineEdit, QLabel, QInputDialog)
from PyQt5.QtGui import QIcon, QFont
from PyQt5.QtCore import Qt, QCoreApplication, QSettings, QPoint
import tkinter as tk
import PIL.Image
import numpy as np
import cv2
import io
import json
import inspect
import ctypes
from io import BytesIO
import win32clipboard
import glob
import os
from matplotlib import pyplot as plt
from IPython.display import clear_output, Image, display
from google.cloud.vision import types
from google.cloud import vision
import mss
import pyperclip
import mss.tools
import qt_utils
from PyQt5.QtWidgets import QMessageBox
from google.cloud import speech

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/poorv/Downloads/ocr_scripts/keys/direct-outlook-270501-c05c2d97d1c6.json"

In [8]:
def send_text_clipboard(text):
    pyperclip.copy(text)
    spam = pyperclip.paste()
    
def dtlh(path, lh):
    """Detects text in the file."""
    client = vision.ImageAnnotatorClient()
    if type(path) == str:
        with io.open(path, 'rb') as image_file:
            content = image_file.read()
            image = vision.types.Image(content=content)
    else:
        image = types.Image(content=cv2.imencode('.jpg', path)[1].tostring())
        
    response = client.text_detection(
    image=image,
    image_context={"language_hints": lh},
    )
    texts = response.text_annotations
    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))
        
        print('Texts:')

    for text in texts:
        print('\n"{}"'.format(text.description))

        vertices = (['({},{})'.format(vertex.x, vertex.y)
                    for vertex in text.bounding_poly.vertices])

        print('bounds: {}'.format(','.join(vertices)))
    try:
        return texts[0].description
    except IndexError:
        print('no text in image')

In [9]:
class ocrWidget(QtWidgets.QWidget):
#     clicked = QtCore.pyqtSignal(object)
    
    def __init__(self, parent = None, lang_hint = []):
        super().__init__()
        root = tk.Tk()
        screen_width = root.winfo_screenwidth()
        screen_height = root.winfo_screenheight()
        self.setGeometry(0, 0, screen_width, screen_height)
        self.setWindowTitle(' ')
        self.lang_hint = lang_hint
        self.begin = QtCore.QPoint()
        self.end = QtCore.QPoint()
        self.parent = parent
        self.setWindowOpacity(0.3)
        QtWidgets.QApplication.setOverrideCursor(
            QtGui.QCursor(QtCore.Qt.CrossCursor)
        )
        self.setWindowFlags(QtCore.Qt.FramelessWindowHint)
        QtWidgets.QShortcut(
            QtGui.QKeySequence("Escape"), self, activated=self.on_Escape
        )

    def paintEvent(self, event):
        qp = QtGui.QPainter(self)
        qp.setPen(QtGui.QPen(QtGui.QColor('black'), 3))
        qp.setBrush(QtGui.QColor(128, 128, 255, 128))
        qp.drawRect(QtCore.QRect(self.begin, self.end))

    def mousePressEvent(self, event):
        self.begin = event.pos()
        self.end = self.begin
        self.update()
    
    @QtCore.pyqtSlot()
    def on_Escape(self):
        print("main esp exit")
#         self.close()
        self.closeAndReturn()
        

    def mouseMoveEvent(self, event):
        self.end = event.pos()
        self.update()
    
    def closeAndReturn(self):
        self.close()
        self.parent.setWindowOpacity(1.)
        return

    def mouseReleaseEvent(self, event):
        previous = self.parent
        self.close()

        x1 = min(self.begin.x(), self.end.x())
        y1 = min(self.begin.y(), self.end.y())
        x2 = max(self.begin.x(), self.end.x())
        y2 = max(self.begin.y(), self.end.y())
        
        if (y1 == y2 and x1 == x2):
            print("no region selected")
            self.closeAndReturn()
        
        with mss.mss() as sct:
            print("lang_hint:", self.lang_hint)
            # The screen part to capture            
            monitor = {"top": y1, "left":x1, "width": abs(x2 - x1) , "height": abs(y2 - y1)}
            output = "sct-{top}x{left}_{width}x{height}.png".format(**monitor)
            sct_img = sct.grab(monitor)
            try:
                img = PIL.Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX")
                img = cv2.cvtColor(np.array(img), cv2.COLOR_BGR2RGB)
                im_gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
                ret,thresh = cv2.threshold(im_gray,127,255,cv2.THRESH_BINARY_INV)
#                 ocr = dtlh(thresh, ["hi"])
                ocr = dtlh(thresh, self.lang_hint)
                popup = False
                if ocr:
                    send_text_clipboard(ocr)
                    print(ocr)
                    if popup:
                        msg = QMessageBox()
                        msg.setText(ocr)
                        msg.setWindowTitle("Captured Text")
                        msg.setFont(QFont('Arial', 20)) 
                        x = msg.exec_()  # this will show our messagebox
                self.parent.setWindowOpacity(1.)
            except ValueError:
                print("invalid region selection. try again")
                self.closeAndReturn()

In [10]:
lang_hints_dict = {u'Afrikaans - Afrikaans': "af", 
u'Albanian - shqip': "sq", 
u'Arabic - العربية': "ar", 
u'Armenian - Հայ': "hy", 
u'Belorussian - беларускі': "be", 
u'Bulgarian - български': "bg", 
u'Catalan - Català': "ca", 
u'Chinese - 普通话': "zh", 
u'Croatian - Hrvatski': "hr", 
u'Czech - Čeština': "cs", 
u'Danish - Dansk': "da", 
u'Dutch - Nederlands': "nl", 
u'English  - English': "en", 
u'Estonian - Eesti keel': "et", 
u'Filipino - Filipino': "fil (or tl)", 
u'Finnish - Suomi': "fi", 
u'French  - Français': "fr", 
u'German - Deutsch': "de", 
u'Greek - Ελληνικά': "el", 
u'Gujarati - ગુજરાતી': "gu", 
u'Hebrew - עברית': "iw", 
u'Hindi - हिन्दी': "hi", 
u'Hungarian - Magyar': "hu", 
u'Icelandic - Íslenska': "is", 
u'Indonesian - Bahasa Indonesia': "id", 
u'Italian - Italiano': "it", 
u'Japanese - 日本語': "ja", 
u'Kannada - ಕನ್ನಡ': "kn", 
u'Khmer - ភាសាខ្មែរ': "km", 
u'Korean - 한국어': "ko", 
u'Lao - ລາວ': "lo", 
u'Latvian - Latviešu': "lv", 
u'Lithuanian - Lietuvių': "lt", 
u'Macedonian - Македонски': "mk", 
u'Malay - Bahasa Melayu': "ms", 
u'Malayalam - മലയാളം': "ml", 
u'Marathi - मराठी': "mr", 
u'Nepali - नेपाली': "ne", 
u'Norwegian - Norsk': "no", 
u'Persian - فارسی': "fa", 
u'Polish - Polski': "pl", 
u'Portuguese - Português': "pt", 
u'Punjabi - ਪੰਜਾਬੀ': "pa", 
u'Romanian - Română': "ro", 
u'Russian - Русский': "ru", 
u'Russian - Русский (старая орфография)': "ru-PETR1708", 
u'Serbian - Српски': "sr", 
u'Serbian - Српски (латиница)': "sr-Latn", 
u'Slovak - Slovenčina': "sk", 
u'Slovenian - Slovenščina': "sl", 
u'Spanish - Español': "es", 
u'Swedish - Svenska': "sv", 
u'Tamil - தமிழ்': "ta", 
u'Telugu - తెలుగు': "te", 
u'Thai - ไทย': "th", 
u'Turkish - Türkçe': "tr", 
u'Ukrainian - Українська': "uk", 
u'Vietnamese - Tiếng Việt': "vi", 
u'Yiddish - Yiddish': "yi"}

class Central(qt_utils.QMainWindow):
    def __init__(self, parent=None):
        super(Central, self).__init__(parent)
        self.pushButton = QPushButton("OCR")
        self.pushButton.clicked.connect(self.on_pushButton_clicked)
        self.dialogs = list()
        self.combo = QComboBox(self)
        self.populate_dict(self.combo)
        self.codes = set(lang_hints_dict.values())
        self.combo.activated.connect(self.handleActivated)
        self.lang_hint = []    
        self.multiButton = QPushButton("OCR Multiple languages")
        self.multiButton.clicked.connect(self.sd)
        self.selected = []
        layout = QVBoxLayout()
        layout.addWidget(self.combo)
        layout.addWidget(self.pushButton)
        layout.addWidget(self.multiButton)
        widget = QWidget()
        widget.setLayout(layout)
        widget.setFont(QFont('Arial', 11)) 
        self.setCentralWidget(widget)
        self._gui_restore()
    
    def on_pushButton_clicked(self, mult = False):
        if not mult:
            self.lang_hint = [lang_hints_dict.get(str(self.combo.currentText()))]
        dialog = ocrWidget(self, self.lang_hint)
        self.dialogs.append(dialog)
        dialog.show()
        self.setWindowOpacity(0.)
    
    def handleActivated(self, index):
        self.lang_hint = [self.combo.itemData(index)]
        
    def populate_dict(self, combo : QComboBox):
        for key, value in lang_hints_dict.items():
            self.combo.addItem(key, value)
            
    def sd(self):
        info_string = """To enter multiple languages, enter the language code in a comma seperate list in order of importance. For example, 
    if you want to recognize a document with Arabic, English, and French in it, and (Arabic being most and French least important)
    enter ar,en,fr. For the full list of language codes see https://cloud.google.com/vision/docs/languages#supported-langs
    """
        text , ok = QInputDialog.getText(self,'InputDialog', info_string)
        if ok:
            lh = text.replace(" ", "").split(',')
            self.lang_hint.clear()
            for elem in lh:
                if elem not in self.codes:
                    print("the language code {0} cannot be found! check your codes!".format(elem))
                    return
                else:
                    self.lang_hint.append(elem)
            self.on_pushButton_clicked(True)

    
def main():
    app = QtCore.QCoreApplication.instance()
    if app is None:
        app = QApplication(sys.argv)
    main = Central()
    main.show()
    sys.exit(app.exec_())
    
if __name__ == '__main__':
    main()

lang_hint: ['hi']

"। दावण म समुद्र क किनार काफी वषा हाता
वों में\nरहती है। भारत के चार प्रमुख महानगर हैं
ट और कलकत्ता\nपूर्वी तट के मुख्य बंदरगाह
और चण्डीगढ़ भारत के कुछ अन्य\nबड़े शहर हैं
हीं रहा। यातायात के प्रमुख साधन हैं :\nरेलगाडि
सुन्दर मन्दिर और इमारतें हैं, जिन्हें देखने के लिं
"
bounds: (-8,0),(773,0),(773,260),(-8,260)

"।"
bounds: (16,0),(31,0),(31,36),(16,36)

"दावण"
bounds: (46,0),(142,0),(142,36),(46,36)

"म"
bounds: (161,0),(188,0),(188,36),(161,36)

"समुद्र"
bounds: (203,0),(287,0),(287,36),(203,36)

"क"
bounds: (312,0),(339,0),(339,36),(312,36)

"किनार"
bounds: (362,0),(454,0),(454,36),(362,36)

"काफी"
bounds: (479,0),(560,0),(560,36),(479,36)

"वषा"
bounds: (586,0),(636,0),(636,36),(586,36)

"हाता"
bounds: (663,0),(722,0),(722,36),(663,36)

"वों"
bounds: (-7,24),(20,24),(19,73),(-8,73)

"में"
bounds: (38,24),(52,24),(51,73),(37,73)

"\"
bounds: (67,24),(81,24),(80,73),(66,73)

"n"
bounds: (91,25),(105,25),(104,74),(90,74)

"रहती"
bounds: (110,25),(179,26),(178,75),(1

SystemExit: 0

In [None]:

#somehow remeber the favorite langs selected
#create keyboard shorcuts??
#consider logic for setting multuple lang hints (i.e. checkbox) 
#settings menu to set quick access langs i.e. english, spanish, french 

In [22]:
dtlh(r"C:\Users\poorv\Downloads\hindi\१००A\hw4\Pages from Intermediate Hindi Reader (Hindi and English Edition)  _Page_2.png", ["hi"])

'उत्तर से कम । दक्षिण में समुद्र के किनारे काफ़ी वर्षा होती है और पैदावार अच्छी होती\nहै।\nयह\nभारत एक कृषि प्रधान देश है और देश की लगभग ७० प्रतिशत आबादी गाँवों में\nरहती है। भारत के चार प्रमुख महानगर हैं: नई दिल्ली, बम्बई, कलकत्ता और मद्रास ।\nभारत की राजधानी, नई दिल्ली, उत्तर में स्थित है। बम्बई पश्चिमी तट और कलकत्ता\nपूर्वी तट के मुख्य बंदरगाह हैं। मद्रास दक्षिण का सबसे बड़ा शहर है और भी\nसमुद्र\nके\nकिनारे बसा है। बंगलोर, हैदराबाद, लखनऊ, जयपुर और चण्डीगढ़ भारत के कुछ अन्य\nबड़े शहर हैं। सबसे पुराना शहर वाराणसी, या बनारस, गंगा के तट पर बसा है।\nभारत में सफर करना अब बहुत मुश्किल नहीं रहा। यातायात के प्रमुख साधन हैं :\nरेलगाड़ियाँ और सड़कें। आजकल हम काफ़ी शहरों को हवाई जहाज़ से भी जा सकते हैं ।\nभारत में प्राचीन काल के बहुत सुन्दर मन्दिर और इमारतें हैं, जिन्हें देखने के लिए दूर-दूर\nसे लोग आते हैं । अजंता और एलोरा की गुफाएँ और आगरे का ताजमहल तो सारी दुनिया\nमें प्रसिद्ध हैं।\nभारत ने १५ अगस्त १९४७ को इंगलैंड से स्वतंत्रता पाई और २६ जनवरी १९५० को\nइसका संविधान लागू हुआ । भारत के विभिन्

In [10]:
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    from google.cloud import speech

    client = speech.SpeechClient()

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="hi-IN",
    )

    operation = client.long_running_recognize(
        request={"config": config, "audio": audio}
    )

    print("Waiting for operation to complete...")
    response = operation.result(timeout=120)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u"Transcript: {}".format(result.alternatives[0].transcript))
        print("Confidence: {}".format(result.alternatives[0].confidence))


In [13]:

sam = r"C:\Users\poorv\Downloads\hindi\१००A\hw5\01_11_Goswamy.flac"

image.source.image_uri = "gs://boocket_head/audios/01_11_Vijay.flac"

In [27]:
transcribe_file("https://storage.cloud.google.com/boocket_head/audios/01_11_Vijay.flac?authuser=3")

OSError: [Errno 22] Invalid argument: 'https://storage.cloud.google.com/boocket_head/audios/01_11_Vijay.flac?authuser=3'

In [24]:
from google.cloud import speech_v1 as speech


def speech_to_text(config, audio):
    client = speech.SpeechClient()
    response = client.recognize(config, audio)
    print_sentences(response)


def print_sentences(response):
    for result in response.results:
        best_alternative = result.alternatives[0]
        transcript = best_alternative.transcript
        confidence = best_alternative.confidence
        print('-' * 80)
        print(f'Transcript: {transcript}')
        print(f'Confidence: {confidence:.0%}')


config = {'language_code': 'hi-IN'}
audio = {'uri': 'gs://boocket_head/audios/01_11_Vijay.flac'}
speech_to_text(config, audio)

TypeError: recognize() takes from 1 to 2 positional arguments but 3 were given

In [20]:
speech_to_text(c1,c2)

TypeError: recognize() takes from 1 to 2 positional arguments but 3 were given

In [22]:
transcribe_file(c2['uri'])

OSError: [Errno 22] Invalid argument: 'gs://boocket_head/audios/01_11_Vijay.flac'

In [25]:
def google_transcribe(audio_file_name):
    
    file_name = filepath + audio_file_name
    mp3_to_wav(file_name)

    # The name of the audio file to transcribe
    
    frame_rate, channels = frame_rate_channel(file_name)
    
    if channels > 1:
        stereo_to_mono(file_name)
    
    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    
    upload_blob(bucket_name, source_file_name, destination_blob_name)
    
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''
    
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)

    config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=frame_rate,
    language_code='en-US',
    enable_speaker_diarization=True,
    diarization_speaker_count=2) #Changed

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)
    result = response.results[-1] #Changed
    words_info = result.alternatives[0].words #Changed
    
    tag=1 #Changed
    speaker="" #Changed

    for word_info in words_info: #Changed
        if word_info.speaker_tag==tag: #Changed
            speaker=speaker+" "+word_info.word #Changed
        else: #Changed
            transcript += "speaker {}: {}".format(tag,speaker) + '\n' #Changed
            tag=word_info.speaker_tag #Changed
            speaker=""+word_info.word #Changed
 
    transcript += "speaker {}: {}".format(tag,speaker) #Changed
    
    delete_blob(bucket_name, destination_blob_name)
    return transcript

In [26]:
google_transcribe(sam)

NameError: name 'filepath' is not defined

In [None]:
https://storage.cloud.google.com/boocket_head/audios/01_11_Vijay.flac?authuser=3

In [32]:
# [START speech_transcribe_async_gcs]
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech_v1 import enums
    from google.cloud.speech_v1 import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=48000,
        language_code='de-DE')

    operation = client.long_running_recognize(config, audio)

    print('Transcribing your cloud-stored audio file...')
    response = operation.result(timeout=1800)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))
# [END speech_transcribe_async_gcs]

In [33]:
ud = "gs://boocket_head/audios/01_11_Vijay.flac"

In [35]:
transcribe_gcs(ud)

ImportError: cannot import name 'enums' from 'google.cloud.speech_v1' (C:\ProgramData\Anaconda3\lib\site-packages\google\cloud\speech_v1\__init__.py)

In [11]:
def pretty_print(string):
    split = string.split()
    print(split)