In [2]:
import csv
import os
import tkinter as tk
from tkinter import ttk, filedialog, messagebox

import fasttext
import fasttext.util
import numpy as np

In [3]:
# Load the fastText model once, when this cell runs; identify_language() below
# reads this module-level `model` on every call instead of reloading it.
# NOTE(review): 'lid.176.bin' is a relative path, so it is resolved against the
# notebook's current working directory — confirm the model file is shipped there.
model = fasttext.load_model('lid.176.bin')

def get_text():
    """Run language detection on whatever is currently typed in the entry widget.

    The text is handed to identify_language() with condition 0, the mode in
    which the prediction is written to the result label rather than returned.
    """
    gui_condition = 0
    identify_language(entry.get(), gui_condition)
  
def identify_language(text, condition, confidence_threshold=0.5):
    """Predict the language of ``text`` with the module-level fastText ``model``.

    The model is asked for its three best predictions; the label with the
    highest confidence is treated as the primary prediction.

    Parameters
    ----------
    text : str
        Text to classify. Newlines are replaced by spaces before prediction.
    condition : int
        ``0`` selects GUI mode: the prediction is written to ``result_label``
        and nothing is returned. Any other value selects batch mode: the
        decision is returned as a string and no widget is touched.
    confidence_threshold : float, optional
        Minimum confidence for a single-language decision (default ``0.5``).

    Returns
    -------
    str or None
        ``None`` in GUI mode. In batch mode ``"English"``, ``"Filipino"`` or
        ``"n/a"`` (see ``_classify_for_batch`` for the decision rules).
    """
    # Replace line breaks with a space rather than deleting them, so the words
    # on either side of the break are not fused into one token.
    text = text.replace('\n', ' ')
    predictions = model.predict(text, k=3)
    labels = [label.split('__label__')[1] for label in predictions[0]]
    confidences = [float(confidence) for confidence in predictions[1]]

    # Guard against an empty prediction list: argmax of an empty sequence raises.
    if not labels:
        if condition == 0:
            result_label.config(text="Predicted language: n/a (no prediction available)")
            return None
        return "n/a"

    chosen_index = int(np.argmax(confidences))
    language_code = labels[chosen_index]
    confidence_score = confidences[chosen_index]

    if condition == 0:
        result_label.config(
            text=_format_gui_prediction(
                labels, confidences, language_code, confidence_score, confidence_threshold
            )
        )
        return None

    return _classify_for_batch(
        labels, confidences, language_code, confidence_score, confidence_threshold
    )


def _format_gui_prediction(labels, confidences, language_code, confidence_score,
                           confidence_threshold):
    """Build the result-label text shown in GUI mode."""
    if language_code in ("tl", "en") and confidence_score >= confidence_threshold:
        # Confident Tagalog/English prediction: show only the winning language.
        return f"Predicted language: {language_code} (Confidence: {confidence_score:.2f})"

    # Otherwise list every prediction that was returned (there may be fewer than
    # three, so nothing is indexed by fixed position).
    ranked = ", ".join(
        f"{label} (Confidence: {confidence:.2f})"
        for label, confidence in zip(labels, confidences)
    )
    return f"Predicted languages: {ranked}"


def _classify_for_batch(labels, confidences, language_code, confidence_score,
                        confidence_threshold):
    """Collapse the ranked predictions into "English", "Filipino" or "n/a"."""
    filipino_labels = ["tl", "ceb", "ilo", "hil", "war", "pam", "pag"]
    main_labels = ["en", "tl"]
    allowed_labels = ["en", "tl", "ceb", "ilo", "hil", "war", "pam", "pag"]

    # EN > TL, TAGLISH
    if labels[0] == "en" and labels.count("tl") == 1:
        lan = labels[0]
        # When both runner-up labels are Philippine languages, their combined
        # confidence may outweigh English.
        if sum(label in filipino_labels for label in labels[1:]) > 1:
            if sum(confidences[1:]) > confidences[0]:
                lan = "Filipino"

    # TL > EN, TAGLISH
    elif labels[0] == "tl" and labels.count("en") == 1:
        lan = "Filipino"

    # Main label is captured, NOT TAGLISH
    elif (labels[0] in main_labels) and (confidence_score >= confidence_threshold):
        lan = "Filipino" if language_code == "tl" else language_code

    # Contains TL plus at least one more Philippine-language label
    elif labels.count("tl") == 1 and sum(label in filipino_labels for label in labels) >= 2:
        if labels[0] in filipino_labels:
            lan = "Filipino"
        else:
            filipino_confidence = sum(
                confidence
                for label, confidence in zip(labels, confidences)
                if label in filipino_labels
            )
            lan = "Filipino" if filipino_confidence > confidences[0] else "n/a"

    # Top label is neither English nor a Philippine language, but English or
    # Tagalog appears further down the list.
    elif (labels[0] not in allowed_labels) and any(label in main_labels for label in labels[1:]):
        lan = "n/a"
        # Accept the highest-confidence label only when it is English/Tagalog
        # and clears the threshold. (The previous expression chained `in` and
        # `>=`, comparing a list with a float, and then looked a float up in the
        # list of label strings; both raised instead of deciding.)
        if language_code in main_labels and confidence_score >= confidence_threshold:
            lan = "Filipino" if language_code in filipino_labels else language_code

    else:
        lan = "n/a"

    if lan == "en":
        lan = "English"

    return lan



In [4]:
#Reference: https://www.w3resource.com/python-exercises/tkinter/python-tkinter-dialogs-and-file-handling-exercise-9.php
def check_file():
    """Show an "open file" dialog restricted to CSV files and return its result.

    The value is passed through unchanged from ``filedialog.askopenfilename``;
    open_csv_file() treats a falsy result as "nothing was selected".
    """
    csv_only = [("CSV files", "*.csv")]
    return filedialog.askopenfilename(title="Open CSV File", filetypes=csv_only)

def open_csv_file():
    """Let the user pick a CSV file, preview it, and arm the "Analyze" button.

    The module-level ``analyze_button`` is created the first time a file is
    chosen; on later calls the existing button is re-pointed at the newly
    selected file. Nothing happens when the dialog returns a falsy path.
    """
    global analyze_button

    chosen_path = check_file()
    if not chosen_path:
        return

    display_csv_data(chosen_path)

    # Bind the path as a default argument so the callback keeps this selection.
    def run_analysis(directory=chosen_path):
        return analyze_csv_file(directory)

    if analyze_button is not None:
        analyze_button.config(command=run_analysis)
        return

    analyze_button = tk.Button(root, text="Analyze CSV File", command=run_analysis)
    analyze_button.pack(side=tk.TOP, padx=20, pady=10)


    
def analyze_csv_file(directory):
    """Tag every row of a CSV file with its detected language.

    Reads the CSV at ``directory``, classifies the first column of each
    non-empty row with ``identify_language(..., 1)`` and writes a copy of the
    file with the result appended as an extra column named ``'0.5'``. The copy
    is stored next to the input with ``-FT-0.5`` inserted before the extension.
    # NOTE(review): '0.5' presumably mirrors identify_language's confidence
    # threshold — confirm before changing either value.

    Parameters
    ----------
    directory : str
        Path of the input CSV file (despite the name, a file path).

    Empty rows are skipped and therefore absent from the output. When the input
    has no header row a warning dialog is shown and no output file is created.
    """
    input_file = directory
    # Split on the real extension instead of str.replace('.csv', ...): replace
    # leaves the path unchanged for "FILE.CSV" or extension-less names, so the
    # output path would equal the input path and opening it for writing would
    # truncate the file being read. It would also rewrite ".csv" in folder names.
    base, extension = os.path.splitext(directory)
    output_file = f"{base}-FT-0.5{extension or '.csv'}"
    condition = 1

    with open(input_file, 'r', newline='') as infile:
        csv_reader = csv.reader(infile, delimiter=',')
        header = next(csv_reader, None)
        if header is None:
            # Nothing to process; bail out before an output file is created.
            messagebox.showwarning("Empty File", f"No rows found in: {input_file}")
            return
        header.append('0.5')

        with open(output_file, 'w', newline='') as outfile:
            csv_output = csv.writer(outfile, delimiter=',')
            csv_output.writerow(header)

            for row in csv_reader:
                if len(row) > 0:
                    row.append(identify_language(row[0], condition))
                    csv_output.writerow(row)

    # Report success only after both files are closed and the output is flushed.
    messagebox.showinfo("File Processed", f"Output file created: {output_file}")

    
def display_csv_data(directory):
    """Show the contents of the CSV file at ``directory`` in the ``tree`` widget.

    The first row is used as the column headings (each column 100 wide); the
    rows currently in the tree are removed and every remaining CSV row is
    inserted. The tree is only cleared once a header row has been read, so a
    file that cannot be opened or is empty leaves the current preview intact.

    Parameters
    ----------
    directory : str
        Path of the CSV file to preview (despite the name, a file path).

    Errors are reported to the user instead of being raised: through the
    module-level ``status_label`` when one exists, otherwise through an error
    dialog.
    """
    try:
        with open(directory, 'r', newline='') as file:
            csv_reader = csv.reader(file, delimiter=',')
            header = next(csv_reader, None)  # Read the header row
            if header is None:
                # An exhausted reader used to surface as StopIteration, whose
                # message is empty; raise something the user can read.
                raise ValueError("CSV file is empty")
            tree.delete(*tree.get_children())  # Clear the current data

            tree["columns"] = header
            for col in header:
                tree.heading(col, text=col)
                tree.column(col, width=100)

            for row in csv_reader:
                tree.insert("", "end", values=row)

    except Exception as e:
        error_text = f"Error: {str(e)}"
        # Look the label up dynamically: referencing a status_label that was
        # never created would turn every load error into a NameError.
        status_widget = globals().get("status_label")
        if status_widget is not None:
            status_widget.config(text=error_text)
        else:
            messagebox.showerror("Error", error_text)

In [7]:
#GUI
# Build the main window: a text entry with an "Identify Language" button and a
# result label, plus an "Open CSV File" button and a table preview of the CSV.
root = tk.Tk()
root.title("FastText Language Tagging")

frame = ttk.Frame(root, padding="10")
frame.pack(fill="both", expand=True, padx=10, pady=10)

label = ttk.Label(frame, text="Enter text:", anchor='w', font=("TkDefaultFont", 10))
label.pack(padx=5, pady=5, anchor='w')

# `entry` is read by get_text(); `result_label` is written by identify_language().
entry = ttk.Entry(frame, width=60) 
entry.pack(padx=5, pady=5, side=tk.TOP, fill=tk.BOTH)

identify_button = tk.Button(frame, text="Identify Language", command=get_text)
identify_button.pack(padx=5, pady=5)

result_label = tk.Label(frame, text="", font=("TkDefaultFont", 15))
result_label.pack(pady=10)

open_button = tk.Button(root, text="Open CSV File", command=open_csv_file)
open_button.pack(side=tk.TOP, padx=20, pady=10)

# display_csv_data() reports load errors by configuring `status_label`, but the
# widget was never created, so any CSV error ended in a NameError. It is packed
# at the bottom before the tree so it stays visible when the window is small.
status_label = ttk.Label(root, text="", anchor='w')
status_label.pack(side=tk.BOTTOM, fill="x", padx=20, pady=5)

# `tree` is filled by display_csv_data().
tree = ttk.Treeview(root, show="headings")
tree.pack(padx=20, pady=20, fill="both", expand=True)

# Created lazily by open_csv_file() once a CSV file has been chosen.
analyze_button = None
root.mainloop()