In [None]:
from collections import defaultdict
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score,confusion_matrix
import numpy as np
import speech_recognition as sr

In [None]:
# Load the tab-separated sentence corpus. `index_col=0` makes the first column of
# the file the index; the data columns are named `lang` and `text`.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0. Malformed rows are still skipped with a warning.
data = pd.read_csv('/content/sample_data/sentences.csv', sep='\t', encoding='utf8',
                   index_col=0, names=['lang', 'text'], on_bad_lines='warn')

# Keep only the languages the classifier is trained on
lang = ['deu', 'eng', 'fra', 'ita', 'por', 'spa']
data = data[data['lang'].isin(lang)]

# Cap every language at 50,000 sentences so the classes stay balanced.
# The per-language frames are gathered in a list and concatenated once, instead of
# re-copying a growing DataFrame on every loop iteration.
per_language = []
for l in lang:
    language_trim = data[data['lang'] == l]
    if len(language_trim) > 50000:
        language_trim = language_trim.sample(50000, random_state=100)
    per_language.append(language_trim)
data_trim = pd.concat(per_language)

# Shuffle all rows with a fixed seed, then cut into train / valid / test by position.
# NOTE(review): the cut points assume 6 x 50,000 = 300,000 rows; if any language has
# fewer than 50,000 sentences the validation/test splits shrink accordingly.
data_shuffle = data_trim.sample(frac=1, random_state=100)

train = data_shuffle.iloc[:210000]
valid = data_shuffle.iloc[210000:270000]
test = data_shuffle.iloc[270000:]

# New Section

In [None]:
from collections import Counter

def get_trigrams(corpus,n_feat=200):
    """Return the character trigrams kept by a frequency-limited CountVectorizer.

    Parameters
    ----------
    corpus : iterable of str
        Sentences to scan for character trigrams.
    n_feat : int, default 200
        Passed as `max_features`, i.e. the number of most frequent trigrams to keep.

    Returns
    -------
    The feature names reported by the fitted vectorizer's `get_feature_names_out()`.
    """
    trigram_counter = CountVectorizer(
        analyzer='char',
        ngram_range=(3, 3),
        max_features=n_feat,
    )

    # Fitting is what selects the vocabulary; the count matrix itself is not needed.
    trigram_counter.fit_transform(corpus)

    return trigram_counter.get_feature_names_out()

In [None]:
# Per-language trigram lists, plus the pooled set of all trigrams seen in any language
features = {}
features_set = set()

for l in lang:

    # training sentences of this language only
    corpus = train[train.lang == l]['text']

    # most frequent trigrams for this language (default: 200)
    trigrams = get_trigrams(corpus)

    # remember them per language and pool them into the shared set
    features[l] = trigrams
    features_set.update(trigrams)


# Map every pooled trigram to a column index.
# Iterating a set of strings yields an order that can change between interpreter
# sessions (string hashing is randomized), which would silently reshuffle the
# feature columns on every kernel restart. Sorting first keeps the layout stable.
vocab = {trigram: idx for idx, trigram in enumerate(sorted(features_set))}

In [None]:
# Trigram counter restricted to the pooled vocabulary, so every split is
# vectorised into the same columns
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(3, 3),
    vocabulary=vocab,
)

# Count trigram occurrences for every training sentence
corpus = train['text']
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()

# Dense frame: one row per sentence, one column per trigram
train_feat = pd.DataFrame(X.toarray(), columns=feature_names)

In [None]:
# Min-max scale each trigram column using statistics from the training set only.
# train_min / train_max are kept so the other splits can reuse the same scaling.
# NOTE: this cell overwrites `train_feat`, so run it once per session; re-running it
# would recompute the statistics from already-scaled data.
train_min, train_max = train_feat.min(), train_feat.max()
train_feat = (train_feat - train_min) / (train_max - train_min)

# Attach the target language, aligned by row position with the training split
train_feat['lang'] = train['lang'].tolist()


In [None]:
def _to_scaled_features(split):
    """Build the scaled trigram feature frame for one data split.

    Parameters
    ----------
    split : pandas.DataFrame
        Frame with a `text` column (sentences) and a `lang` column (targets).

    Returns
    -------
    pandas.DataFrame
        One row per sentence: the trigram counts scaled with `train_min` /
        `train_max`, plus a `lang` column holding the target language.
    """
    # transform() only counts against the fixed vocabulary; nothing is refitted
    # on validation or test data.
    counts = vectorizer.transform(split['text'])
    feat = pd.DataFrame(data=counts.toarray(), columns=feature_names)

    # Reuse the training-set statistics so all splits share one scale
    feat = (feat - train_min) / (train_max - train_min)

    # Targets are attached by row position
    feat['lang'] = list(split['lang'])
    return feat


#create feature matrix for validation set
valid_feat = _to_scaled_features(valid)

#create feature matrix for test set
test_feat = _to_scaled_features(test)

In [None]:
# Language codes the classifier distinguishes
langs = ['deu', 'eng', 'fra', 'ita', 'por', 'spa']

# Learn the code -> integer mapping once; the same encoder is reused for
# encoding targets and for decoding predictions
encoder = LabelEncoder().fit(langs)

def encode(y):
    """One-hot encode language codes with the module-level `encoder`.

    Parameters
    ----------
    y : array-like of str
        Language codes known to `encoder`.

    Returns
    -------
    Array with one row per entry of `y` and one column per class known to
    `encoder`, as produced by `np_utils.to_categorical`.
    """
    y_encoded = encoder.transform(y)

    # Pin the width to the number of known classes. Without `num_classes` the width
    # is inferred as max(y_encoded) + 1, so input lacking the last class would
    # produce too few columns for the model's output layer.
    y_dummy = np_utils.to_categorical(y_encoded, num_classes=len(encoder.classes_))

    return y_dummy

In [None]:
# Features: every column except the target; targets: one-hot encoded language
x = train_feat.drop('lang',axis=1)
y = encode(train_feat['lang'])

#Define model
# Layer widths at both ends are taken from the data rather than hard-coded: the
# number of input features equals the size of the pooled trigram vocabulary, which
# depends on the training data and is not guaranteed to be 670.
model = Sequential()
model.add(Dense(500, input_dim=x.shape[1], activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#Train model
model.fit(x, y, epochs=4, batch_size=None)

In [None]:
# Initialize a SpeechRecognition recognizer
!pip install PyAudio‑0.2.11‑cp37‑cp37m‑win_amd64.whl

r = sr.Recognizer()

# Record audio from the microphone
with sr.Microphone() as source:
    print("Speak now...")
    audio = r.listen(source)

# Use Google's speech recognition to get the spoken text
text = r.recognize_google(audio, language=langs)

# Preprocess the spoken input
text = [text]

# Vectorize the spoken input using the same vectorizer used for training
input_matrix = vectorizer.transform(text)

# Scale the input matrix using the minimum and maximum values from the training data
input_matrix = (input_matrix - train_min) / (train_max - train_min)

# Predict the language of the spoken input
predicted_lang = encoder.inverse_transform(np.argmax(model.predict(input_matrix), axis=-1))[0]

# Print the predicted language
print("Spoken language: ", predicted_lang)

In [None]:
# Rebuild the trigram counter on the fixed vocabulary
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(3, 3),
    vocabulary=vocab,
)

# Read one sentence from the user and turn it into a single-row feature frame,
# scaled with the training-set minima and maxima
input_text = input("Enter a sentence: ")
train_feat_matrix = vectorizer.transform([input_text])
input_feat = pd.DataFrame(train_feat_matrix.toarray(), columns=feature_names)
input_feat = (input_feat - train_min) / (train_max - train_min)

# Most probable class index -> language code
input_pred = model.predict(input_feat)
input_lang = encoder.inverse_transform(input_pred.argmax(axis=-1))

print("The language of the input sentence is:", input_lang[0])

In [None]:
# Split the test frame into features and true language codes
x_test = test_feat.drop('lang', axis=1)
y_test = test_feat['lang']

#Get predictions on test set: most probable class index -> language code
labels = np.argmax(model.predict(x_test), axis=-1)
predictions = encoder.inverse_transform(labels)

#Accuracy on test set
accuracy = accuracy_score(y_test,predictions)
print(accuracy)

#Create confusion matrix
# Take the axis labels from the encoder and pass them to confusion_matrix, so the
# rows/columns are guaranteed to be the languages the model was trained on, in a
# known order. A separate name is used so the global `lang` list that earlier
# cells rely on is not overwritten.
class_names = list(encoder.classes_)
conf_matrix = confusion_matrix(y_test, predictions, labels=class_names)
conf_matrix_df = pd.DataFrame(conf_matrix, columns=class_names, index=class_names)

#Plot confusion matrix heatmap (rows: actual language, columns: predicted language)
plt.figure(figsize=(10, 10), facecolor='w', edgecolor='k')
sns.set(font_scale=1.5)
sns.heatmap(conf_matrix_df,cmap='coolwarm',annot=True,fmt='.5g',cbar=False)
plt.xlabel('Predicted',fontsize=22)
plt.ylabel('Actual',fontsize=22)
