In [None]:
from sklearn.ensemble import VotingClassifier
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there can be read by path
# (the saved model files loaded below live under /content/drive/MyDrive).
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
from sklearn.ensemble import VotingClassifier
# from keras.models import load_model
import joblib

# Saved SVM model, read from the mounted Google Drive.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load model files that come from a trusted source.
svm_model_path = '/content/drive/MyDrive/Colab Notebooks/svm_model.pkl'
svm_model = joblib.load(svm_model_path)

# Saved Naive Bayes model, stored in the same Drive folder.
nb_model_path = '/content/drive/MyDrive/Colab Notebooks/nb_model.pkl'
naive_bayes_model = joblib.load(nb_model_path)


In [None]:
import pandas as pd

# Fetch the SALT v1.2 training file (JSON Lines: one JSON record per line)
# straight from GitHub into a DataFrame.
url = 'https://raw.githubusercontent.com/SunbirdAI/salt/main/v1.2/salt-train-v1.2.jsonl'
df = pd.read_json(path_or_buf=url, lines=True)

# Print the frame for a first look at its columns and row count.
print(df)


                                                    text tts-speech
0      {'eng': 'It was not a ghost refugee camp.', 'l...        NaN
1      {'eng': 'I want to go to town over the weekend...        NaN
2      {'eng': 'I have high blood pressure.', 'lug': ...        NaN
3      {'eng': 'You need to have priorities in life.'...        NaN
4      {'eng': 'It's a good practice to help those in...        NaN
...                                                  ...        ...
23942  {'eng': 'It has all happened in the intervenin...        NaN
23943  {'eng': 'Many people have recovered from coron...        NaN
23944  {'eng': 'The government will provide support t...        NaN
23945  {'eng': 'There are many things that we need to...        NaN
23946  {'eng': 'The incumbent is worried about the in...        NaN

[23947 rows x 2 columns]


In [None]:
# Pull out the 'text' column on its own and print it for inspection.
# As the printed output shows, each entry is a dict keyed by language code
# (e.g. 'eng', 'lug') holding the sentence in that language.
text_column = df['text']
print(text_column)

0        {'eng': 'It was not a ghost refugee camp.', 'l...
1        {'eng': 'I want to go to town over the weekend...
2        {'eng': 'I have high blood pressure.', 'lug': ...
3        {'eng': 'You need to have priorities in life.'...
4        {'eng': 'It's a good practice to help those in...
                               ...                        
23942    {'eng': 'It has all happened in the intervenin...
23943    {'eng': 'Many people have recovered from coron...
23944    {'eng': 'The government will provide support t...
23945    {'eng': 'There are many things that we need to...
23946    {'eng': 'The incumbent is worried about the in...
Name: text, Length: 23947, dtype: object


In [None]:
# Drop the 'tts-speech' column (it shows only NaN in the printed preview),
# leaving just the 'text' column.
# A non-mutating drop with errors='ignore' keeps this cell safe to re-run:
# an in-place drop without it raises KeyError on a second execution because
# the column is already gone.
df = df.drop(columns='tts-speech', errors='ignore')

In [None]:
# Display the frame (rich notebook output) to confirm that only the 'text'
# column remains after the drop.
df

Unnamed: 0,text
0,"{'eng': 'It was not a ghost refugee camp.', 'l..."
1,{'eng': 'I want to go to town over the weekend...
2,"{'eng': 'I have high blood pressure.', 'lug': ..."
3,{'eng': 'You need to have priorities in life.'...
4,{'eng': 'It's a good practice to help those in...
...,...
23942,{'eng': 'It has all happened in the intervenin...
23943,{'eng': 'Many people have recovered from coron...
23944,{'eng': 'The government will provide support t...
23945,{'eng': 'There are many things that we need to...


In [None]:
import pandas as pd

# Assuming the DataFrame is already loaded and named 'df'

# Reshape the data from one row per sentence (a dict of translations keyed by
# language code) to one row per (language, sentence) pair.
# Iterating over the 'text' column directly avoids DataFrame.iterrows(), which
# builds a full Series object for every row just to read a single field.
train_data = [
    {'Language': language, 'Text': text}
    for text_dict in df['text']
    for language, text in text_dict.items()
]

# Build the long-format training frame with columns 'Language' and 'Text'.
train_df = pd.DataFrame(train_data)

# Display the new DataFrame
print(train_df)


       Language                                               Text
0           eng                   It was not a ghost refugee camp.
1           lug  Enkambi y'abanoonyiboobubudamu teyaliiwo mu bu...
2           ach                             Pe obedo kem goba goba
3           teo                      Mam arai ekabi lo erai ekwam.
4           lgg  Eri aa'ni ndra kembe emunyale eyini aa'zu inzo...
...         ...                                                ...
143677      lug  Omukulembeze aliko mweraliikirivu ku muwendo g...
143678      ach  Ngat matye iloc tye ka lworo pi abili ki mwony...
143679      teo  Engarenon lo ipugai ekadakit kanu iyatar enaba...
143680      lgg  Agu ofisi 'da ma alea niri eri asi otiza be ka...
143681      nyn  Omwebembezi oriho obwahati, ayerarikiriire omu...

[143682 rows x 2 columns]


In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw sentences; targets are their language codes.
X = train_df["Text"]
y = train_df["Language"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Show the size of each split as the cell output.
X_train.shape, X_test.shape

((100577,), (43105,))

In [None]:
# Inspect every row of the reshaped frame whose language code is 'ach'.
train_df.loc[train_df["Language"] == "ach"]

Unnamed: 0,Language,Text
2,ach,Pe obedo kem goba goba
8,ach,Amito citi I taun I tum cabit.
14,ach,Atye ki peko me two pressure
20,ach,Mite ni ibed ki jami ma I mito timone mukwo I ...
26,ach,Obedo tic maber me konyo joo matye I peko
...,...,...
143654,ach,Magi weng otimme I kare me timo gin mo me laro...
143660,ach,Jo mapol gucang ki ki two korona I lobo Uganda.
143666,ach,Gamente obi miyo kom ki joo ma otegi
143672,ach,Tye jami mapol mamyero wange mapud pe wanyomme.


In [None]:
# Inspect the 'lgg' labels that ended up in the training split; the reported
# length shows how many training rows carry that label.
y_train.loc[y_train == "lgg"]

100540    lgg
46198     lgg
93058     lgg
27754     lgg
123730    lgg
         ... 
82798     lgg
35920     lgg
84478     lgg
54886     lgg
131932    lgg
Name: Language, Length: 16613, dtype: object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw sentences into TF-IDF feature matrices, using the vectorizer's
# default settings.
tfidf_vectorizer = TfidfVectorizer()
# Fit on the training split only, and transform it in the same step.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# Reuse the already-fitted vectorizer for the test split (transform only, no refit).
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Hard-voting ensemble: each estimator casts one vote per sample and the most
# voted label wins.
# NOTE(review): per the scikit-learn docs, VotingClassifier.fit() trains *clones*
# of the estimators passed in, so the loaded models contribute their
# hyperparameters but are re-trained here on the TF-IDF features -- confirm this
# re-training is intended.
# NOTE(review): with only two voters, any disagreement is a tie; scikit-learn
# resolves ties by picking the label that sorts first -- confirm this is acceptable.
ensemble_model = VotingClassifier(
    estimators=[('SVM', svm_model), ('Naive Bayes', naive_bayes_model)],
    voting='hard',
).fit(X_train_tfidf, y_train)

# Label the held-out test sentences with the fitted ensemble.
y_pred = ensemble_model.predict(X_test_tfidf)

# Report overall accuracy and the per-language precision/recall/F1 on the test split.
print("Ensemble Model:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Ensemble Model:
Accuracy: 0.9966129219348103
              precision    recall  f1-score   support

         ach       1.00      1.00      1.00      7127
         eng       0.99      1.00      1.00      7085
         lgg       1.00      1.00      1.00      7334
         lug       0.99      0.99      0.99      7127
         nyn       1.00      0.99      0.99      7216
         teo       1.00      1.00      1.00      7216

    accuracy                           1.00     43105
   macro avg       1.00      1.00      1.00     43105
weighted avg       1.00      1.00      1.00     43105



In [None]:
from sklearn.svm import SVC
# Soft voting combines the estimators' predicted class probabilities, so the
# SVM has to be built with probability=True.
# random_state fixes the data shuffling SVC uses for its probability estimates;
# without it the soft-vote results can change from run to run. The seed matches
# the one used for the train/test split.
# NOTE: this rebinds `svm_model`, replacing the estimator loaded from Drive with
# a fresh SVC that uses default hyperparameters. Re-running the hard-voting cell
# after this one will therefore use this SVC instead of the loaded model.
svm_model = SVC(probability=True, random_state=42)  # Enable probability estimation for SVM

# Create the ensemble model with soft voting
ensemble_model = VotingClassifier(
    estimators=[
        ('SVM', svm_model),
        ('Naive Bayes', naive_bayes_model)
    ],
    voting='soft'  # Use 'soft' for soft voting
)

# Train the ensemble model on the training data
ensemble_model.fit(X_train_tfidf, y_train)

# Predict the language labels for the test set using the ensemble
y_pred = ensemble_model.predict(X_test_tfidf)

# Report overall accuracy and the per-language precision/recall/F1 on the test split
print("Ensemble Model:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Ensemble Model:
Accuracy: 0.9966593202644705
              precision    recall  f1-score   support

         ach       1.00      1.00      1.00      7127
         eng       1.00      1.00      1.00      7085
         lgg       1.00      1.00      1.00      7334
         lug       0.99      0.99      0.99      7127
         nyn       0.99      0.99      0.99      7216
         teo       1.00      1.00      1.00      7216

    accuracy                           1.00     43105
   macro avg       1.00      1.00      1.00     43105
weighted avg       1.00      1.00      1.00     43105

