# South African Language Identification Hack Challenge

# Import Libriries

In [114]:
import numpy as np
import pandas as pd
import nltk
from nltk import TreebankWordTokenizer, SnowballStemmer, PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re
import urllib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression




In [115]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data

In [116]:
df_train = pd.read_csv('/content/train_set.csv')

In [117]:
df_test = pd.read_csv('/content/test_set.csv')

In [118]:
df_train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [119]:
df_test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


# Clean the Data

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
ps = PorterStemmer()
corpus = []

for i in range(len(df_train['text'])):
    text = re.sub("^[a-zA-Z]",' ', df_train['text'][i])
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if set(stopwords.words())]
    text = ' '.join(text)
    corpus.append(text)


# Convert text into Vector

In [12]:
cv = CountVectorizer(max_features=10000)
X = cv.fit_transform(corpus).toarray()

In [13]:
X.shape

(4153, 10000)

# LabelEncoding - Convert the Language names into labels

In [14]:
label = LabelEncoder()
y = label.fit_transform(df_train['lang_id'])


In [17]:
y

array([9, 9, 1, ..., 7, 2, 6])

In [18]:
len(y)

4153

In [19]:
label.classes_

array(['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven',
       'xho', 'zul'], dtype=object)

In [20]:
df_train1 = pd.DataFrame(np.c_[corpus,y],columns=['Text1','lang_codes'])

In [22]:
df_train1.tail()

Unnamed: 0,Text1,lang_codes
4148,kuhlahlutjwa kwetheknoloji yokutjhugulula okwe...,2
4149,am assum that the focu of thi confer is on gro...,1
4150,oloby a nga swi kota ku loko a rhumisa hi ku n...,7
4151,kutjheja nokwenza iimphakamiso ngokubika okune...,2
4152,hoto mohna jane mcpherso,6


In [23]:
df_train1['lang_codes'].unique()

array(['9', '1', '3', '8', '6', '2', '10', '5', '7', '4', '0'],
      dtype=object)

## Split the data into training datset and Validation dataset

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model creation

# Model No1 - Naive Bayes

In [26]:
MNB = MultinomialNB().fit(X_train,y_train)

In [27]:
y_pred = MNB.predict(X_test)

## Evaluate Model No1

In [28]:
score_MNB = accuracy_score(y_test,y_pred)

In [29]:
score_MNB

0.9963898916967509

In [47]:
cr = classification_report(y_test, y_pred)

In [48]:
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        83
           1       1.00      1.00      1.00        75
           2       1.00      0.96      0.98        76
           3       1.00      1.00      1.00        72
           4       1.00      1.00      1.00        77
           5       1.00      1.00      1.00        70
           6       1.00      1.00      1.00        81
           7       1.00      1.00      1.00        79
           8       1.00      1.00      1.00        72
           9       0.99      1.00      0.99        75
          10       0.97      1.00      0.99        71

    accuracy                           1.00       831
   macro avg       1.00      1.00      1.00       831
weighted avg       1.00      1.00      1.00       831



# Model No2 - Linear regression

In [30]:
lr = LinearRegression() 

In [31]:
# train data and predict using linear regression model
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

## Evaluate Model No2

In [32]:
score_lr = accuracy_score(y_test,y_pred)

In [33]:
score_lr

0.9963898916967509

# Preparing for Submission

In [34]:
df_test1 = df_test

In [83]:
df_test1

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
...,...,...
5677,5678,You mark your ballot in private.
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ..."
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ..."


# Save Model

In [71]:
import joblib
joblib.dump(MNB, "sa_language_identifier.sav")

['sa_language_identifier.sav']

# Load Model

In [77]:
model = joblib.load('sa_language_identifier.sav')

# Clean the test data and reverse the label

In [105]:
def test_model(sentence):
  lang = {
  'afr': 0,
  'eng': 1,
  'nbl': 2,
  'nso': 3,
  'sot': 4,
  'ssw': 5, 
  'tsn': 6,
  'tso': 7,
  'ven': 8,
  'xho': 9,
  'zul': 10   
  }

  text = re.sub("^[a-zA-Z]",' ', str(sentence))
  text = text.lower()
  text = text.split()
  text = [ps.stem(word) for word in text if word not in set(stopwords.words())]
  text = ' '.join(text)

  text = cv.transform([text]).toarray()
  pred = model.predict(text)[0]

  keys = list(lang)
  values = list(lang.values())
  position = values.index(pred)
  pred = keys[position]
  #print(pred)
  return pred



In [106]:
test_model('Kube inja nelikati betingevakala kutsi titsini..')

'ssw'

# Kaggle Submission

In [107]:
df_test['lang_id'] = df_test['text'].apply(test_model)

In [108]:
df_test['lang_id'].head()

0    tsn
1    nbl
2    ven
3    ssw
4    afr
Name: lang_id, dtype: object

In [110]:
submission = df_test[['index', 'lang_id']]

In [112]:
submission.to_csv('Nobuntu_submission_A.csv', index=False)

In [113]:
submission.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
