# SOUTH AFRICAN LANGUAGE IDENTIFICATION

In [60]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
# Models
from sklearn.svm import LinearSVC, SVC
from nlppreprocess import NLP
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

### Loading the Datasets

In [3]:
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

In [4]:
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [7]:
train.shape

(33000, 2)

#### Data Pre-processing

In [8]:
train.lang_id.describe()

count     33000
unique       11
top         xho
freq       3000
Name: lang_id, dtype: object

In [10]:
#Character Length
train['length'] = train['text'].apply(lambda x: len(x))

In [11]:
train.head()

Unnamed: 0,lang_id,text,length
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,220
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,252
2,eng,the province of kwazulu-natal department of tr...,264
3,nso,o netefatša gore o ba file dilo ka moka tše le...,217
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,239


In [12]:
#Remove Punctuations
def _remove_punc(txt):
  
    txt = re.sub(r'[-]',' ',txt)
    txt = re.sub(r'[_]', ' ', txt)
    txt = re.sub(r'[^\w\s]','',txt)
    txt = re.sub('[0-9]+', '', txt)
    txt = re.sub(r'[^\x00-\x7f]',r'', txt)
    return txt
#Apply the function to the dataset
train['clean_txt'] = train['text'].apply(_remove_punc)

In [13]:
def _lower(x):
    return x.lower()
train['lower'] = train['clean_txt'].apply(_lower)

In [14]:
train.head()

Unnamed: 0,lang_id,text,length,clean_txt,lower
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,220,umgaqo siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,252,i dha iya kuba nobulumko bokubeka umsebenzi na...,i dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...,264,the province of kwazulu natal department of tr...,the province of kwazulu natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,217,o netefata gore o ba file dilo ka moka te le d...,o netefata gore o ba file dilo ka moka te le d...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,239,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [18]:
#Remove Stopwords
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(x):
   
    stopwords = NLP(replace_words=True, remove_stopwords=True, 
                            remove_numbers=True, remove_punctuations=False) 
    x = stopwords.process(x)
    return x
    
train['Text_nonstop'] = train['lower'].apply(lambda x: remove_stopwords(x))

In [19]:
train.head()

Unnamed: 0,lang_id,text,length,clean_txt,lower,Text_nonstop
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,220,umgaqo siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,252,i dha iya kuba nobulumko bokubeka umsebenzi na...,i dha iya kuba nobulumko bokubeka umsebenzi na...,dha iya kuba nobulumko bokubeka umsebenzi naph...
2,eng,the province of kwazulu-natal department of tr...,264,the province of kwazulu natal department of tr...,the province of kwazulu natal department of tr...,province kwazulu natal department transport in...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,217,o netefata gore o ba file dilo ka moka te le d...,o netefata gore o ba file dilo ka moka te le d...,o netefata gore o ba file dilo ka moka te le d...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,239,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [20]:
def _analyzer (x):
    x = _remove_punc(x)
    x = _lower(x)
    x = remove_stopwords(x)
    return x

In [21]:
train['cleaned'] = train['text'].apply(_analyzer)
test['cleaned'] = test['text'].apply(_analyzer)

In [22]:
test.head()

Unnamed: 0,index,text,cleaned
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",mmasepala fa maemo kgethegileng letlelela kgat...
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,tshivhumbeo tshi fana na ngano dza vhathu
3,4,Kube inja nelikati betingevakala kutsi titsini...,kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.,winste op buitelandse valuta


In [23]:
train.head()

Unnamed: 0,lang_id,text,length,clean_txt,lower,Text_nonstop,cleaned
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,220,umgaqo siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,252,i dha iya kuba nobulumko bokubeka umsebenzi na...,i dha iya kuba nobulumko bokubeka umsebenzi na...,dha iya kuba nobulumko bokubeka umsebenzi naph...,dha iya kuba nobulumko bokubeka umsebenzi naph...
2,eng,the province of kwazulu-natal department of tr...,264,the province of kwazulu natal department of tr...,the province of kwazulu natal department of tr...,province kwazulu natal department transport in...,province kwazulu natal department transport in...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,217,o netefata gore o ba file dilo ka moka te le d...,o netefata gore o ba file dilo ka moka te le d...,o netefata gore o ba file dilo ka moka te le d...,o netefata gore o ba file dilo ka moka te le d...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,239,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


### Feature Engineering

In [24]:
# Splitting  X (indepedent) and Y (target/dependent) variables
X = train['cleaned']
y = train['lang_id']

#### Splitting the data

In [26]:
X_train , X_test , y_train , y_test = train_test_split(X , y, stratify=y,
                                                       test_size =0.4, 
                                                       random_state=42)

In [27]:
lg = [LogisticRegression(random_state =42 , max_iter=5000) , 
       MultinomialNB(), LinearSVC(random_state=42), 
       SGDClassifier(random_state=42), RidgeClassifier(random_state=42)]

In [32]:
def _performace_assesment(*args , **kwargs):
  model_stats = {}
  for clf in lg:
    model = Pipeline([('tfidf', TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')),
                      ('clf' , clf)
                      ])
    
    model.fit(X_train, y_train) #Training
    model_pred = model.predict(X_test) #Testing

    # Dictionary of Models Performances
    model_stats[clf.__class__.__name__] = {
        'F1-Macro':metrics.f1_score(y_test, model_pred, average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test, model_pred, average='micro'),
        'F1-Weighted':metrics.f1_score(y_test, model_pred, average='weighted')}
  return pd.DataFrame.from_dict(model_stats, orient='index')

In [33]:
performance = _performace_assesment(lg , X_train , X_test , y_train , y_test)
performance.to_csv('performance.csv')
dataframe = pd.read_csv('performance.csv', index_col = 0)
dataframe.sort_values('F1-Weighted', ascending=False)



Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
MultinomialNB,0.999394,0.999394,0.999394
RidgeClassifier,0.999167,0.999167,0.999167
LinearSVC,0.999167,0.999167,0.999167
SGDClassifier,0.999091,0.999091,0.999091
LogisticRegression,0.998031,0.99803,0.998031


### Hyper Tunning

In [44]:
def _param_tuning(*args , **kwargs):
  best_params = {}

  for clf in lg:
    model = Pipeline([('tfidf', TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')),
                      ('clf' , clf)])
    model.fit(X_train, y_train) #Training
    
    #Get models performing parameters
    params = model.get_params()
    model_name = clf.__class__.__name__ 
    model_name = {}
    for key in params:
      if key.startswith("clf"):
        if len(key) < 5:
          model_name['model'] = params[key]
        else:
            model_name[key[5:]] = params[key]
    best_params[clf.__class__.__name__] = model_name
  return best_params

In [45]:
best_params = _param_tuning(lg, X_train, y_train)



In [46]:
#Best parameters
best_params

{'LogisticRegression': {'model': LogisticRegression(max_iter=5000, random_state=42),
  'C': 1.0,
  'class_weight': None,
  'dual': False,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'l1_ratio': None,
  'max_iter': 5000,
  'multi_class': 'auto',
  'n_jobs': None,
  'penalty': 'l2',
  'random_state': 42,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'verbose': 0,
  'warm_start': False},
 'MultinomialNB': {'model': MultinomialNB(),
  'alpha': 1.0,
  'class_prior': None,
  'fit_prior': True,
  'force_alpha': True},
 'LinearSVC': {'model': LinearSVC(random_state=42),
  'C': 1.0,
  'class_weight': None,
  'dual': 'warn',
  'fit_intercept': True,
  'intercept_scaling': 1,
  'loss': 'squared_hinge',
  'max_iter': 1000,
  'multi_class': 'ovr',
  'penalty': 'l2',
  'random_state': 42,
  'tol': 0.0001,
  'verbose': 0},
 'SGDClassifier': {'model': SGDClassifier(random_state=42),
  'alpha': 0.0001,
  'average': False,
  'class_weight': None,
  'early_stopping': False,
  'epsilon': 0.1,
  'eta0':

### GridSearch

In [47]:
#model
model1 = MultinomialNB()

In [48]:
Vectorize = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')
X_train = Vectorize.fit_transform(X_train)
X_test = Vectorize.transform(X_test)



In [49]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True,
                                   random_state=42)

In [50]:
best_params[lg[1].__class__.__name__]

{'model': MultinomialNB(),
 'alpha': 1.0,
 'class_prior': None,
 'fit_prior': True,
 'force_alpha': True}

In [52]:
alpha = list(np.linspace(0.1,0.02,4))
param_grid = dict(alpha=alpha)
grid_search = GridSearchCV(estimator= model1,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=stratified_kfold,
                           error_score=0,
                           n_jobs=-1)

alpha = list(np.linspace(0.1,0.02,4))
param_grid = dict(alpha=alpha)
grid_search = GridSearchCV(estimator= model1,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=stratified_kfold,
                           error_score=0,
                           n_jobs=-1)

In [53]:
grid_search.fit(X_train, y_train)
prediction = grid_search.predict(X_test)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)

In [54]:
print(f'Cross-validation score: {cv_score}')
print(f'Test score: {test_score}')
grid_search.best_params_    
grid_search.best_estimator_

Cross-validation score: 0.9996969686252897
Test score: 0.9995455174533855


### Applying Ridge Regression

In [55]:
#model
model2 = RidgeClassifier()

In [56]:
best_params[lg[4].__class__.__name__]

{'model': RidgeClassifier(random_state=42),
 'alpha': 1.0,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'positive': False,
 'random_state': 42,
 'solver': 'auto',
 'tol': 0.0001}

In [57]:
alpha = list(np.linspace(0.15,0.4, 5))
param_grid = dict(alpha=alpha)
grid_search = GridSearchCV(estimator= model2,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=stratified_kfold,
                           error_score=0,
                           n_jobs=-1)

In [58]:
grid_search.fit(X_train, y_train)
prediction = grid_search.predict(X_test)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)

In [59]:
print(f'Cross-validation score: {cv_score}')
print(f'Test score: {test_score}')
grid_search.best_params_    
grid_search.best_estimator_

Cross-validation score: 0.9994949477404799
Test score: 0.9990909717224429


In [76]:
X_train , X_test , y_train , y_test = train_test_split(X, y,  stratify=y, test_size=0.4, random_state =1)

ValueError: Found input variables with inconsistent numbers of samples: [5682, 33000]

In [77]:
vect = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(2, 6), analyzer= 'char')
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)



AttributeError: lower not found

In [63]:
multiNB1 = MultinomialNB(alpha=0.1)
multiNB2 = MultinomialNB(alpha=0.1)

estimators = [('multiNB1', multiNB1), ('multiNB2', multiNB2)]
final_est = RidgeClassifier(alpha=0.2125)

In [64]:
stacking_NB2 = StackingClassifier(estimators = estimators,
                           final_estimator = final_est,
                           passthrough = True)


In [65]:
stacking_NB2.fit(X_train , y_train)

In [67]:
pred = stacking_NB2.predict(X_test)


In [68]:
model_stats = {}
model_stats[stacking_NB2.__class__.__name__] = {
        'F1-Macro':metrics.f1_score(y_test, pred, average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test, pred, average='micro'),
        'F1-Weighted':metrics.f1_score(y_test, pred, average='weighted')}
pd.DataFrame.from_dict(model_stats, orient='index')

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
StackingClassifier,0.999773,0.999773,0.999773


#### Kaggle Submission

In [72]:
X = test['cleaned']
Vectorize = vect.transform(X)

In [73]:
test['lang_id'] = stacking_NB2.predict(Vectorize)

In [74]:
submission = test[['index', 'lang_id']]
submission.to_csv('Submission.csv',index=False)
submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot
