## Import all the necessary packages

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
from random import randint
import os
import nltk
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

## Import the data

In [45]:
# Read the labelled training set and the unlabelled test set from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train_set.csv', 'test_set.csv'))
# Preview the first rows of the test set.
test_df.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


## Preprocess the data

In [33]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'],
    train_df['lang_id'],
    test_size=0.2,
    random_state=42,
)
# Number of rows that ended up in the training split.
print(X_train.shape[0])

26400


In [34]:
# Bag-of-words over lower-cased word unigrams and bigrams, limited to max_features=5000.
vect = CountVectorizer(
    lowercase=True,
    max_features=5000,
    analyzer='word',
    ngram_range=(1, 2),
)
# Fit the vocabulary on the training texts only, then reuse it for the validation texts.
# NOTE: this rebinds X_train / X_val from text Series to dense count arrays, so the
# split cell must be re-run before this cell can be executed a second time.
X_vect = vect.fit_transform(X_train.values.astype(str))
X_train, X_val = X_vect.toarray(), vect.transform(X_val.values.astype(str)).toarray()

In [35]:
# Inspect a sample of the learned vocabulary and the vectorised training matrix.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back only on older installs that do not provide it yet.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
# Print a slice rather than every feature so the cell output stays readable.
print(list(feature_names[:50]))
# The original cell displayed `X`, which no cell in the notebook defines (NameError
# on a fresh kernel); X_train, the dense count matrix, is presumably what was meant.
X_train

['aan', 'aan die', 'aan te', 'aansoek', 'aansoek doen', 'aansoeke', 'aanvaar', 'aba', 'ababili', 'abadala', 'abafundi', 'abalimi', 'abalingani', 'abantu', 'abantwana', 'abanye', 'abaphathi', 'abasebenza', 'abasebenzi', 'abazali', 'abe', 'abo', 'about', 'above', 'access', 'access to', 'accordance', 'accordance with', 'account', 'accused', 'act', 'act act', 'act and', 'act no', 'act of', 'action', 'additional', 'address', 'administration', 'af', 'afake', 'afanele', 'afe', 'affairs', 'afho', 'aforika', 'aforika borwa', 'africa', 'african', 'afrika', 'afrika borwa', 'afrika dzonga', 'afrika tshipembe', 'afrikaanse', 'after', 'after the', 'afurika', 'afurika tshipembe', 'aga', 'against', 'agreement', 'aids', 'aka', 'akaretsa', 'akaretša', 'akaretšwa', 'akekho', 'akhawunti', 'akhe', 'akho', 'akhona', 'akukho', 'al', 'al die', 'algemene', 'all', 'alle', 'almal', 'also', 'ama', 'amabili', 'amagadango', 'amagama', 'amahlelo', 'amalanga', 'amalimi', 'amalunga', 'amalungelo', 'amalungu', 'amana',

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [36]:
# Map the language-code strings to integer class ids. The encoder is fitted on the
# training labels only and then applied to both splits.
le = LabelEncoder().fit(y_train)
y_train, y_val = le.transform(y_train), le.transform(y_val)

In [37]:
# Multinomial naive Bayes baseline trained on the training-split count features.
nb = MultinomialNB().fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
nb

MultinomialNB()

In [39]:
# Validation accuracy: share of held-out rows whose language is predicted correctly.
accuracy_score(y_val, nb.predict(X_val))

0.9913636363636363

## Train on full dataset

In [140]:
# Refit the vectoriser on every labelled row (unigrams only, max_features=160000).
vect_full = CountVectorizer(lowercase=True, max_features=160000, analyzer='word', ngram_range=(1, 1))
# Keep the count matrix sparse: MultinomialNB accepts sparse input, whereas
# .toarray() would allocate a dense rows x vocabulary integer matrix that is
# almost entirely zeros.
X_full = vect_full.fit_transform(train_df['text'].values.astype(str))

# Cast the test texts to str as well, so they go through the same path as the
# training texts (a non-string entry would otherwise break the transform).
X_test = vect_full.transform(test_df['text'].values.astype(str))

In [141]:
# Encode every training label as an integer class id; le_full is kept so that the
# integer predictions can be mapped back to language codes.
le_full = LabelEncoder()
y_full = le_full.fit_transform(train_df['lang_id'])

In [142]:
# Final model: refit multinomial naive Bayes on the full labelled set.
# Note: this rebinds `nb`, replacing the model that was fitted on the training split.
nb = MultinomialNB().fit(X_full, y_full)
# Echo the fitted estimator as the cell output.
nb

MultinomialNB()

In [143]:
# Predict a class for every row of the vectorised test set with the current `nb` model.
y_pred = nb.predict(X_test)

In [144]:
# Decode the integer predictions back to language codes and wrap them in a
# single-column frame for the submission.
predicted_langs = le_full.inverse_transform(y_pred)
output = pd.DataFrame({'lang_id': predicted_langs})

## Export results

In [145]:
# Put the test-set row ids in the first column of the submission frame.
# DataFrame.insert raises ValueError if the column already exists, so guard the
# call to keep this cell safe to re-run.
if 'index' not in output.columns:
    output.insert(0, column='index', value=test_df['index'].values)

In [146]:
# Write the submission CSV into the current working directory, without the frame's row index.
cwd = os.getcwd()
path = f"{cwd}/Submission21Sept3.csv"
output.to_csv(path, index=False)
# Display the frame that was written.
output

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot
