# Language Identification in South African Text: Kaggle Competition

## Importing necessary libraries

In [None]:
# Importing necessary libraries! 
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [None]:
!pip install nlppreprocess

## Loading the data

In [None]:
# Both competition CSVs live in the same Kaggle input directory
DATA_DIR = '/kaggle/input/south-african-language-identification-hack-2023'
train = pd.read_csv(DATA_DIR + '/train_set.csv')
test = pd.read_csv(DATA_DIR + '/test_set.csv')

## Exploratory Data Analysis (EDA)

In [None]:
train.head(5)

In [None]:
train.shape

In [None]:
train.info

In [None]:
train.isnull().sum()

In [None]:
train.duplicated().sum()

In [None]:
train = train.drop_duplicates()

In [None]:
train.shape

In [None]:
train['lang_id'] .unique()

In [None]:
def clean(text):
    """Normalize a raw text sample for vectorization.

    The value is converted to ``str`` and lower-cased, then the following are
    removed in order: square-bracketed spans, URLs (``http(s)://...`` and
    ``www....``), HTML-like tags, ASCII punctuation, and any token containing
    a digit. Finally all whitespace runs (including newlines) are collapsed
    to a single space and leading/trailing whitespace is stripped.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        The cleaned, single-spaced, lower-case string.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed
    # as (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Join *words*, not characters: " ".join(text) on a str would insert a
    # space between every single character. split() also treats newlines as
    # separators, so words on adjacent lines are not glued together.
    return " ".join(text.split())
# Run the same cleaning pass over the text column of both splits
for frame in (train, test):
    frame["text"] = frame["text"].apply(clean)

In [None]:
# Number of training rows per language label (class-balance check)
train['lang_id'].value_counts()

## Data Preprocessing

In [None]:
import nltk
# Fetch the NLTK stop-word corpus; it is read later in this notebook via
# nltk.corpus.stopwords.words('english').
nltk.download('stopwords')

In [None]:
from nlppreprocess import NLP
# Quick check of the nlppreprocess pipeline with its default settings:
# process one sample token and let the notebook display the returned value.
nlp = NLP()
nlp.process('couldnt')

In [None]:
# Preview the cleaned text before the stop-word pass below
train.head()

In [None]:
#Removing the Stopwords
# NLTK's English stop-word list, kept as a notebook global; it is not used by
# remove_stopwords below, which relies on nlppreprocess instead.
stopword = nltk.corpus.stopwords.words('english')

# Build the nlppreprocess pipeline once. remove_stopwords is applied row by
# row, so constructing a new NLP object inside it repeats identical setup
# work for every single row.
_STOPWORD_PROCESSOR = NLP(replace_words=True, remove_stopwords=True,
                          remove_numbers=True)


def remove_stopwords(x):
    """Run one text sample through the shared nlppreprocess pipeline.

    The pipeline is configured with ``replace_words=True``,
    ``remove_stopwords=True`` and ``remove_numbers=True``.

    Args:
        x: The text to process.

    Returns:
        Whatever ``NLP.process`` returns for ``x`` (the processed text).
    """
    return _STOPWORD_PROCESSOR.process(x)

In [None]:
# Apply the stop-word pass to BOTH splits. Processing only `train` would make
# the vectorizer see test text in a different form than the text it was
# fitted on, skewing the submitted predictions.
train['text'] = train['text'].apply(remove_stopwords)
test['text'] = test['text'].apply(remove_stopwords)

In [None]:
# Separate the input text (X) from the language labels (y)
X, y = train['text'], train['lang_id']

In [None]:
# Preview the training rows after the stop-word pass
train.head()

## Feature Extraction (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def preprocess_data(train, test):
    """Vectorize the text of both splits with a character-level TF-IDF model.

    The vectorizer is fitted on the training text only and then reused to
    transform the test text, so both matrices share one vocabulary.

    Args:
        train: DataFrame with a ``'text'`` column used to fit the vectorizer.
        test: DataFrame with a ``'text'`` column to transform.

    Returns:
        A tuple ``(train_features, test_features, vectorizer)`` holding the
        TF-IDF matrices for both splits and the fitted ``TfidfVectorizer``.
    """
    # `stop_words` only applies to the word analyzer; combined with
    # analyzer='char' scikit-learn ignores it and emits a warning, so it is
    # not passed here.
    vectorizer = TfidfVectorizer(analyzer='char')
    # fit_transform learns the vocabulary/IDF and vectorizes the training
    # text in a single pass instead of a separate fit() and transform().
    train_features = vectorizer.fit_transform(train['text'])
    test_features = vectorizer.transform(test['text'])
    return train_features, test_features, vectorizer
# Vectorize the cleaned train/test text; the fitted vectorizer is kept so the
# same vocabulary can be applied to text again later in the notebook.
train_features, test_features, vectorizer = preprocess_data(train, test)

## Training and Evaluation

### Logistic Regression

In [None]:
# Hold out 20% of the vectorized training rows for validation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    train['lang_id'],
    test_size=0.2,
    random_state=42,
)
# fit() returns the estimator itself, so construction and training chain
lr_model = LogisticRegression().fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
# Weighted F1 on the held-out rows
lr_f1 = f1_score(y_test, lr_preds, average='weighted')

print("Logistic Regression F1 Score:", lr_f1)

### K Nearest Neighbors (KNN)

In [None]:
# Train a default k-nearest-neighbours classifier and score it on the held-out rows
knn_model = KNeighborsClassifier().fit(X_train, y_train)
knn_preds = knn_model.predict(X_test)
knn_f1 = f1_score(y_test, knn_preds, average='weighted')

print("KNN F1 Score:", knn_f1)

### Support Vector Machine

In [None]:
# Train a default support vector classifier and score it on the held-out rows
svm = SVC().fit(X_train, y_train)
svm_predictions = svm.predict(X_test)
svm_f1 = f1_score(y_test, svm_predictions, average='weighted')
print("SVM F1 Score:", svm_f1)

### Naive Bayes

In [None]:
# Train a default multinomial Naive Bayes model and score it on the held-out rows
nb = MultinomialNB().fit(X_train, y_train)
nb_predictions = nb.predict(X_test)
nb_f1 = f1_score(y_test, nb_predictions, average='weighted')
print("Naive Bayes F1 Score:", nb_f1)

## Generate predictions on the test set

In [None]:
# Converting the test data into TF-IDF vectors.
# A dedicated name is used so the held-out validation matrix `X_test` (which
# pairs with `y_test`) is not overwritten; otherwise re-running any
# evaluation cell afterwards would score competition rows against
# validation labels.
X_submission = vectorizer.transform(test['text'])

# Generating predictions with the SVM classifier chosen for the submission
test_predictions = svm.predict(X_submission)

## Creating a csv for submission

In [None]:
# Pair each test row's 'index' with its predicted 'lang_id'
submission_df = test[['index']].assign(lang_id=test_predictions)

# Write the two columns only; the DataFrame's own row index is not exported
submission_df.to_csv('submission.csv', index=False)