### **1. Dataset loading**


In [None]:
import csv
import pandas as pd

In [None]:
file_path_train = "../data/train_submission_1.csv"
file_path_test = "../data/test_without_labels_1.csv"

data_train = pd.read_csv(file_path_train)

### **2. Train dataset analysis**


**2.1 Non-labeled data suppression**

In [None]:

data_train_without_nan_for_label = data_train.dropna() # suppression des données non labellisées

**2.2 Train dataset preprocessing**

In [None]:
import unicodedata
from collections import defaultdict
import string
import re 
import unicodedata
from tqdm import tqdm
import numpy as np 


In [None]:
def cleaning(text): 
    """
    Fonction pour pré-traiter le texte en enlevant tous les éléments de ponctuation, les chiffres, les double espaces, les URL etc. 
    """

    if not isinstance(text, str):
        return ""

    text = re.sub(r"https?://[^\s]+|www\.[^\s]+", " ", text) # Delete URLs
    text = re.sub(r"\d+", " ", text)  # Delete isolated numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  
    text = ''.join(c for c in text if not (unicodedata.category(c).startswith('P') or unicodedata.category(c) in ['No']))
    text = re.sub(r'[\-\u2010-\u2015]', '', text) # Delete all types of dashes
    text = re.sub(r"['\"‘’‚‛“”„‟‹›«»⹂⹃⸂⸃⸄⸅⸉⸊「」『』〝〞＂ˮ`´ʹʺʻʼʽʾʿˈˊˋ]", '', text)
    text = re.sub(r"\s+", " ", text).strip() # Delete double spaces

    text_cleaned = text.lower() 

    return(text_cleaned)


Multi-alphabet management for every given label

In [None]:
SCRIPT_MAP = {
    "LATIN": "Latin",
    "CYRILLIC": "Cyrillique",
    "ARABIC": "Arabe",
    "HEBREW": "Hébreu",
    "GREEK": "Grec",
    "DEVANAGARI": "Devanagari (Hindi, Sanskrit)",
    "HIRAGANA": "Hiragana (Japonais)",
    "KATAKANA": "Katakana (Japonais)",
    "CJK": "Kanji (Chinois, Japonais, Coréen)",
    "HANGUL": "Hangul (Coréen)",
    "THAI": "Thaï",
    "ARMENIAN": "Arménien",
    "GEORGIAN": "Géorgien",
    "ETHIOPIC": "Éthiopien",
    "TAMIL": "Tamoul",
    "BENGALI": "Bengali",
    "TELUGU": "Télougou",
}

In [None]:
def count_alphabet_characters(text):
    script_counts = defaultdict(int)

    for char in text:
        if char.isalpha():  
            try:
                char_name = unicodedata.name(char) 
                script_key = char_name.split()[0]  # Take the first word of Unicode character
                
                if "CJK" in char_name:
                    script_key = "CJK"  # Special case for kanjis
                
                script_name = SCRIPT_MAP.get(script_key, script_key)
                script_counts[script_name] += 1  
                
            except ValueError:
                continue  # If char does not have Unicode name
    
    return dict(script_counts) 


def most_frequent_script(text):
    script_counts = count_alphabet_characters(text) 
    
    if script_counts:  
        most_common_script = max(script_counts.items(), key=lambda x: x[1])  
        return most_common_script  
    else:
        return None  

def add_alphabet_to_label(df):
    for index, row in tqdm(df.iterrows(), total=len(df)): 
        alphabet_most_frequent = most_frequent_script(row['Text']) 
        
        if alphabet_most_frequent:  
            df.at[index, 'Label'] = f"{row['Label']}_{alphabet_most_frequent[0]}" 
    
    return df


**2.3 Final preprocessing function definition**

In [None]:
 
def pre_processing(df, remove_espace = True, need_to_clean = True): 
    """
    Utilisation des méthodes de pré-traitement définies auparavant pour rendre le texte propre. 
    """

    if need_to_clean: 
        df['Text'] = df['Text'].apply(cleaning)
    
    if remove_espace: 
        df['Text'] = df['Text'].str.replace(' ', '', regex=False)

    df['Text'] = df['Text'].apply(add_alphabet_to_label)
    
    return df

**2.4 Util functions definition to diplay results**

In [None]:
def restore_original_label(label):
    
    return label.split("_")[0]  

def restore_labels(liste):
    new_liste = []
    for element in tqdm(liste): 
        new_liste.append(restore_original_label(element))

    return np.array(new_liste)

### **3. First classification approach**
- TF-IDF vectorizer
- MultinomialNB as a classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
tqdm.pandas()

In [None]:
train_set, val_set = train_test_split(data_train_without_nan_for_label, test_size=0.1, random_state=42)

train_set_first_version = train_set.copy()
val_set_first_version = val_set.copy()

train_set_first_version = pre_processing(train_set_first_version, remove_espace=False) 
val_set_first_version = pre_processing(val_set_first_version, remove_espace=False)

100%|██████████| 171089/171089 [00:06<00:00, 26524.87it/s]
100%|██████████| 19010/19010 [00:00<00:00, 25265.55it/s]


**3.1 Model training**

In [None]:
val_set_first_version.head(5)

Unnamed: 0,Usage,Text,Label
128184,Public,apărând din față întrun exercițiu apăsând rolu...,ron_Latin
95049,Public,sa kaya vua ko jisu sa volai talega mo kakua n...,fij_Latin
170377,Public,אין צוואנציק יאר האט דאס דארף פארוואנדלט אין א...,yid_Hébreu
171119,Public,seniň sözüňe gulak assa bu doganyňy gazandygyň...,tuk_Latin
62238,Public,ala sma fu grontapu o kon na en fesi dan a o p...,srn_Latin


In [None]:
x_train = train_set_first_version['Text'].tolist()
y_train = train_set_first_version['Label'].tolist()
x_val = val_set_first_version['Text'].tolist()
y_val = val_set_first_version['Label'].tolist()
y_total = y_train + y_val

# Converting categorical variables to numerical
le = LabelEncoder()
le.fit(y_train)

y_train = le.transform(y_train)
y_val = le.transform(y_val)
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))

In [None]:
# Pipeline with progression bars
pipe = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer="char", ngram_range=(1, 5), max_features=200000)),
    ('classifier', MultinomialNB(alpha=0.001, fit_prior=False))
])

# Usage 
pipe.fit(x_train, y_train)

**3.2 Evaluation**

In [None]:
from sklearn.metrics import accuracy_score, classification_report

Computing accuracy for out tuned labels

In [None]:
predictions = pipe.predict(x_val)
accuracy = accuracy_score(y_val, predictions)
print("Raw predictions accuracy:", accuracy)

Accuracy: 0.8514466070489216


Restore original labels and compute final accuracy

In [None]:
predicted_labels = le.inverse_transform(predictions)
labels_predict = le.inverse_transform(y_val)

final_pred = restore_labels(predicted_labels)
val_predict = restore_labels(labels_predict)
final_accuracy = accuracy_score(val_predict, final_pred)

print("Final accuracy:", final_accuracy)

Results analysis

In [None]:
# Generate the report with class indices
present_classes = np.unique(np.concatenate((y_val, predictions)))
filtered_target_names = [le.classes_[i] for i in present_classes]
report = classification_report(y_val, predictions, target_names=filtered_target_names, output_dict=True)

# Filter out non-class entries and sort ('accuracy', 'macro avg', 'weighted avg')
filtered_report = {label: metrics for label, metrics in report.items() if isinstance(metrics, dict)}
sorted_report = sorted(filtered_report.items(), key=lambda x: x[1]['f1-score'], reverse=True)

print("Classification Report:\n")
for label, metrics in sorted_report:
    print(f"{label}: f1 = {metrics['f1-score']:.2f}, Pre = {metrics['precision']:.2f}, Rec = {metrics['recall']:.2f}, Support = {metrics['support']:0f}")


Classification Report (trié par F1-score décroissant):

abk_Cyrillique: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 49.0
ahk_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 48.0
alt_Cyrillique: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 48.0
aoj_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 45.0
arn_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 63.0
asm_Bengali: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 44.0
bem_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 38.0
bpy_Bengali: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 48.0
bqc_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 45.0
bzj_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 43.0
cab_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 45.0
cak_Latin: F1-score = 1.0000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Extract labels that have lower than 0.5 F1-score
labels_to_improve = {label: metrics for label, metrics in filtered_report.items() if metrics['f1-score'] < 0.5}
labels_to_improve

{np.str_('acm_Arabe'): {'precision': 0.5806451612903226,
  'recall': 0.32727272727272727,
  'f1-score': 0.4186046511627907,
  'support': 55.0},
 np.str_('afb_Arabe'): {'precision': 0.34210526315789475,
  'recall': 0.2708333333333333,
  'f1-score': 0.3023255813953488,
  'support': 48.0},
 np.str_('ajp_Arabe'): {'precision': 0.3090909090909091,
  'recall': 0.4857142857142857,
  'f1-score': 0.37777777777777777,
  'support': 35.0},
 np.str_('als_Latin'): {'precision': 0.5652173913043478,
  'recall': 0.325,
  'f1-score': 0.4126984126984127,
  'support': 40.0},
 np.str_('apc_Arabe'): {'precision': 0.4153846153846154,
  'recall': 0.5192307692307693,
  'f1-score': 0.46153846153846156,
  'support': 52.0},
 np.str_('ara_Arabe'): {'precision': 0.3486238532110092,
  'recall': 0.7037037037037037,
  'f1-score': 0.4662576687116564,
  'support': 54.0},
 np.str_('ary_Arabe'): {'precision': 0.47058823529411764,
  'recall': 0.4444444444444444,
  'f1-score': 0.45714285714285713,
  'support': 54.0},
 np.st

### **4. Second approach**
- SentencePiece as a tokenizer
- TF-IDF vectorizer
- MultinomialNB as a classifier


**4.1 SentencePiece training**

In [None]:
import sentencepiece as spm

Raw txt file generation to train SentencePiece

In [None]:
corpus_path = "corpus_multilingue.txt" 
data_train_preprocessed_for_corpus = data_train.copy()
data_train_preprocessed_for_corpus = pre_processing(data_train_preprocessed_for_corpus, remove_espace=False)
data_train_preprocessed_for_corpus["Text"].dropna().to_csv(corpus_path, index=False, header=False, sep="\n")

print(f"Saved corpus : {corpus_path}, with {len(data_train)} sentences.")

Corpus enregistré : corpus_multilingue.txt, avec 190599 phrases.


Model loading and training

In [None]:
spm.SentencePieceTrainer.Train(
    input='./corpus_multilingue.txt',  
    model_prefix='sp_model',
    vocab_size=60000,  
    character_coverage=1.0,  
    model_type='unigram'  
)

sp = spm.SentencePieceProcessor(model_file='sp_model.model')

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./corpus_multilingue.txt
  input_format: 
  model_prefix: sp_model
  model_type: UNIGRAM
  vocab_size: 60000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0

**4.2 Preprocessing applying tokenization**

In [None]:
def sentencepiece_tokenize(text):
    """Tokenise un texte en sous-mots avec SentencePiece"""
    
    return ' '.join(sp.encode(text, out_type=str))

In [None]:
train_set_second_version = train_set.copy()
val_set_second_version = val_set.copy()

train_set_second_version = add_alphabet_to_label(train_set_second_version)
val_set_second_version = add_alphabet_to_label(val_set_second_version)

train_set_second_version['Text'] = train_set_second_version['Text'].progress_apply(sentencepiece_tokenize)
val_set_second_version['Text'] = val_set_second_version['Text'].progress_apply(sentencepiece_tokenize)

**4.3 Training**\
\
Same training as first approach

In [None]:
vectorizer_sp = TfidfVectorizer(analyzer="char", ngram_range=(1, 4), max_features=200000)
naive_bayes_sp = MultinomialNB(alpha= 0.001, fit_prior = False) 

pipeline = Pipeline([
    ('tfidf', vectorizer_sp),
    ('mnb', naive_bayes_sp)
])

x_train_sp = train_set_second_version['Text'].tolist()
y_train_sp = train_set_second_version['Label'].tolist()
x_val_sp = val_set_second_version['Text'].tolist()
y_val_sp = val_set_second_version['Label'].tolist()
y_total_sp = y_train_sp + y_val_sp

# converting categorical variables to numerical
from sklearn.preprocessing import LabelEncoder
le_sp = LabelEncoder()
le_sp.fit(y_total_sp)

y_train_sp = le_sp.transform(y_train_sp)
y_val_sp = le_sp.transform(y_val_sp)
label_mapping = dict(zip(le_sp.classes_, range(len(le_sp.classes_))))


pipeline.fit(x_train_sp, y_train_sp)

**4.4 Predictions**

In [None]:
predictions_sp = pipeline.predict(x_val_sp)
accuracy_sp = accuracy_score(y_val_sp, predictions_sp)
print("Raw accuracy:", accuracy_sp)

predicted_labels_sp = le_sp.inverse_transform(predictions_sp)
labels_to_predict = le_sp.inverse_transform(y_val_sp)

final_prediction = restore_labels(predicted_labels_sp)
val_to_predict = restore_labels(labels_to_predict)
final_accuracy = accuracy_score(val_to_predict, final_prediction)

print("Final accuracy:", final_accuracy)

Accuracy: 0.8531825355076276


**4.5 Results analysis**

In [None]:
# Generate the report with class indices
present_classes_sp = np.unique(np.concatenate((y_val_sp, predictions_sp)))
filtered_target_names_sp = [le_sp.classes_[i] for i in present_classes_sp]
report_sp = classification_report(y_val_sp, predictions_sp, target_names=filtered_target_names_sp, output_dict=True)

# Filter out non-class entries and sort ('accuracy', 'macro avg', 'weighted avg')
filtered_report = {label: metrics for label, metrics in report_sp.items() if isinstance(metrics, dict)}
sorted_report = sorted(filtered_report.items(), key=lambda x: x[1]['f1-score'], reverse=True)


print("Classification Report:\n")
for label, metrics in sorted_report:
    print(f"{label}: f1 = {metrics['f1-score']:.2f}, Pre = {metrics['precision']:.2f}, Rec = {metrics['recall']:.2f}, Support = {metrics['support']:0f}")


Classification Report (trié par F1-score décroissant):

abk_Cyrillique: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 49.0
ach_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 8.0
ahk_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 48.0
alt_Cyrillique: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 48.0
aoj_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 45.0
arn_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 63.0
asm_Bengali: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 44.0
bem_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 38.0
bpy_Bengali: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 48.0
bqc_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 45.0
bzj_Latin: F1-score = 1.0000, Precision = 1.0000, Recall = 1.0000, Support = 43.0
cab_Latin: F1-score = 1.0000,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
val_set_second_version[val_set_second_version['Label'] == 'bos_Latin'].head()

Unnamed: 0,Usage,Text,Label
34789,Public,▁ B. ▁ 2 4SI ▁ - ▁ I stra ži vanje ▁od s je ka...,bos_Latin
187584,Public,▁ I ▁svak og ▁dana ▁na kon ▁to ga ▁je ▁laga no...,bos_Latin
172443,Public,▁ S lu ša j ! ▁ G de ▁je ▁čo ve k ▁koji ▁je ▁p...,bos_Latin
1390,Public,▁ E ki pa ▁ J a pan ske ▁se ▁sa sto ja la ▁od ...,bos_Latin
49647,Public,▁ Ž ene ▁će ▁pu ca ti ▁ako ▁su ▁iz gu bile ▁si...,bos_Latin
184506,Public,▁ “ C ije na ▁ide ▁za ▁jedn u ▁mar ku ▁gore .,bos_Latin
101390,Public,▁ B a sic ▁je ▁program ski ▁je zik . ▁ P r va ...,bos_Latin
38189,Public,"▁ ""R ef ere ndum ▁kao ▁i ▁svak o ▁drug o ▁iz r...",bos_Latin
90005,Public,"▁ ""R o dos lo vi ▁i ▁istori ja ▁ve lik ih ▁ku ...",bos_Latin
158661,Public,▁ Ž el iš ▁me ▁pita ti ▁mo žeš ▁li ▁do bi ti ▁...,bos_Latin


### **5. GridSearch to choose the best model**

- The use of GridSearch is highly resource-intensive.
- Therefore, we reduce the dimensionality of the vectors (max_features in TF-IDF) to 2000 and select only models with reasonable complexity for 𝑛≈190,000 and 𝑑≈2000
- This provides a proxy for the best model-hyperparameter combination among these lightweight options.
- We do not consider adding the alphabet here, as it creates issues with excessively small classes in cross-validation.

In [None]:
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

Removing extremely underrepresented languages (<10 instances)

In [None]:
dataset_sorted_by_number_instances_by_language = data_train_without_nan_for_label.groupby("Label").count().sort_values('Usage', ascending=True)
dataset_sorted_by_number_instances_by_language.head()
df = dataset_sorted_by_number_instances_by_language
underrepresented_languages = list(df[df['Text']<10].index)
data_train_without_nan_for_label = data_train_without_nan_for_label[~data_train_without_nan_for_label.isin(underrepresented_languages)]

In [None]:
train_set, val_set = train_test_split(data_train_without_nan_for_label, test_size=0.1, random_state=42)

train_set_third_version = train_set.copy()
val_set_third_version = val_set.copy()

train_set_third_version = pre_processing(train_set_third_version, remove_espace=False, not_test=False) 
val_set_third_version = pre_processing(val_set_third_version, remove_espace=False, not_test=False)

In [None]:
x_train = train_set_third_version['Text'].tolist()
y_train = train_set_third_version['Label'].tolist()
x_val = val_set_third_version['Text'].tolist()
y_val = val_set_third_version['Label'].tolist()
y_total = y_train + y_val

le = LabelEncoder()
le.fit(y_total)

y_train = le.transform(y_train)
y_val = le.transform(y_val)
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))

In [None]:
param_grid = [
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.001, 0.01, 0.1, 1.0],
        'classifier__fit_prior': [True, False]
    },
    {
        'classifier': [SGDClassifier(max_iter=1000, tol=1e-3)],
        'classifier__loss': ['log_loss'],  # Régression logistique
        'classifier__alpha': [1e-4, 1e-3, 1e-2],
        'classifier__penalty': ['l2']
    }
] 

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer="char", ngram_range=(1, 5), max_features=2000)), # Max_features to 2000 to reduce computing time
    ('classifier', MultinomialNB()) 
])

grid_search = GridSearchCV(pipeline, param_grid, cv=2, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(x_train, y_train)

best_model = grid_search.best_estimator_
print(f"Best model : {best_model}")

predictions = best_model.predict(x_val)
accuracy = accuracy_score(y_val, predictions)

print("Best hyperparameters:", grid_search.best_params_)
print("Accuracy on validation set:", accuracy)

Fitting 2 folds for each of 11 candidates, totalling 22 fits
[CV] END classifier=MultinomialNB(), classifier__alpha=0.01, classifier__fit_prior=True; total time= 3.7min
[CV] END classifier=MultinomialNB(), classifier__alpha=0.001, classifier__fit_prior=False; total time= 3.7min
[CV] END classifier=MultinomialNB(), classifier__alpha=0.01, classifier__fit_prior=False; total time= 3.8min
[CV] END classifier=MultinomialNB(), classifier__alpha=0.001, classifier__fit_prior=False; total time= 3.8min
[CV] END classifier=MultinomialNB(), classifier__alpha=0.01, classifier__fit_prior=False; total time= 3.8min
[CV] END classifier=MultinomialNB(), classifier__alpha=0.001, classifier__fit_prior=True; total time= 3.8min
[CV] END classifier=MultinomialNB(), classifier__alpha=0.01, classifier__fit_prior=True; total time= 3.8min
[CV] END classifier=MultinomialNB(), classifier__alpha=0.001, classifier__fit_prior=True; total time= 3.8min
[CV] END classifier=MultinomialNB(), classifier__alpha=0.1, classif

In [None]:
pd.set_option('display.max_colwidth', None) 

results_df = pd.DataFrame(grid_search.cv_results_)
accuracy_results = results_df[['params', 'mean_test_score', 'std_test_score']]
accuracy_results = accuracy_results.sort_values(by='mean_test_score', ascending=False)
accuracy_results.head()

accuracy_results.to_csv("grid_search_results.csv", index=False)

Unnamed: 0,params,mean_test_score,std_test_score
2,"{'classifier': MultinomialNB(), 'classifier__alpha': 0.01, 'classifier__fit_prior': True}",0.744373,0.00226
4,"{'classifier': MultinomialNB(), 'classifier__alpha': 0.1, 'classifier__fit_prior': True}",0.742877,0.002506
0,"{'classifier': MultinomialNB(), 'classifier__alpha': 0.001, 'classifier__fit_prior': True}",0.742239,0.001927
3,"{'classifier': MultinomialNB(), 'classifier__alpha': 0.01, 'classifier__fit_prior': False}",0.74214,0.002646
5,"{'classifier': MultinomialNB(), 'classifier__alpha': 0.1, 'classifier__fit_prior': False}",0.740346,0.00236
1,"{'classifier': MultinomialNB(), 'classifier__alpha': 0.001, 'classifier__fit_prior': False}",0.739299,0.001805
6,"{'classifier': MultinomialNB(), 'classifier__alpha': 1.0, 'classifier__fit_prior': True}",0.732058,0.002114
7,"{'classifier': MultinomialNB(), 'classifier__alpha': 1.0, 'classifier__fit_prior': False}",0.726651,0.002599
8,"{'classifier': SGDClassifier(), 'classifier__alpha': 0.0001, 'classifier__loss': 'log_loss', 'classifier__penalty': 'l2'}",0.717007,0.001875
9,"{'classifier': SGDClassifier(), 'classifier__alpha': 0.001, 'classifier__loss': 'log_loss', 'classifier__penalty': 'l2'}",0.644156,0.005147


In [94]:
accuracy_results.to_csv("grid_search_results.csv", index=False)

### **6. Submission**

In [None]:
train = data_train_without_nan_for_label.copy()
train = pre_processing(train, remove_espace=False)

x = train['Text'].tolist()
y = train['Label'].tolist()

vectorizer= TfidfVectorizer(analyzer="char", ngram_range=(1, 5), max_features=200000)
best_model = grid_search.best_estimator_['classifier']

best_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('classifier', best_model)
])

le_test = LabelEncoder()
y = le_test.fit_transform(y)
label_mapping_test = dict(zip(le_test.classes_, range(len(le_test.classes_))))

best_pipeline.fit(x, y)

100%|██████████| 190099/190099 [00:07<00:00, 26369.50it/s]


In [None]:

data_test= pd.read_csv(file_path_test)
test_set = pre_processing(data_test, remove_espace=False, not_test=False)
x_test = test_set['Text'].tolist()

predictions_test = best_pipeline.predict(x_test)
predicted_labels_test = le_test.inverse_transform(predictions_test)
predicted_labels_test = restore_labels(predicted_labels_test)
test_set['Label'] = predicted_labels_test

column_ID = [i for i in range(1, len(test_set)+1)]
test_set['ID'] = column_ID

test_set[['ID', 'Label']].to_csv('test_set_v9_sans_tokenizer_predicted.csv', index=False)

100%|██████████| 190567/190567 [00:00<00:00, 2933874.36it/s]


## Annexe: Submition d'un modèle ad-hoc

Entraînement du modèle choisi sur tout le train set

In [89]:
from sklearn.pipeline import Pipeline

train = data_train_without_nan_for_label.copy()
train = pre_processing(train, remove_espace=False, not_test=False)
train = add_alphabet_to_label(train)
# train['Text'] = train['Text'].progress_apply(sentencepiece_tokenize)
x = train['Text'].tolist()
y = train['Label'].tolist()

vectorizer= TfidfVectorizer(analyzer="char", ngram_range=(1, 5), max_features=200000)
sgd_classifier = SGDClassifier(alpha=0.0001, loss='log_loss', penalty='l2')

best_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('mnb', sgd_classifier)
])

from sklearn.preprocessing import LabelEncoder
le_test = LabelEncoder()
y = le_test.fit_transform(y)
label_mapping_test = dict(zip(le_test.classes_, range(len(le_test.classes_))))

best_pipeline.fit(x, y)

100%|██████████| 190099/190099 [00:07<00:00, 25886.97it/s]


Prédiction des labels pour le test et génération du csv à déposer

In [90]:
data_test= pd.read_csv(file_path_test)
test_set = pre_processing(data_test, remove_espace=False, not_test=False)
# test_set['Text'] = test_set['Text'].progress_apply(sentencepiece_tokenize)

x_test = test_set['Text'].tolist()
predictions_test = best_pipeline.predict(x_test)

In [79]:
test_set.to_csv('test_preprocessed.csv', index=False)

In [91]:
predicted_labels_test = le_test.inverse_transform(predictions_test)
predicted_labels_test = restore_labels(predicted_labels_test)
test_set['Label'] = predicted_labels_test

100%|██████████| 190567/190567 [00:00<00:00, 1810442.61it/s]


In [92]:
column_ID = [i for i in range(1, len(test_set)+1)]
test_set['ID'] = column_ID

In [93]:
test_set[['ID', 'Label']].to_csv('test_set_v10_sans_tokenizer_predicted.csv', index=False)