In an effort to enhance the accuracy of our model, we will augment the training set by incorporating additional attributes, such as the word count of each sentence or its complexity. This expanded feature set aims to provide the model with richer information, ultimately contributing to improved performance in our text classification task.

imports

In [23]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import string
import numpy as np
import spacy
nlp = spacy.load('fr_core_news_sm')


from spellchecker import SpellChecker
from spellchecker import SpellChecker
spacy.cli.download("fr_core_news_sm")
nlp = spacy.load("fr_core_news_sm")

[nltk_data] Downloading package stopwords to /Users/phil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/phil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Collecting fr-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.5.0/fr_core_news_sm-3.5.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [24]:
# Specify the file path
file_path_training = "../Dataset/training_data.csv"
file_path_test = "../Dataset/unlabelled_test_data.csv"


# Read the CSV file
training_data = pd.read_csv(file_path_training, index_col=0)
training_data.head()

Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
0,Nous dûmes nous excuser des propos que nous eû...
1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,"Et, paradoxalement, boire froid n'est pas la b..."
3,"Ce n'est pas étonnant, car c'est une saison my..."
4,"Le corps de Golo lui-même, d'une essence aussi..."


**SPELL CHECKING**

In [25]:
import language_tool_python

# Initialisation des outils
tool = language_tool_python.LanguageTool('fr')

def evaluer_orthographe_syntaxe(texte):
    # Vérification avec LanguageTool
    erreurs_language_tool = tool.check(texte)

    # Compter les différents types d'erreurs
    erreurs_orthographe = sum(1 for erreur in erreurs_language_tool if 'ORTHOGRAPH' in erreur.ruleId)
    erreurs_grammaire = sum(1 for erreur in erreurs_language_tool if 'GRAMMAR' in erreur.ruleId)

    # Analyse syntaxique avec spaCy
    doc = nlp(texte)
    erreurs_syntaxe = sum(1 for token in doc if token.dep_ == "nsubj" and token.head.pos_ != 'VERB')

    # Calcul de la note
    seuil_minimal_mots = 5
    nombre_mots = max(len(texte.split()), seuil_minimal_mots)
    poids_orthographe = 1.0  # Ajuster selon l'importance relative
    poids_grammaire = 1.5  # Les erreurs grammaticales peuvent être plus graves
    note_globale = max(1 - ((erreurs_orthographe * poids_orthographe + erreurs_grammaire * poids_grammaire + erreurs_syntaxe) / nombre_mots), 0)

    return note_globale

# Exemple d'utilisation
training_data['note_orthographe'] = training_data['sentence'].apply(evaluer_orthographe_syntaxe)

**Diversite_lexicale_complexite**

The provided code defines a set of functions to preprocess and analyze text data's lexical diversity and complexity. The preprocess_text function tokenizes a given text, converts words to lowercase. The diversite_lexicale_complexite function calculates the lexical diversity and complexity of a text, considering factors like the average length of words and phrases. 

The resulting complexity values are stored in a new column named 'lexical_complexite.' Overall, this code aids in extracting linguistic features and assessing the linguistic richness and complexity of French text data.

In [26]:
# Stopwords
stopwords_french = set(stopwords.words('french'))

def preprocess_text(texte, remove_stopwords=True):
    if not isinstance(texte, str):
        raise ValueError("Le texte doit être une chaîne de caractères.")

    mots = word_tokenize(texte, language='french')
    mots_low = [mot.lower() for mot in mots if mot.isalpha()] 

    if remove_stopwords:
        mots_low = [mot for mot in mots_low if mot not in stopwords_french]

    return mots_low

def diversite_lexicale_complexite(texte, remove_stopwords=True):
    phrases = sent_tokenize(texte, language='french')
    mots_low = preprocess_text(texte, remove_stopwords)
    
    if not mots_low or not phrases:
        return float(0) 
    
    nb_mots = len(mots_low)
    nb_phrases = len(phrases)
    longueur_moyenne_mot = sum(len(mot) for mot in mots_low) / nb_mots
    longueur_moyenne_phrase = sum(len(phrase.split()) for phrase in phrases) / nb_phrases

    lexical_diversity = len(set(mots_low)) / nb_mots

    # Complexity factor based on average word and sentence length
    complexite = lexical_diversity * (1 + (longueur_moyenne_mot / 5)) * (1 + (longueur_moyenne_phrase / 10))
    
    return complexite

training_data['lexical_complexite'] = training_data['sentence'].apply(diversite_lexicale_complexite)

In [27]:
training_data.head()

Unnamed: 0_level_0,sentence,note_orthographe,lexical_complexite
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Nous dûmes nous excuser des propos que nous eû...,0.9,4.7
1,Vous ne pouvez pas savoir le plaisir que j'ai ...,1.0,5.485714
2,"Et, paradoxalement, boire froid n'est pas la b...",1.0,4.56
3,"Ce n'est pas étonnant, car c'est une saison my...",0.888889,4.56
4,"Le corps de Golo lui-même, d'une essence aussi...",1.0,18.204


**OTHER ATTRIBUTES**

This code offers a set of functions to extract linguistic features, including sentence and word metrics, lexical complexity, and POS tagging distribution.

In [28]:
# SENTENCE LENGTH
def sentence_length(sentence):
    return len(sentence.split())

# WORD LENGTH
def average_word_length(sentence):
    words = sentence.split()
    return np.mean([len(word) for word in words]) if words else 0

def type_token_ratio(sentence):
    words = sentence.split()
    return len(set(words)) / len(words) if words else 0

# COMPLEXITE LEXICALE
def complexite_texte(texte):
    doc = nlp(texte)

    # Syntactic measurements
    nb_phrases = len(list(doc.sents))
    profondeur_moyenne = sum(len(list(phrase.root.subtree)) for phrase in doc.sents) / nb_phrases if nb_phrases > 0 else 0

    # Grammatical measures
    temps_verbaux = {mot.tag_: 0 for mot in doc if mot.tag_ and "VERB" in mot.tag_}
    for mot in doc:
        if mot.tag_ and "VERB" in mot.tag_:
            temps_verbaux[mot.tag_] += 1
    diversite_temps_verbaux = len(temps_verbaux)

    complexite = profondeur_moyenne + diversite_temps_verbaux

    return complexite

# POS TAGGING
def pos_tag_distribution(sentence):
    if not isinstance(sentence, str):
        raise ValueError("L'entrée doit être une chaîne de caractères.")

    doc = nlp(sentence)
    pos_counts = {pos: 0 for pos in [token.pos_ for token in doc]}  

    for token in doc:
        pos = token.pos_
        pos_counts[pos] += 1

    # Optional: normalize by total number of words
    total_mots = len(doc)
    if total_mots > 0:
        pos_counts_normalized = {pos: count / total_mots for pos, count in pos_counts.items()}
        return pos_counts_normalized

    return pos_counts

In [29]:
training_data['char_length'] = training_data['sentence'].apply(len)
training_data['word_length'] = training_data['sentence'].apply(lambda x: len(x.split()))
training_data['type_token_ratio'] = training_data['sentence'].apply(type_token_ratio)

training_data['sentence_length'] = training_data['sentence'].apply(sentence_length)
training_data['avg_word_length'] = training_data['sentence'].apply(average_word_length)
training_data['complexite_texte'] = training_data['sentence'].apply(complexite_texte)
training_data['pos_tags'] = training_data['sentence'].apply(pos_tag_distribution)

In [30]:
unique_pos_tags = set()
for pos_tags_dict in training_data['pos_tags']:
    unique_pos_tags.update(pos_tags_dict.keys())

# Initialize columns for each POS tag with default value 0
for tag in ['PUNCT', 'ADV', 'CCONJ', 'X', 'AUX', 'DET', 'PRON', 'NUM', 'NOUN', 'INTJ', 'ADP', 'ADJ', 'VERB', 'PROPN', 'SCONJ']:
    training_data[tag] = 0

# Populate the columns with counts
for index, row in training_data.iterrows():
    for tag, count in row['pos_tags'].items():
        if tag in training_data.columns:
            training_data.at[index, tag] = count

training_data = training_data.drop(['pos_tags'], axis=1)

  training_data.at[index, tag] = count
  training_data.at[index, tag] = count
  training_data.at[index, tag] = count
  training_data.at[index, tag] = count
  training_data.at[index, tag] = count
  training_data.at[index, tag] = count
  training_data.at[index, tag] = count
  training_data.at[index, tag] = count
  training_data.at[index, tag] = count
  training_data.at[index, tag] = count


In [31]:
training_data.head()

Unnamed: 0_level_0,sentence,note_orthographe,lexical_complexite,char_length,word_length,type_token_ratio,sentence_length,avg_word_length,complexite_texte,PUNCT,...,DET,PRON,NUM,NOUN,INTJ,ADP,ADJ,VERB,PROPN,SCONJ
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Nous dûmes nous excuser des propos que nous eû...,0.9,4.7,59,10,0.9,10,5.0,11.0,0.0,...,0.0,0.4,0.0,0.1,0,0.1,0.1,0.3,0.0,0.0
1,Vous ne pouvez pas savoir le plaisir que j'ai ...,1.0,5.485714,79,14,1.0,14,4.714286,17.0,0.0625,...,0.125,0.1875,0.0,0.0625,0,0.0625,0.125,0.25,0.0,0.0
2,"Et, paradoxalement, boire froid n'est pas la b...",1.0,4.56,58,9,1.0,9,5.555556,14.0,0.230769,...,0.076923,0.0,0.0,0.076923,0,0.0,0.153846,0.076923,0.0,0.0
3,"Ce n'est pas étonnant, car c'est une saison my...",0.888889,4.56,55,9,1.0,9,5.222222,12.0,0.083333,...,0.083333,0.166667,0.0,0.083333,0,0.0,0.166667,0.0,0.0,0.0
4,"Le corps de Golo lui-même, d'une essence aussi...",1.0,18.204,460,72,0.791667,72,5.402778,84.0,0.072289,...,0.156627,0.120482,0.0,0.156627,0,0.120482,0.084337,0.096386,0.012048,0.036145


NORMALIZATION

In [32]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

if 'difficulty' in training_data.columns:
    label_encoder = LabelEncoder()
    training_data['difficulty'] = label_encoder.fit_transform(training_data['difficulty'])

numerical_features = ['lexical_complexite', 'note_orthographe', 'char_length', 'word_length', 'type_token_ratio', 'sentence_length', 'avg_word_length', 'complexite_texte']

# scaler MinMax
scaler = MinMaxScaler()

training_data[numerical_features] = scaler.fit_transform(training_data[numerical_features])


**Save File --> csv**

In [33]:
training_data.to_csv('training_dataUP.csv')