In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Specify the file path
file_path_test = "unlabelled_test_data.csv"
file_path_training = "training_data.csv"

# Read the CSV file
test_data = pd.read_csv(file_path_test)
training_data = pd.read_csv(file_path_training)

test_data.head()

Unnamed: 0,id,sentence
0,0,Nous dûmes nous excuser des propos que nous eû...
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,2,"Et, paradoxalement, boire froid n'est pas la b..."
3,3,"Ce n'est pas étonnant, car c'est une saison my..."
4,4,"Le corps de Golo lui-même, d'une essence aussi..."


In [3]:
training_data.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [4]:
# Check for null values in the training data
training_data.isnull().sum()

# Check for null values in the test data
test_data.isnull().sum()


id          0
sentence    0
dtype: int64

In [5]:
# Distribution of difficulty levels
difficulty_distribution = training_data['difficulty'].value_counts()
# Sample sentences
sample_sentences = training_data['sentence'].sample(5, random_state=42)

difficulty_distribution, sample_sentences


(difficulty
 A1    813
 C2    807
 C1    798
 B1    795
 A2    795
 B2    792
 Name: count, dtype: int64,
 596                               Je peux m'asseoir ici ?
 3370    C'est la couleur de nombreux fruits et légumes...
 3048    Pas au point qu'il faille en limiter la consom...
 2908    Les Français ne cèdent pas au chacun pour soi,...
 8       J'ai retrouvé le plaisir de manger un oeuf à l...
 Name: sentence, dtype: object)

In [6]:
# Convert everything to lowercase
training_data['sentence'] = training_data['sentence'].str.lower()

# Remove special characters and numbers
training_data['sentence'] = training_data['sentence'].str.replace('[^a-zA-Z\s]', '', regex=True)

training_data.head()


Unnamed: 0,id,sentence,difficulty
0,0,les cots kilomtriques rels peuvent diverger se...,C1
1,1,le bleu cest ma couleur prfre mais je naime pa...,A1
2,2,le test de niveau en franais est sur le site i...,A1
3,3,estce que ton mari est aussi de boston,A1
4,4,dans les coles de commerce dans les couloirs d...,B1


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text data
X = vectorizer.fit_transform(training_data['sentence'])
y = training_data['difficulty']


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

((3840, 5000), (960, 5000))

In [9]:
from sklearn.svm import SVC

# Create an instance of the SVM model
svm_model = SVC()

# Train the model on the training data
svm_model.fit(X_train, y_train)

# Make predictions on the test data
predictions = svm_model.predict(X_test)

predictions


array(['A1', 'A1', 'C2', 'B2', 'A2', 'C1', 'A2', 'A2', 'A2', 'C1', 'A1',
       'A2', 'B1', 'B2', 'C2', 'C1', 'A1', 'C2', 'A1', 'A1', 'B1', 'C1',
       'B1', 'C2', 'A1', 'A2', 'B1', 'A2', 'C2', 'A1', 'C1', 'B2', 'B1',
       'B2', 'A2', 'B2', 'A2', 'B2', 'C1', 'A2', 'C2', 'A1', 'C2', 'A1',
       'C2', 'B1', 'A1', 'B2', 'B2', 'C1', 'C1', 'A2', 'C1', 'A2', 'C1',
       'A2', 'C1', 'B2', 'A2', 'B1', 'C1', 'C2', 'A2', 'C2', 'C1', 'C1',
       'C2', 'A2', 'B2', 'B1', 'C2', 'B2', 'B1', 'B2', 'C2', 'C2', 'A2',
       'A1', 'A1', 'A1', 'B1', 'B2', 'C1', 'C1', 'C2', 'A1', 'B2', 'C2',
       'A1', 'A1', 'B2', 'C2', 'C2', 'A1', 'A1', 'B1', 'A2', 'A2', 'A2',
       'B2', 'B2', 'C2', 'C2', 'A1', 'C1', 'A1', 'A2', 'C1', 'B1', 'C2',
       'C1', 'A2', 'A2', 'A1', 'C1', 'A2', 'C1', 'B2', 'A1', 'A1', 'A1',
       'A1', 'B1', 'C2', 'C2', 'A2', 'C1', 'B2', 'B2', 'B1', 'C1', 'C1',
       'A1', 'B1', 'A1', 'B2', 'C2', 'A2', 'A1', 'C2', 'B2', 'C2', 'A2',
       'A1', 'A2', 'B1', 'C2', 'A1', 'B1', 'C1', 'C

In [10]:
from sklearn.metrics import accuracy_score

# Calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, predictions)

accuracy


0.4552083333333333

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for grid search
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}

# Create an instance of the SVM model
svm_model = SVC()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(svm_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Create a new SVM model with the best hyperparameters
svm_model = SVC(**best_params)

# Train the model on the training data
svm_model.fit(X_train, y_train)

# Make predictions on the test data
new_predictions = svm_model.predict(X_test)

new_predictions


array(['A1', 'A1', 'C2', 'B2', 'A2', 'C1', 'A2', 'A2', 'A2', 'C1', 'A1',
       'A2', 'B1', 'B2', 'C2', 'C2', 'A1', 'C2', 'A1', 'A1', 'B1', 'C1',
       'B1', 'C2', 'A1', 'A2', 'B1', 'A2', 'C2', 'A1', 'C1', 'B2', 'B1',
       'B2', 'A2', 'B2', 'A2', 'B2', 'C1', 'A1', 'C2', 'A1', 'C2', 'B1',
       'C2', 'B1', 'A1', 'B2', 'C1', 'C1', 'C1', 'A1', 'C1', 'A2', 'C1',
       'A2', 'B2', 'B2', 'A2', 'B1', 'C1', 'C2', 'A2', 'C2', 'C1', 'C1',
       'C2', 'A2', 'B2', 'B1', 'C2', 'B2', 'B1', 'B2', 'C2', 'C2', 'A1',
       'A1', 'A1', 'A1', 'A2', 'B2', 'C1', 'C1', 'C2', 'A1', 'B2', 'C1',
       'A1', 'A1', 'B1', 'C2', 'C2', 'A1', 'A1', 'B1', 'A2', 'A2', 'B1',
       'B2', 'B2', 'C2', 'C2', 'A1', 'B1', 'A1', 'A2', 'C1', 'B1', 'C2',
       'B1', 'A2', 'A2', 'A1', 'C1', 'A2', 'C1', 'B2', 'A1', 'A1', 'A1',
       'A1', 'B1', 'C2', 'C2', 'A2', 'C1', 'B2', 'B2', 'C1', 'C1', 'C1',
       'A1', 'A1', 'A1', 'B2', 'C2', 'A2', 'A1', 'C2', 'A2', 'B2', 'B1',
       'A1', 'A1', 'A2', 'C2', 'A1', 'A2', 'C1', 'C

In [12]:
from sklearn.metrics import accuracy_score

# Make predictions on the test data
predictions = svm_model.predict(X_test)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, new_predictions)

accuracy


0.46041666666666664

In [14]:
# Transform the cleaned text data of test_data
X_test_data = vectorizer.transform(test_data['sentence'])

# Make predictions on the test data
test_predictions = svm_model.predict(X_test_data)

test_predictions


array(['C2', 'B1', 'A2', ..., 'C2', 'A1', 'A2'], dtype=object)

In [16]:
# Create a DataFrame with the test_predictions
output_data = pd.DataFrame({'id': test_data['id'], 'difficulty': test_predictions})

# Save the DataFrame to a CSV file
output_data.to_csv('output.csv', index=False)
