## Load the enhanced sets

In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from ast import literal_eval

# Load the enhanced training and test sets from CSV.
# NOTE(review): these are absolute, machine-specific paths; consider moving the
# data directory into a single configurable constant so the notebook runs elsewhere.
data = pd.read_csv('/home/nathan/OneDrive/GitHub/Nvidia/data/Cleaned_Enhanced_training.csv')
test_data = pd.read_csv('/home/nathan/OneDrive/GitHub/Nvidia/data/Cleaned_Enhanced_test.csv')

# The CSV stores each embedding as the string form of a list; parse it back.
data['embeddings'] = data['embeddings'].apply(literal_eval)
test_data['embeddings'] = test_data['embeddings'].apply(literal_eval)

# The embedding width is taken from the first training row and applied to both sets.
num_embedding_features = len(data['embeddings'].iloc[0])


def _expand_embeddings(frame, num_features):
    """Replace the 'embeddings' list column with scalar columns emb_0..emb_{n-1}.

    Parameters:
        frame: DataFrame with an 'embeddings' column holding parsed sequences.
        num_features: number of leading embedding components to keep.

    Returns:
        A new DataFrame (the input is not modified) with 'embeddings' removed
        and the emb_* columns appended after the remaining columns.

    Raises:
        ValueError: if any row has fewer than `num_features` components.
    """
    remaining = frame.drop(columns='embeddings')
    if num_features == 0:
        # Nothing to expand; mirrors a zero-iteration flattening loop.
        return remaining

    vectors = frame['embeddings']
    too_short = vectors.map(len) < num_features
    if too_short.any():
        raise ValueError(
            f"{int(too_short.sum())} embedding(s) have fewer than "
            f"{num_features} components"
        )

    # Build every emb_* column in one shot instead of inserting them one by one:
    # repeated single-column inserts fragment the frame and scan the list column
    # once per dimension.
    emb_frame = pd.DataFrame(
        [vec[:num_features] for vec in vectors],
        index=frame.index,
        columns=[f'emb_{i}' for i in range(num_features)],
    )
    return pd.concat([remaining, emb_frame], axis=1)


# Flatten the embeddings, then drop the non-feature columns.
# 'difficulty' only exists in the training set, so it is dropped there only.
data = _expand_embeddings(data, num_embedding_features).drop(
    columns=['sentence', 'id', 'difficulty']
)
test_data = _expand_embeddings(test_data, num_embedding_features).drop(
    columns=['sentence', 'id']
)


In [2]:
# Preview the first five rows of the prepared training frame.
print(data.head(5))

   difficulty_encoded  LEN       AWL       TTR   ASL  AVPS  ASL.AVPS  \
0                   4   44  4.954545  0.704545  44.0   4.0     176.0   
1                   0   14  3.642857  1.000000  14.0   2.0      28.0   
2                   0   14  3.857143  0.928571  14.0   1.0      14.0   
3                   0    9  3.666667  1.000000   9.0   1.0       9.0   
4                   2   39  4.564103  0.794872  39.0   4.0     156.0   

        mtld  num_subordinate_clauses  
0  44.888889                        0  
1  54.880000                        0  
2  27.440000                        0  
3   8.000000                        0  
4  28.495275                        0  


## Prepare data for training and validation

In [3]:
# Separate the feature matrix from the encoded target label.
X = data.drop('difficulty_encoded', axis=1)
y = data['difficulty_encoded']

# Split BEFORE scaling so the validation rows do not contribute to the
# scaler's mean/variance (fitting on all rows leaks validation statistics).
# NOTE(review): test_size=0.01 leaves a very small validation set, so the
# per-class metrics computed on it will be noisy — confirm this is intended.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.01, random_state=42
)

# Fit the scaler on the training portion only, then apply it to both portions.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Kept so any cell that still refers to the fully scaled matrix keeps working;
# it now uses the training-only statistics.
X_scaled = scaler.transform(X)

## Hyperparameter tuning, training, and evaluation on the validation split

In [6]:
# Candidate hyperparameters for the RBF-kernel SVM:
#   C     - regularization parameter
#   gamma - kernel coefficient
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto'],
)

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores.
grid_search = GridSearchCV(
    estimator=SVC(kernel='rbf'),
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

# Score the refitted best model on the held-out validation split.
best_svm = grid_search.best_estimator_
val_predictions = best_svm.predict(X_val)
print(
    "Classification Report on Validation Set:",
    classification_report(y_val, val_predictions),
    sep="\n",
)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .................................C=0.1, gamma=scale; total time=   1.6s
[CV] END ..................................C=0.1, gamma=auto; total time=   1.6s
[CV] END ..................................C=0.1, gamma=auto; total time=   1.6s
[CV] END ..................................C=0.1, gamma=auto; total time=   1.8s
[CV] END .................................C=0.1, gamma=scale; total time=   1.8s
[CV] END .................................C=0.1, gamma=scale; total time=   1.8s
[CV] END .................................C=0.1, gamma=scale; total time=   2.0s
[CV] END .................................C=0.1, gamma=scale; total time=   2.1s
[CV] END ..................................C=0.1, gamma=auto; total time=   1.6s
[CV] END ..................................C=0.1, gamma=auto; total time=   1.8s
[CV] END ...................................C=1, gamma=scale; total time=   1.9s
[CV] END ...................................C=1, 

### Predictions on test dataset

In [4]:
# Prepare test data (for which we don't have labels).
# Select the training feature columns by name, in training order, so the
# scaler and model see exactly the layout they were fitted on; a missing
# column fails here with a KeyError that names it.
X_test = test_data[X.columns]
X_test_scaled = scaler.transform(X_test)  # Use the same scaler as for the training data

# Make predictions on test data
test_predictions = best_svm.predict(X_test_scaled)

# Show only a short preview instead of one line per test row; the full set of
# predictions stays available in `test_predictions`.
# Note: idx is the row position in the test frame, not the original 'id' value.
preview_rows = 10
for idx, prediction in enumerate(test_predictions[:preview_rows]):
    print(f"Test Data ID {idx}: Predicted Difficulty: {prediction}")
print(f"Showing {min(preview_rows, len(test_predictions))} of {len(test_predictions)} predictions")


NameError: name 'best_svm' is not defined

In [12]:
# Encoded class index -> CEFR level label.
cefr_mapping = {0: 'A1', 1: 'A2', 2: 'B1', 3: 'B2', 4: 'C1', 5: 'C2'}

# Read only the 'id' column (it was dropped from the feature frame) into a
# separate frame. `test_data` is deliberately left untouched so the prediction
# cell can be re-run after this one.
submission = pd.read_csv(
    '/home/nathan/OneDrive/GitHub/Nvidia/data/Cleaned_Enhanced_test.csv',
    usecols=['id'],
)

# The predictions are attached by row position, so the row counts must agree.
if len(submission) != len(test_predictions):
    raise ValueError(
        f"Got {len(test_predictions)} predictions for {len(submission)} test rows"
    )

# Translate the numeric predictions into CEFR labels.
submission['difficulty'] = pd.Series(test_predictions, index=submission.index).map(cefr_mapping)

# .map() yields NaN for any class index missing from cefr_mapping; fail loudly
# rather than writing blank labels into the submission file.
if submission['difficulty'].isna().any():
    raise ValueError("Some predictions have no entry in cefr_mapping")

# Save the 'id' and 'difficulty' columns to a new CSV file
submission.to_csv('Nvidia_submission.csv', index=False)

print("Predictions saved to Nvidia_submission.csv")

Predictions saved to Nvidia_submission.csv
