## Load the enhanced sets

In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from ast import literal_eval

# Load data from CSV files
data = pd.read_csv('/home/nathan/OneDrive/GitHub/Nvidia/data/Cleaned_Enhanced_training.csv')
test_data = pd.read_csv('/home/nathan/OneDrive/GitHub/Nvidia/data/Cleaned_Enhanced_test.csv')

# Convert string representations of lists back to actual lists
data['embeddings'] = data['embeddings'].apply(literal_eval)
test_data['embeddings'] = test_data['embeddings'].apply(literal_eval)

# Flatten the embeddings
num_embedding_features = len(data['embeddings'].iloc[0])
for i in range(num_embedding_features):
    data[f'emb_{i}'] = data['embeddings'].apply(lambda x: x[i])
    test_data[f'emb_{i}'] = test_data['embeddings'].apply(lambda x: x[i])

k_fold = data
# Drop the original embeddings column and other non-feature columns
data.drop(['embeddings', 'sentence', 'id', 'difficulty'], axis=1, inplace=True)
test_data.drop(['embeddings', 'sentence', 'id'], axis=1, inplace=True)



  data[f'emb_{i}'] = data['embeddings'].apply(lambda x: x[i])
  test_data[f'emb_{i}'] = test_data['embeddings'].apply(lambda x: x[i])
  data[f'emb_{i}'] = data['embeddings'].apply(lambda x: x[i])
  test_data[f'emb_{i}'] = test_data['embeddings'].apply(lambda x: x[i])
  data[f'emb_{i}'] = data['embeddings'].apply(lambda x: x[i])
  test_data[f'emb_{i}'] = test_data['embeddings'].apply(lambda x: x[i])
  data[f'emb_{i}'] = data['embeddings'].apply(lambda x: x[i])
  test_data[f'emb_{i}'] = test_data['embeddings'].apply(lambda x: x[i])
  data[f'emb_{i}'] = data['embeddings'].apply(lambda x: x[i])
  test_data[f'emb_{i}'] = test_data['embeddings'].apply(lambda x: x[i])
  data[f'emb_{i}'] = data['embeddings'].apply(lambda x: x[i])
  test_data[f'emb_{i}'] = test_data['embeddings'].apply(lambda x: x[i])
  data[f'emb_{i}'] = data['embeddings'].apply(lambda x: x[i])
  test_data[f'emb_{i}'] = test_data['embeddings'].apply(lambda x: x[i])
  data[f'emb_{i}'] = data['embeddings'].apply(lambda x: x[i])


## Prepare data for training and validation

In [2]:
# Preparing the data for training and validation
X = data.drop('difficulty_encoded', axis=1)
y = data['difficulty_encoded']

# Standardizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and validation sets
#X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.01, random_state=None)

In [3]:
from sklearn.model_selection import cross_val_score


svm_model = SVC(kernel='rbf')

# Number of folds for cross-validation
k = 5

# Perform K-fold cross-validation
cv_scores = cross_val_score(svm_model, X, y, cv=k)

cv_scores.mean(), cv_scores.std()

(0.354375, 0.0060164704492473514)

## Hyperparameter tuning, training and validation of training data

In [4]:
# Define parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto'],  # Kernel coefficient
}

# Create a GridSearchCV object for an SVM with RBF kernel
grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, verbose=2, n_jobs=-1)

# Perform grid search
grid_search.fit(X_scaled, y)

# Print best parameters
print("Best parameters found: ", grid_search.best_params_)

# Use the best estimator for validation
best_svm = grid_search.best_estimator_

# Classification Metrics on Validation Set
#val_predictions = best_svm.predict(X_val)
#print("Classification Report on Validation Set:")
#print(classification_report(y_val, val_predictions))

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
[CV] END .................................C=0.1, gamma=scale; total time= 1.4min
[CV] END .................................C=0.1, gamma=scale; total time= 1.5min
[CV] END .................................C=0.1, gamma=scale; total time= 1.5min
[CV] END .................................C=0.1, gamma=scale; total time= 1.6min
[CV] END .................................C=0.1, gamma=scale; total time= 1.6min
[CV] END .................................C=0.1, gamma=scale; total time= 1.6min
[CV] END .................................C=0.1, gamma=scale; total time= 1.6min
[CV] END .................................C=0.1, gamma=scale; total time= 1.7min
[CV] END .................................C=0.1, gamma=scale; total time= 1.7min
[CV] END ..................................C=0.1, gamma=auto; total time= 1.6min
[CV] END .................................C=0.1, gamma=scale; total time= 1.7min
[CV] END ..................................C=0.1

### Predictions on test dataset

In [5]:
# Prepare test data (for which we don't have labels)
X_test = test_data
X_test_scaled = scaler.transform(X_test)  # Use the same scaler as for the training data

# Make predictions on test data
test_predictions = best_svm.predict(X_test_scaled)

# Display predictions for test data
for idx, prediction in enumerate(test_predictions):
    print(f"Test Data ID {idx}: Predicted Difficulty: {prediction}")


Test Data ID 0: Predicted Difficulty: 5
Test Data ID 1: Predicted Difficulty: 2
Test Data ID 2: Predicted Difficulty: 3
Test Data ID 3: Predicted Difficulty: 1
Test Data ID 4: Predicted Difficulty: 5
Test Data ID 5: Predicted Difficulty: 3
Test Data ID 6: Predicted Difficulty: 0
Test Data ID 7: Predicted Difficulty: 1
Test Data ID 8: Predicted Difficulty: 5
Test Data ID 9: Predicted Difficulty: 2
Test Data ID 10: Predicted Difficulty: 0
Test Data ID 11: Predicted Difficulty: 1
Test Data ID 12: Predicted Difficulty: 3
Test Data ID 13: Predicted Difficulty: 4
Test Data ID 14: Predicted Difficulty: 0
Test Data ID 15: Predicted Difficulty: 1
Test Data ID 16: Predicted Difficulty: 3
Test Data ID 17: Predicted Difficulty: 0
Test Data ID 18: Predicted Difficulty: 0
Test Data ID 19: Predicted Difficulty: 0
Test Data ID 20: Predicted Difficulty: 5
Test Data ID 21: Predicted Difficulty: 3
Test Data ID 22: Predicted Difficulty: 3
Test Data ID 23: Predicted Difficulty: 4
Test Data ID 24: Predicted

In [6]:
cefr_mapping = {0: 'A1', 1: 'A2', 2: 'B1', 3: 'B2', 4: 'C1', 5: 'C2'}

# Re-read the test_data to get the 'id' column back
test_data = pd.read_csv('/home/nathan/OneDrive/GitHub/Nvidia/data/Cleaned_Enhanced_test.csv')

# Apply the mapping to your predictions
test_data['difficulty'] = test_predictions
test_data['difficulty'] = test_data['difficulty'].map(cefr_mapping)

# Save the 'id' and 'CEFR_difficulty' columns to a new CSV file
test_data[['id', 'difficulty']].to_csv('Nvidia_submission.csv', index=False)

print("Predictions saved to Nvidia_submission.csv")

Predictions saved to Nvidia_submission.csv


In [7]:
print(data['embeddings'].head())

KeyError: 'embeddings'

## K-cross validation

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from ast import literal_eval

data = pd.read_csv('/home/nathan/OneDrive/GitHub/Nvidia/data/Cleaned_Enhanced_training.csv')
# Convert the 'embeddings' from string representation to numerical format
data['embeddings'] = data['embeddings'].apply(literal_eval)

# Checking the first entry to ensure correct conversion
print("Sample Embedding (Post Conversion):", data['embeddings'].iloc[0])

X = data[['embeddings', 'LEN', 'DCRS', 'FKG', 'TTR', 'ARI', 'CLI', 'ASL']] # Update this line with your feature names
X = X.apply(lambda x: np.hstack(x), axis=1).tolist() # Combine embeddings with new features
X = np.array(X)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
from sklearn.preprocessing import PolynomialFeatures

y = data['difficulty_encoded']

# Define the SVM model with RBF kernel
svm_model = SVC(kernel='rbf')

# Number of folds for cross-validation
k = 5

# Perform K-fold cross-validation
cv_scores = cross_val_score(svm_model, X_scaled, y, cv=k)

cv_scores.mean(), cv_scores.std()

In [None]:

from sklearn.model_selection import GridSearchCV

# Define parameter range for grid search
param_grid = {
    'C': [0.1, 1, 10, 100],  # Example values, adjust as needed
    'gamma': [0.001, 0.01, 0.1, 1]  # Example values, adjust as needed
}

# Create a grid search object
grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=k)

# Perform grid search
grid_search.fit(X_scaled, y)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)


In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Create polynomial features
poly = PolynomialFeatures(degree=2)  # You can adjust the degree
X_poly = poly.fit_transform(X_scaled)