<a href="https://colab.research.google.com/github/page-jerzak/ai_computing/blob/main/Mod15Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Step 1: Read the labelled training set and the unlabelled test set.
# Both paths are relative, so the CSVs must sit in the working directory.
training_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Step 2: Separate the raw comment text (features) from the sentiment labels.
# The text is left as-is here; TF-IDF vectorization is applied inside the
# model pipelines.
X, y = training_data['StudentComments'], training_data['Sentiment']

# Hold out 20% of the labelled rows for validation. The fixed seed keeps the
# split identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate classifiers, keyed by the display name used in the metrics table.
models = {
    "Support Vector Machine": SVC(),
    "Naive Bayes": MultinomialNB(),
    # The tree permutes features randomly at each split, so it is seeded
    # (with the same seed as the train/validation split) to keep the
    # comparison table and the accuracy-based model choice reproducible.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    # max_iter is raised above the library default to give the solver more
    # iterations to converge.
    "Logistic Regression": LogisticRegression(max_iter=1000),
}

# Vectorizer: TF-IDF over the comment text, dropping English stop words and
# keeping at most 5000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# One dict of validation metrics per candidate model.
metrics = []

for model_name, model in models.items():
    # Chain TF-IDF and the classifier so raw comment text can be fed directly.
    pipeline = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', model)])

    # Fit on the training split, then predict the held-out validation split.
    y_pred = pipeline.fit(X_train, y_train).predict(X_val)

    # zero_division=0 scores an undefined precision/recall as 0 rather than
    # emitting a warning.
    report = classification_report(y_val, y_pred, output_dict=True, zero_division=0)
    weighted_avg = report['weighted avg']

    accuracy = accuracy_score(y_val, y_pred)
    precision = weighted_avg['precision']
    # NOTE: support-weighted recall is mathematically the same number as
    # accuracy, so the 'Recall' column mirrors 'Accuracy'.
    recall = weighted_avg['recall']

    metrics.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
    })

# Tabulate the per-model results for side-by-side comparison.
metrics_df = pd.DataFrame(metrics)
print(metrics_df)

# Step 3: Pick the model with the highest validation accuracy.
# idxmax returns the first row label when several rows tie.
best_row_label = metrics_df['Accuracy'].idxmax()
best_model_row = metrics_df.loc[best_row_label]
best_model_name = best_model_row['Model']
print(f"\nBest Model: {best_model_name} with accuracy {best_model_row['Accuracy']}")

# Refit the winner on every labelled row (training + validation) before it is
# used on the test set. The vectorizer and estimator objects are the same
# instances used during evaluation, so this fit overwrites their earlier
# fitted state.
best_model_pipeline = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('classifier', models[best_model_name]),
])
best_model_pipeline.fit(X, y)

# Step 4: Run the refitted pipeline over the raw test comments.
test_comments = test_data['StudentComments']
test_predictions = best_model_pipeline.predict(test_comments)

# Step 5: Pair each test ID with its predicted sentiment and write the result
# to disk without the DataFrame index column.
output = test_data[['ID']].copy()
output['Sentiment'] = test_predictions
output.to_csv('predicted_sentiments.csv', index=False)
print("\nPredictions saved to 'predicted_sentiments.csv'")


                    Model  Accuracy  Precision    Recall
0  Support Vector Machine  0.894587   0.896721  0.894587
1             Naive Bayes  0.891738   0.893866  0.891738
2           Decision Tree  0.749288   0.749205  0.749288
3     Logistic Regression  0.888889   0.890139  0.888889

Best Model: Support Vector Machine with accuracy 0.8945868945868946

Predictions saved to 'predicted_sentiments.csv'
