In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report



In [2]:
# List files to verify upload
print(os.listdir('data_filled'))

['emailsAcademic.xlsx', 'emailsFAQ.xlsx', 'emailsWorkPermit.xlsx', 'FAQs_scraped.xlsx']


In [3]:
def load_data(folder_path='data_filled'):
    all_data = []
    for file in os.listdir(folder_path):
        if file.endswith(('.xlsx', '.csv')):
            file_path = os.path.join(folder_path, file)
            df = pd.read_excel(file_path) if file.endswith('.xlsx') else pd.read_csv(file_path)
            all_data.append(df)
    return pd.concat(all_data, ignore_index=True)

# Clean text: lowercase, remove punctuation/numbers
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Load and clean
combined_df = load_data()
combined_df['Cleaned_Content'] = combined_df['Content'].apply(clean_text)
combined_df = combined_df.dropna(subset=['Category'])  # Remove rows with missing labels

# Preview
print(combined_df[['Content', 'Cleaned_Content', 'Category']].head())

                                             Content  \
0  I need to plan my schedule for next semester. ...   
1  I want to drop a course this semester. Could y...   
2  I would like to confirm my enrolled courses. W...   
3  I need to check how many credits I have comple...   
4  I want to see my class schedule for this semes...   

                                     Cleaned_Content  Category  
0  i need to plan my schedule for next semester c...  Academic  
1  i want to drop a course this semester could yo...  Academic  
2  i would like to confirm my enrolled courses wh...  Academic  
3  i need to check how many credits i have comple...  Academic  
4  i want to see my class schedule for this semes...  Academic  


In [4]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(combined_df['Cleaned_Content'])
y = combined_df['Category']

# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

Training samples: 91, Test samples: 23


In [5]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = nb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6521739130434783
              precision    recall  f1-score   support

    Academic       0.58      0.70      0.64        10
         FAQ       0.70      0.70      0.70        10
 work permit       1.00      0.33      0.50         3

    accuracy                           0.65        23
   macro avg       0.76      0.58      0.61        23
weighted avg       0.69      0.65      0.65        23



In [6]:
# Add this after your classification report
print("Class Distribution:")
print(y.value_counts(normalize=True))

Class Distribution:
Category
Academic       0.412281
FAQ            0.412281
work permit    0.175439
Name: proportion, dtype: float64


In [7]:
!pip install imbalanced-learn



In [8]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)  # Balances classes

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (c:\Users\USER\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)