In [372]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('dataset.csv')


In [373]:
df

Unnamed: 0,Sr. No.,Subject,Content
0,1,Mathematics,Pythagoras' theorem states that in a right-ang...
1,2,Science,"The theory of relativity, developed by Albert ..."
2,3,History,"The Byzantine Empire, also known as the Easter..."
3,4,Geography,The Amazon River is the second-longest river i...
4,5,Literature,Pride and Prejudice is a novel written by Jane...
...,...,...,...
459,554,Literature,J.R.R. Tolkien's epic fantasy novel 'The Lord ...
460,555,Literature,Leo Tolstoy's novel 'War and Peace' is a spraw...
461,546,Literature,William Shakespeare's play 'Hamlet' is a timel...
462,547,Literature,Jane Austen's novel 'Pride and Prejudice' is a...


In [374]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Load the NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the dataset
df = pd.read_csv('dataset.csv')

# Function used to clean text data
def clean_text(text):
    # Remove special characters, punctuation, and numbers
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    cleaned_text = cleaned_text.lower()
    cleaned_text = ' '.join(word for word in cleaned_text.split() if word not in stop_words)
    return cleaned_text

# Apply cleaning function to 'Content' column
df['Cleaned_Content'] = df['Content'].apply(clean_text)

# Display the cleaned data
print(df[['Content', 'Cleaned_Content']].head())

                                             Content  \
0  Pythagoras' theorem states that in a right-ang...   
1  The theory of relativity, developed by Albert ...   
2  The Byzantine Empire, also known as the Easter...   
3  The Amazon River is the second-longest river i...   
4  Pride and Prejudice is a novel written by Jane...   

                                     Cleaned_Content  
0  pythagoras theorem states rightangled triangle...  
1  theory relativity developed albert einstein ea...  
2  byzantine empire also known eastern roman empi...  
3  amazon river secondlongest river world flowing...  
4  pride prejudice novel written jane austen sati...  


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/palvishroff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [375]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Split the dataset into features and labels where 'Cleaned_Content' is considered as features and 'Subject' as labels.
X = df['Cleaned_Content']
y = df['Subject']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the cleaned text data (The TF-IDF vectorizer is initialized to convert text data into numerical vectors.)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Initialize and train the classifier
classifier = SVC(kernel='linear')
classifier.fit(X_train_vec, y_train)

# Make predictions on the testing set
y_pred = classifier.predict(X_test_vec)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.967741935483871


In [376]:
from sklearn.model_selection import GridSearchCV

# The param_grid dictionary defines the grid of hyperparameters to search through. 
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly']
}

# Initialize the grid search with cross-validation
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')

# Perform grid search to find the best parameters
grid_search.fit(X_train_vec, y_train)

# Get the best parameters and best accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)


Best Parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best Accuracy: 0.937981981981982


In [377]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

# Split the dataset into features and labels
X = df['Cleaned_Content']
y = df['Subject']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the cleaned text data
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Initialize and train the teacher model(This model will serve as the teacher model for knowledge distillation.)
teacher_classifier = SVC(kernel='linear', probability=True)
teacher_classifier.fit(X_train_vec, y_train)

# Generate soft targets using the teacher model's predictions
soft_targets_train = teacher_classifier.predict_proba(X_train_vec)
soft_targets_test = teacher_classifier.predict_proba(X_test_vec)

# Convert soft targets to one-dimensional class labels
y_train_labels = np.argmax(soft_targets_train, axis=1)
y_test_labels = np.argmax(soft_targets_test, axis=1)

# Initialize and train the student model with knowledge distillation
student_classifier = SVC(kernel='linear')
student_classifier.fit(X_train_vec, y_train_labels)

# Make predictions on the testing set
y_pred = student_classifier.predict(X_test_vec)

# Evaluate the classifier using soft targets labels
accuracy = accuracy_score(y_test_labels, y_pred)
print("Accuracy:", accuracy)



Accuracy: 1.0


In [391]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

# Identify whether the input text represents a mathematical expression. 
def is_math_expression(input_text):
    # Use a regular expression to identify mathematical expressions
    math_pattern = r'^\s*\d+(\s*[\+\-\*\=\/]\s*\d+)*\s*$'
    return bool(re.match(math_pattern, input_text))  
    
# Split the dataset into features and labels
X = df['Cleaned_Content']
y = df['Subject']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the cleaned text data
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Initialize and train the classifier
classifier = SVC(kernel='linear', probability = True)
classifier.fit(X_train_vec, y_train)

# Make predictions on the testing set
y_pred = classifier.predict(X_test_vec)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Function to preprocess new input content
def preprocess_input(input_text):
    # Perform text cleaning (define the clean_text function)
    cleaned_text = clean_text(input_text)  
    return vectorizer.transform([cleaned_text])

# Input content to predict subject
input_content = "amazon river is very deep"

# Preprocess the input content
input_vec = preprocess_input(input_content)
if is_math_expression(input_content):
    predicted_subject = "Mathematics"
else:
    # Predict the probabilities for each class
    probabilities = classifier.predict_proba(input_vec)

# Check if any probability exceeds a threshold
    threshold = 0.3  # Adjust as needed
    if np.max(probabilities) < threshold:
        predicted_subject = "Irrelevant"
    else:
    # Predict the subject with the highest probability
        predicted_subject = classifier.classes_[np.argmax(probabilities)]

print("Predicted Subject:", predicted_subject)


Accuracy: 0.967741935483871
Predicted Subject: Geography
