In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
file_path = "NLP/newdataset.csv"  # Ensure the correct path
try:
    df = pd.read_csv(file_path)
except Exception as e:
    print("Error loading file:", e)
    exit()

# Preprocess text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [word for word in tokens if word.isalnum()]  # Remove special characters
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return " ".join(tokens)

# Apply preprocessing
df["Processed Data"] = df["Data"].astype(str).apply(clean_text)

# Convert dataset text into TF-IDF vectors
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Processed Data"])

# Function to check similarity with user input
def check_similarity(user_input):
    user_input_clean = clean_text(user_input)
    user_vector = vectorizer.transform([user_input_clean])
    similarities = cosine_similarity(user_vector, X).flatten()
    best_match_index = similarities.argmax()
    best_match_score = similarities[best_match_index]
    return df.iloc[best_match_index]["Data"], best_match_score

# Example usage
user_text = "Students must register for courses online."
matched_text, score = check_similarity(user_text)
print("Best Match:", matched_text)
print("Similarity Score:", score)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maanv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maanv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Error loading file: [Errno 2] No such file or directory: 'NLP/newdataset.csv'


NameError: name 'df' is not defined

: 