<a href="https://colab.research.google.com/github/monkrus/NLP-text-classification/blob/main/Query_interpretation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Import libraries
import csv
import pandas as pd
import string
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Load dataset
# Location of the labelled queries and the columns the rest of the notebook relies on.
DATA_PATH = '/content/sample_data/Data_even.csv'
REQUIRED_COLUMNS = ['query', 'intent']

# Raise instead of calling exit(): inside a notebook exit() shuts the kernel
# down (discarding all state) rather than simply stopping this cell.
try:
    data = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(f"The file '{DATA_PATH}' was not found.") from err

# Report every missing column at once so the CSV can be fixed in a single pass.
missing_columns = [column for column in REQUIRED_COLUMNS if column not in data.columns]
if missing_columns:
    raise ValueError(
        f"The expected column(s) {missing_columns} are not in the CSV file '{DATA_PATH}'."
    )

queries = data['query']
intents = data['intent']

In [14]:
# Text preprocessing
# word_tokenize and WordNetLemmatizer need NLTK data beyond the stopword list;
# fetch it here so the cell also works on a fresh runtime. Newer NLTK releases
# look for 'punkt_tab' rather than 'punkt', so both are requested.
for resource in ('punkt', 'punkt_tab', 'wordnet'):
  nltk.download(resource, quiet=True)

lemmatizer = WordNetLemmatizer()
# Use a distinct name: rebinding `stopwords` would shadow the imported corpus
# module and make this cell fail with AttributeError when it is run a second time.
stop_words = set(stopwords.words('english'))
# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)

def preprocess_text(text):
  """Normalise one query for bag-of-words vectorisation.

  The text is lower-cased, stripped of ASCII punctuation, tokenised, filtered
  against the English stop-word list and lemmatised.

  Args:
    text: The raw query. Anything that is not a ``str`` (for example a NaN
      produced by an empty CSV cell) is treated as an empty query.

  Returns:
    The remaining lemmas joined by single spaces; ``""`` when nothing is left.
  """
  if not isinstance(text, str):
    return ""

  text = text.lower().translate(punctuation_table)
  words = [
      lemmatizer.lemmatize(word)
      for word in word_tokenize(text)
      if word not in stop_words
  ]
  return " ".join(words)

# Always start from the raw column so re-running this cell is idempotent.
queries = data['query'].apply(preprocess_text)

In [15]:
# Label encoding
# Map each intent label to an integer class id; the fitted encoder is kept
# so the ids can be translated back to intent names later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y=intents)

In [16]:

# Check class distribution
class_counts = pd.Series(y).value_counts()
min_samples_required = 5

# Boolean mask marking the classes that fall below the threshold.
is_underpopulated = class_counts < min_samples_required

if is_underpopulated.any():
  # Encoded class ids of the under-represented classes, most frequent first.
  low_samples = class_counts[is_underpopulated].index.tolist()

  print("Warning: Some classes have too few samples for StratifiedKFold.")
  print(f"Minimum samples required per class: {min_samples_required}")
  print("Classes with less than", min_samples_required, "samples:")
  print(low_samples)


In [17]:
# Cross-validation splitter: five shuffled, stratified folds.
# The fixed seed keeps the fold assignment the same from run to run.
stratified_kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [18]:
# Model pipeline: bag-of-words counts feeding a logistic-regression classifier.
# The step names ('vect', 'clf') are the prefixes used to address
# hyper-parameters in the grid search.
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('clf', LogisticRegression(solver='liblinear')),
    ]
)

In [19]:
# Hyper-parameter candidates for the grid search, keyed as
# "<pipeline step>__<parameter name>".
param_grid = {}
param_grid['vect__ngram_range'] = [(1, 1), (1, 2)]  # unigrams only, or unigrams plus bigrams
param_grid['clf__C'] = [0.1, 1, 10]

In [20]:
# Exhaustive search over param_grid, scoring each candidate with the
# stratified k-fold splitter defined earlier in the notebook.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=stratified_kfold,
)


In [21]:
# Run the search: every parameter combination is cross-validated on the
# preprocessed queries against the encoded intent labels.
grid_search.fit(queries, y=y)


In [22]:
# Report the winning parameter combination and its cross-validated score.
for label, value in (
    ("Best parameters:", grid_search.best_params_),
    ("Best score:", grid_search.best_score_),
):
    print(label, value)

Best parameters: {'clf__C': 10, 'vect__ngram_range': (1, 1)}
Best score: 0.5347826086956522
