In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Tokens to discard: English stop words plus ASCII punctuation characters.
# Built lazily and cached so the NLTK stop-word list is loaded once rather
# than once per row when the function is used with DataFrame.apply.
_EXCLUDED_TOKENS = None


def _get_excluded_tokens():
    """Return the cached stop-word/punctuation set, building it on first use."""
    global _EXCLUDED_TOKENS
    if _EXCLUDED_TOKENS is None:
        _EXCLUDED_TOKENS = set(stopwords.words('english')) | set(string.punctuation)
    return _EXCLUDED_TOKENS


def preprocess_text(text):
    """Lower-case, tokenize, and strip stop words and punctuation from ``text``.

    Parameters
    ----------
    text : str or scalar
        The raw text of one row. Missing values (``None``, ``NaN``, ``pd.NA``),
        which ``pd.read_excel`` produces for empty cells, are treated as empty
        text. Any other non-string scalar is converted with ``str()``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` when the input is
        missing, blank, or consists only of stop words and punctuation.
    """
    if not isinstance(text, str):
        # Empty spreadsheet cells arrive as NaN (a float), which has no .lower().
        if pd.isna(text):
            return ''
        text = str(text)

    # Blank input tokenizes to nothing; return early without touching NLTK.
    if not text.strip():
        return ''

    excluded = _get_excluded_tokens()
    kept_tokens = [token for token in word_tokenize(text.lower()) if token not in excluded]
    return ' '.join(kept_tokens)


# Source workbook and destination for the cleaned copy.
input_file_path = r'C:\Users\HP\Desktop\train_data.xlsx'
output_file_path = r'C:\Users\HP\Desktop\preprocessed_data.xlsx'

df = pd.read_excel(input_file_path)

# Replace each row's raw text with its cleaned form.
df['text'] = df['text'].apply(preprocess_text)

# Number the rows 1..N. The range is sized from the frame itself: a fixed
# range(1, 11) raises ValueError (length mismatch) whenever the workbook does
# not contain exactly 10 rows.
df.index = range(1, len(df) + 1)

# index=True writes the 1-based row number as the first column of the sheet.
df.to_excel(output_file_path, index=True)

print("Text preprocessing completed and saved to the new Excel file:", output_file_path)


Text preprocessing completed and saved to the new Excel file: C:\Users\HP\Desktop\preprocessed_data.xlsx


In [2]:
# Load the cleaned workbook back in and preview the first rows.
resume_data = pd.read_excel(
    r"C:\Users\HP\Desktop\preprocessed_data.xlsx"
)
resume_data.head()

Unnamed: 0.1,Unnamed: 0,text,index
0,1,nice useful training,1
1,2,good chance,2
2,3,trained municipality hebron,3
3,4,trained one department hebron municipality,4
4,5,training session learned effective communicati...,5


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

import warnings

# Column of `resume_data` used as the classification target.
# NOTE(review): "index" appears to hold a per-row number rather than a class
# label. If so, every row is its own class, no test label is ever seen during
# training, and test accuracy is necessarily 0. Point this at the real label
# column once it is available.
LABEL_COLUMN = "index"

# Cells that were empty after cleaning may be read back from Excel as NaN,
# which TfidfVectorizer rejects; treat them as empty documents instead.
text = resume_data["text"].fillna("").astype(str).values
labels = resume_data[LABEL_COLUMN].values

if resume_data[LABEL_COLUMN].is_unique:
    warnings.warn(
        "Target column {!r} has a distinct value on every row; a classifier "
        "trained on it cannot generalize to the test split.".format(LABEL_COLUMN)
    )

# Splitting the data into training and testing sets (75% / 25%, fixed seed).
text_train, text_test, y_train, y_test = train_test_split(text, labels, random_state=0, test_size=0.25)

# TF-IDF Vectorization
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,  # Reduce the impact of high word frequencies
    use_idf=True,       # Give more importance to rare words in documents
    stop_words='english',   # Remove words that don't carry much meaning
    max_features=1000    # Maximum number of features (words) used in the matrix
)

# Learn the vocabulary and IDF weights from the training documents only, then
# reuse them for the test documents so no test information leaks into the fit.
X_train = word_vectorizer.fit_transform(text_train)
X_test = word_vectorizer.transform(text_test)


In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, classification_report, precision_score, recall_score

# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix.
model_NB = MultinomialNB()
model_NB.fit(X_train, y_train)

# Predicted labels for the held-out documents.
prediction_mnb = model_NB.predict(X_test)

# Report accuracy on the training split, then on the test split.
train_accuracy = model_NB.score(X_train, y_train)
print('Accuracy of MultinomialNB Classifier on training set: {:.2f}'.format(train_accuracy))
test_accuracy = model_NB.score(X_test, y_test)
print('Accuracy of MultinomialNB Classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of MultinomialNB Classifier on training set: 1.00
Accuracy of MultinomialNB Classifier on test set: 0.00
