In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split


In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Optional: For better lemmatization
nltk.download('averaged_perceptron_tagger')  # Required for some tokenizers


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
file_path = '/content/output_chunk_4.csv'

df = pd.read_csv(file_path)



In [4]:
print(df.head())

  category                                               text
0     arts  lewis capaldi is a fan of the vamps. the someo...
1     arts  on april 14 2016 exactly one week before he di...
2     arts  winston marshall is leaving mumford sons. wins...
3     arts  jones entertainment group president brad jones...
4     arts  vancouver bc cnw nickel rock resources inc. th...


In [5]:
# Split into features (X) and labels (y)
X = df["text"]  # The text column
y = df["category"]  # The label column

# Split into Training (80%) and Temporary set (20%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Now, split the Temporary set into Validation (18%) and Test (2%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.1, random_state=42, stratify=y_temp
)  # 10% of the 20% temp = 2% test


In [6]:
!rm -rf /root/nltk_data
import nltk
nltk.download('all')  # This downloads all required datasets


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [7]:
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


# Text Preprocessing Function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    return " ".join(tokens)

# Apply preprocessing to the text column for train set
X_train_preprocessed = X_train.apply(preprocess_text)


# for validation set
X_val_preprocessed = X_val.apply(preprocess_text)

# for test set
#test_df.loc[:, "clean_text"] = test_df["text"].apply(preprocess_text)

# Show sample results
#print(train_df[["category", "text", "clean_text"]].head())



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features
#tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), sublinear_tf=True)

# Fit and transform on the training data
tfidf_train = tfidf_vectorizer.fit_transform(X_train_preprocessed)

tfidf_val = tfidf_vectorizer.transform(X_val_preprocessed)

joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")



In [None]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Train and evaluate SVM
svm = LinearSVC(random_state=42)
svm.fit(tfidf_train, y_train)  # Use tfidf_train for training
y_pred_svm = svm.predict(tfidf_val)  # Use tfidf_val for prediction
acc_svm = accuracy_score(y_val, y_pred_svm)  # Compare with y_val
print(f'SVM accuracy: {acc_svm:.2f}')

# Train and evaluate MNB
mnb = MultinomialNB()
mnb.fit(tfidf_train, y_train)  # Use tfidf_train for training
y_pred_mnb = mnb.predict(tfidf_val)  # Use tfidf_val for prediction
acc_mnb = accuracy_score(y_val, y_pred_mnb)  # Compare with y_val
print(f'MNB accuracy: {acc_mnb:.2f}')

# Train and evaluate Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(tfidf_train, y_train)  # Use tfidf_train for training
y_pred_rf = rf.predict(tfidf_val)  # Use tfidf_val for prediction
acc_rf = accuracy_score(y_val, y_pred_rf)  # Compare with y_val
print(f'Random Forest accuracy: {acc_rf:.2f}')

# Train and evaluate KNN
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(tfidf_train, y_train)  # Use tfidf_train for training
y_pred_knn = knn.predict(tfidf_val)  # Use tfidf_val for prediction
acc_knn = accuracy_score(y_val, y_pred_knn)  # Compare with y_val
print(f'KNN accuracy: {acc_knn:.2f}')


SVM accuracy: 0.81
MNB accuracy: 0.68
Random Forest accuracy: 0.77
KNN accuracy: 0.66


In [None]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import joblib  # Added for saving models

# Train and evaluate SVM
svm = LinearSVC(random_state=42)
svm.fit(tfidf_train, y_train)
y_pred_svm = svm.predict(tfidf_val)
print("SVM Classification Report:\n", classification_report(y_val, y_pred_svm))  # Changed line
joblib.dump(svm, "svm_model.pkl")  # Changed line

# Train and evaluate MNB
mnb = MultinomialNB()
mnb.fit(tfidf_train, y_train)
y_pred_mnb = mnb.predict(tfidf_val)
print("MNB Classification Report:\n", classification_report(y_val, y_pred_mnb))  # Changed line
joblib.dump(mnb, "mnb_model.pkl")  # Changed line

# Train and evaluate Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(tfidf_train, y_train)
y_pred_rf = rf.predict(tfidf_val)
print("Random Forest Classification Report:\n", classification_report(y_val, y_pred_rf))  # Changed line
joblib.dump(rf, "rf_model.pkl")  # Changed line

# Train and evaluate KNN
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(tfidf_train, y_train)
y_pred_knn = knn.predict(tfidf_val)
print("KNN Classification Report:\n", classification_report(y_val, y_pred_knn))  # Changed line
joblib.dump(knn, "knn_model.pkl")  # Changed line


SVM Classification Report:
                precision    recall  f1-score   support

         arts       0.79      0.86      0.83        72
        crime       0.81      0.89      0.85        72
     disaster       0.78      0.86      0.82        72
      economy       0.76      0.79      0.78        72
    education       0.93      0.93      0.93        72
environmental       0.94      0.90      0.92        72
       health       0.87      0.90      0.88        72
humanInterest       0.77      0.78      0.77        72
       labour       0.85      0.83      0.84        72
    lifestyle       0.79      0.85      0.82        72
        other       0.30      0.18      0.22        72
     politics       0.96      0.89      0.92        72
     religion       0.68      0.56      0.61        72
      science       0.84      0.92      0.87        72
       social       0.92      0.79      0.85        72
        sport       0.78      0.92      0.84        72
       unrest       0.77      0.79  

['knn_model.pkl']

In [None]:
pip install langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/981.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m645.1/981.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=6d8e6ed3d0cfca840e9d952a88cc3480003f1dd5e6ab3e442f52549b0209186f
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3

In [None]:
print(df.head())

  category                                               text
0     arts  lewis capaldi is a fan of the vamps. the someo...
1     arts  on april 14 2016 exactly one week before he di...
2     arts  winston marshall is leaving mumford sons. wins...
3     arts  jones entertainment group president brad jones...
4     arts  vancouver bc cnw nickel rock resources inc. th...


In [None]:
from langdetect import detect
import pandas as pd


# Function to detect non-English text
def is_non_english(text):
    try:
        return detect(text) != 'en'  # Check if detected language is not English
    except:
        return True  # If detection fails, assume non-English

# Apply function to check non-English rows
df['is_non_english'] = df['text'].apply(is_non_english)

# Filter out non-English rows
non_english_df = df[df['is_non_english']]

print(non_english_df)


           category                                               text  \
1418        economy  karachi june 07 2021 ppi ot . name of the fund...   
2973  humanInterest  bc eu britain diana leg 1103 . diana legacy li...   

      is_non_english  
1418            True  
2973            True  


In [None]:
from sklearn.svm import SVC

svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_rbf.fit(tfidf_train, y_train)
y_pred_svm_rbf = svm_rbf.predict(tfidf_val)
acc_svm_rbf = accuracy_score(y_val, y_pred_svm_rbf)
print(f'SVM (RBF kernel) accuracy: {acc_svm_rbf:.2f}')


SVM (RBF kernel) accuracy: 0.77


In [None]:
svm_poly = SVC(kernel='poly', degree=2, C=1.0, random_state=42)
svm_poly.fit(tfidf_train, y_train)
y_pred_svm_poly = svm_poly.predict(tfidf_val)
acc_svm_poly = accuracy_score(y_val, y_pred_svm_poly)
print(f'SVM (Polynomial kernel) accuracy: {acc_svm_poly:.2f}')


SVM (Polynomial kernel) accuracy: 0.71


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.01, 0.1],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(tfidf_train, y_train)

print("Best parameters:", grid_search.best_params_)
best_svm = grid_search.best_estimator_

y_pred_best_svm = best_svm.predict(tfidf_val)
acc_best_svm = accuracy_score(y_val, y_pred_best_svm)
print(f'Best SVM accuracy: {acc_best_svm:.2f}')


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best SVM accuracy: 0.78
