In [1]:
import re
import warnings
import numpy as np
import pandas as pd

from typing import Set, List
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from gensim.models.word2vec import Word2Vec
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

### Suppress warnings from the `ConvergenceWarning` category

In [2]:
warnings.filterwarnings('ignore', category=ConvergenceWarning)

### Load the dataset from `science.csv` into a DataFrame

In [3]:
df = pd.read_csv('data/science.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8695 entries, 0 to 8694
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Id       8695 non-null   object
 1   Comment  8695 non-null   object
 2   Topic    8695 non-null   object
dtypes: object(3)
memory usage: 203.9+ KB


### Display the first 10 rows of the DataFrame to get a quick overview of the data

In [4]:
df.head(10)

Unnamed: 0,Id,Comment,Topic
0,0x840,A few things. You might have negative- frequen...,Biology
1,0xbf0,Is it so hard to believe that there exist part...,Physics
2,0x1dfc,There are bees,Biology
3,0xc7e,I'm a medication technician. And that's alot o...,Biology
4,0xbba,Cesium is such a pretty metal.,Chemistry
5,0xb39,I meant that the question itself is unclear.,Chemistry
6,0x1f3d,Shove it up your ass and see what happens,Biology
7,0x531,"??? I mean it has some butter, but besides tha...",Chemistry
8,0xe05,https://t.me/joinchat/3gElLHLuMCxhNGI0,Biology
9,0x2148,"Well, that’s just the thing. You can’t really ...",Biology


### Count the occurrences of each unique value in the `Topic` column (labels)

In [5]:
df['Topic'].value_counts()

Topic
Biology      3591
Chemistry    2920
Physics      2184
Name: count, dtype: int64

### Define a function `preprocess_text` to clean and preprocess text data
#### Apply this function to the `Comment` column to remove punctuation, lowercase all text, and filter out stopwords

In [6]:
def preprocess_text(text: str, stop_words: Set[str]) -> str:
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.IGNORECASE).lower().strip()
    tokens = wordpunct_tokenize(text)
    filtered_tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(filtered_tokens)

stoplist = set(stopwords.words('english'))
df['Comment'] = df['Comment'].apply(lambda doc: preprocess_text(doc, stoplist))
df.head(10)

Unnamed: 0,Id,Comment,Topic
0,0x840,things might negative frequency dependent sele...,Biology
1,0xbf0,hard believe exist particulars cant detect any...,Physics
2,0x1dfc,bees,Biology
3,0xc7e,im medication technician thats alot drugs live...,Biology
4,0xbba,cesium pretty metal,Chemistry
5,0xb39,meant question unclear,Chemistry
6,0x1f3d,shove ass see happens,Biology
7,0x531,mean butter besides sugar baking soda peanuts ...,Chemistry
8,0xe05,httpstmejoinchatgellhlumcxhngi,Biology
9,0x2148,well thats thing cant really induce immune res...,Biology


### Remove rows where the `Comment` column is empty after preprocessing, and drop any rows with missing values

In [7]:
df = df[~(df['Comment'].str.strip() == '')]
df.dropna(inplace=True)
df.shape[0]

8642

### Encode the `Topic` column with numerical labels and convert the `Comment` column into a list of words

In [8]:
label_encoder = LabelEncoder()
df['Topic'] = label_encoder.fit_transform(df['Topic'])
df['Comment'] = df['Comment'].apply(lambda x: x.split())

### Split the dataset into training and testing sets with an 80-20 split for model evaluation

In [9]:
documents, labels = df['Comment'], df['Topic']
X_train, X_test, y_train, y_test = train_test_split(documents, labels, test_size=0.2, random_state=42)

### Set parameters for training a Word2Vec model and train it on the comments from the training set

In [10]:
w2v_params = {
    'vector_size': 1000,
    'window': 100,
    'min_count': 2,
    'sg': 1,
    'sample': 1e-3,
    'workers': 8,
}

w2v_model = Word2Vec(X_train, **w2v_params)

### Define a function `document_vectorizer` to convert lists of words into fixed-size vectors by averaging the Word2Vec vectors of the words in a document
#### Apply this function to both the training and testing sets

In [11]:
def document_vectorizer(corpus: List[List[str]], model: Word2Vec, num_features: int) -> np.ndarray:
    vocab = set(model.wv.index_to_key)
    
    def mean_words_vectors(words: List[str], model: Word2Vec, vocab: Set[str], num_features: int) -> np.ndarray:
        words = [word for word in words if word in vocab]
        if not words:
            return np.zeros((num_features,), dtype="float32")
        
        feature_vectors = np.array([model.wv[word] for word in words])
        feature_vector = feature_vectors.mean(axis=0)
        return feature_vector

    features = np.array([mean_words_vectors(s, model, vocab, num_features) for s in corpus])
    return features

mean_train_features = document_vectorizer(X_train, w2v_model, 1000)
mean_test_features = document_vectorizer(X_test, w2v_model, 1000)

print(f'Word2Vec model:\n\
Train features shape: {mean_train_features.shape}\n\
Test features shape: {mean_test_features.shape}')

Word2Vec model:
Train features shape: (6913, 1000)
Test features shape: (1729, 1000)


### Train a logistic regression classifier on the vectorized training data and evaluate its accuracy on the test set

In [12]:
lr_classifier = LogisticRegression()
lr_classifier.fit(mean_train_features, y_train)
lr_pred = lr_classifier.predict(mean_test_features)

print(f'Unoptimized Logistic Regression accuracy: {accuracy_score(y_test, lr_pred)}')

Unoptimized Logistic Regression accuracy: 0.626951995373048


### Train a random forest classifier on the vectorized training data and evaluate its accuracy on the test set

In [13]:
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(mean_train_features, y_train)
rf_pred = rf_classifier.predict(mean_test_features)

print(f'Unoptimized Random Forest Classifier accuracy: {accuracy_score(y_test, rf_pred)}')

Unoptimized Random Forest Classifier accuracy: 0.6674378253325621


### Perform a grid search to find the best hyperparameters for the logistic regression model based on accuracy, using cross-validation

In [14]:
param_grid = {
    'C': np.logspace(-3,3,7),
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
}

grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, n_jobs=-1, cv=5, scoring='accuracy')
grid_search.fit(mean_train_features, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters: {'C': 1000.0, 'solver': 'newton-cg'}
Best accuracy: 0.6625200778943822


### Perform a grid search to find the best hyperparameters for the random forest classifier based on accuracy, using cross-validation

In [15]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2'],
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, n_jobs=-1, cv=5, scoring='accuracy')
grid_search.fit(mean_train_features, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)

Best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
Best accuracy: 0.6726463475759508
