In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay)
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt

In [6]:
#loading training data
train=pd.read_csv('./database/training.csv')
test=pd.read_csv('./database/test.csv')
validate=pd.read_csv('./database/validation.csv')

In [7]:
# test data validity 
print('Dataset information:')
print(f'Training data: {train.shape}')
print(f'Validation data: {validate.shape}')
print(f'Test data: {test.shape}')

Dataset information:
Training data: (16000, 2)
Validation data: (2000, 2)
Test data: (2000, 2)


In [9]:
x=train['text']
y=train['label']

# Preprocessing

In [13]:
tfidf= TfidfVectorizer(stop_words='english', max_features=5000)
x_vectorized=tfidf.fit_transform(x)
x_vectorized

<16000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 118635 stored elements in Compressed Sparse Row format>

In [22]:
x_train, x_temp, y_train, y_temp= train_test_split(x_vectorized, y, test_size=0.3, random_state=42, stratify=y)
x_val, x_test, y_val, y_test= train_test_split(x_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
print(f'x_train: {x_train.shape}')
print(f'x_temp: {x_temp.shape}')
print(f'x_val: {x_val.shape}')
print(f'x_test: {x_test.shape}')
print(f'y_train: {y_train.shape}')
print(f'y_temp: {y_temp.shape}')
print(f'y_val: {y_val.shape}')
print(f'y_test: {y_test.shape}')

x_train: (11200, 5000)
x_temp: (4800, 5000)
x_val: (2400, 5000)
x_test: (2400, 5000)
y_train: (11200,)
y_temp: (4800,)
y_val: (2400,)
y_test: (2400,)


In [19]:
class_weights= compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict= {i: class_weights[i] for i in range(len(class_weights))}
class_weight_dict

{0: 0.5715452133088386,
 1: 0.4972473805718345,
 2: 2.0445418035779483,
 3: 1.2353849547760865,
 4: 1.376597836774828,
 5: 4.666666666666667}

# Model Implementation

In [23]:
log_reg= LogisticRegression(class_weight=class_weight_dict, max_iter=1000, random_state=42)
log_reg.fit(x_train, y_train)