In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Location of the Sentiment140 training CSV on the Kaggle input mount.
file_path = '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv'

# Read the file with latin1 decoding, keeping only columns 0 and 5 and
# naming them 'target' and 'text'. Because `names` is supplied, pandas does
# not treat the first row as a header.
df = pd.read_csv(
    file_path,
    encoding='latin1',
    usecols=[0, 5],
    names=['target', 'text'],
)

# Hold out 10% of the rows for testing; the fixed random_state keeps the
# shuffle (and therefore the split) repeatable between runs.
texts, targets = df['text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    targets,
    test_size=0.1,
    random_state=42,
)

# Stage 1: TF-IDF features over unigrams and bigrams, with no cap on the
# vocabulary size.
tfidf_step = TfidfVectorizer(max_features=None, ngram_range=(1, 2))

# Stage 2: L2-penalised logistic regression (C=1) fitted with liblinear.
clf_step = LogisticRegression(C=1, penalty='l2', solver='liblinear')

# Chain both stages so raw text goes in and class predictions come out.
pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('clf', clf_step),
])

# Fit the vectorizer and the classifier on the training portion. After this
# call `pipeline` is ready for predict()/evaluation on X_test.
pipeline.fit(X_train, y_train)


In [2]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Class codes that appear in the 'target' column: 0 is treated as the
# negative class and 4 as the positive class. CLASS_LABELS fixes the
# row/column order of the confusion matrix as [negative, positive].
NEGATIVE_CLASS, POSITIVE_CLASS = 0, 4
CLASS_LABELS = [NEGATIVE_CLASS, POSITIVE_CLASS]

# Evaluate the fitted pipeline on the held-out test set
predictions_test = pipeline.predict(X_test)
print('Classification report:\n', classification_report(y_test, predictions_test))
print('Accuracy score:', accuracy_score(y_test, predictions_test))

# Compute the confusion matrix (rows = actual class, columns = predicted class).
# Passing labels= explicitly pins the result to a 2x2 matrix in
# [negative, positive] order, so the cell names below are guaranteed to line
# up with the counts even if one class is absent from y_test or the
# predictions (without it, the shape and order are inferred from the data).
confusion = confusion_matrix(y_test, predictions_test, labels=CLASS_LABELS)

# Human-readable name for each cell of the 2x2 confusion matrix
labels = np.array([['True Negative', 'False Positive'], ['False Negative', 'True Positive']])

# "name\ncount" string for every cell. Not used further in this cell;
# presumably intended as annotations for a plot -- kept for later cells.
confusion_labelled = np.empty(confusion.shape, dtype=object)
for cell in np.ndindex(confusion.shape):
    confusion_labelled[cell] = f'{labels[cell]}\n{confusion[cell]}'

# Print the text version of the confusion matrix, one matrix row per line
print("\nText Version of the Confusion Matrix:")
for name_row, count_row in zip(labels, confusion):
    print(' | '.join(f'{name}: {count}' for name, count in zip(name_row, count_row)))


Classification report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82     79812
           4       0.82      0.83      0.82     80188

    accuracy                           0.82    160000
   macro avg       0.82      0.82      0.82    160000
weighted avg       0.82      0.82      0.82    160000

Accuracy score: 0.8241875

Text Version of the Confusion Matrix:
True Negative: 65695 | False Positive: 14117
False Negative: 14013 | True Positive: 66175
