In [6]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from scipy.sparse import csr_matrix
import nltk
# Fetch the NLTK 'punkt' and 'stopwords' packages, one download call each.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omkar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omkar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omkar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Load the dataset. The CSV has no header row, so column names are supplied
# here; only the two columns used downstream are actually parsed.
data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
    usecols=['target', 'text'],
)

# Keep only relevant columns, in (text, target) order, and map target labels:
# 4 -> 1 (positive), 0 -> 0 (negative). Building a new frame with .assign
# avoids assigning into a column-sliced frame (SettingWithCopyWarning).
data = data[['text', 'target']].assign(
    target=lambda frame: frame['target'].map({4: 1, 0: 0})
)

# Any label other than 0 or 4 is mapped to NaN; fail here with a clear message
# instead of letting NaN targets reach model fitting.
if data['target'].isna().any():
    raise ValueError('Unexpected target labels: only 0 and 4 are supported')

# Reduce dataset size for faster execution (fixed seed keeps the sample repeatable)
data_sampled = data.sample(n=100000, random_state=42)

In [8]:
# Optimized text preprocessing function
def preprocess_text(text):
    """Clean one text string for bag-of-words features.

    Steps, in order:
      1. delete every run that begins at ``http`` or ``www`` and extends to
         the next whitespace (URLs);
      2. delete every character that is not an ASCII letter or whitespace;
      3. lowercase the result;
      4. drop words that appear in ``ENGLISH_STOP_WORDS``.

    Returns the surviving words joined by single spaces (an empty string
    when nothing survives).
    """
    without_urls = re.sub(r'http\S+|www\S+', '', text)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls)
    tokens = letters_only.lower().split()
    # Filtering runs after lowercasing, so the stop-word lookup sees lowercase words only.
    kept = (token for token in tokens if token not in ENGLISH_STOP_WORDS)
    return ' '.join(kept)

# Clean the text of every sampled row, then store it back in the 'text' column
cleaned_text = data_sampled['text'].apply(preprocess_text)
data_sampled['text'] = cleaned_text


In [9]:
# Hold out 20% of the sampled rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data_sampled['text'],
    data_sampled['target'],
    test_size=0.2,
    random_state=42,
)

<pre>
Feature extraction:
Bag of Words (BoW)
Represents each text as a vector of word counts over a fixed vocabulary.
Ignores word order and context.

</pre>

<pre>
Classification Approach: Logistic Regression
</pre>

In [10]:
# Initialize Bag of Words (BoW) Vectorizer, capped at the 3000 most frequent terms
bow_vectorizer = CountVectorizer(max_features=3000)

# Learn the vocabulary from the training split only, then encode both splits with it
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# CountVectorizer already returns SciPy CSR sparse matrices, so no extra
# csr_matrix() conversion is needed before fitting.

# Logistic Regression with the 'saga' solver. saga shuffles the data while
# optimizing, so random_state is pinned to make the fitted model reproducible.
model_bow = LogisticRegression(solver='saga', max_iter=100, random_state=42)

# Train the model with BoW features
model_bow.fit(X_train_bow, y_train)




In [11]:
# Predict labels for the held-out split with the BoW model
y_pred_bow = model_bow.predict(X_test_bow)

# Compute the three summaries up front ...
accuracy_bow = accuracy_score(y_test, y_pred_bow)
report_bow = classification_report(y_test, y_pred_bow)
conf_matrix_bow = confusion_matrix(y_test, y_pred_bow)

# ... then print them: accuracy, per-class report, confusion matrix
print(f'Accuracy (BoW): {accuracy_bow:.4f}')
print('Classification Report (BoW):')
print(report_bow)
print('Confusion Matrix (BoW):')
print(conf_matrix_bow)



Accuracy (BoW): 0.7470
Classification Report (BoW):
              precision    recall  f1-score   support

           0       0.76      0.71      0.74      9995
           1       0.73      0.78      0.76     10005

    accuracy                           0.75     20000
   macro avg       0.75      0.75      0.75     20000
weighted avg       0.75      0.75      0.75     20000

Confusion Matrix (BoW):
[[7125 2870]
 [2191 7814]]
