In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Load the data
# `names` is supplied (and no `header`), so the first line of the file is read
# as data and the six columns get the labels below.
col_names = ['sentiment', 'id', 'date', 'query', 'user', 'text']
DATA_PATH = '/content/training.1600000.processed.noemoticon.csv'

# Only `text` and `sentiment` are kept below, so ask the parser to skip the
# other four columns instead of loading them and throwing them away.
df = pd.read_csv(
    DATA_PATH,
    encoding='latin-1',
    names=col_names,
    usecols=['text', 'sentiment'],
)

# Pre-process the data
# Fix the column order to (text, sentiment).
df = df[['text', 'sentiment']]


In [27]:
# First carve off 40% of the rows as a temporary hold-out, then cut that
# hold-out in half to obtain the validation and test sets. Both calls use the
# same fixed seed so the partition is repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    df['text'],
    df['sentiment'],
    test_size=0.4,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [28]:
# Fit the TF-IDF vectorizer (limited to 5000 features) on the training text
# only, then apply that already-fitted transform to the validation and test
# text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec, X_test_vec = (
    vectorizer.transform(held_out) for held_out in (X_val, X_test)
)


In [29]:
# Candidate settings for the search. `max_iter` has a single value, so only
# `C` is actually varied across candidates.
param_grid = {
    'C': [0.1, 1, 5],
    'max_iter': [2000],
}
model = LogisticRegression(random_state=42)

# 3-fold cross-validated grid search on the vectorized training data,
# scoring each candidate by accuracy.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
)
grid.fit(X_train_vec, y_train)

In [31]:
# Keep a handle on the winning estimator from the grid search and report the
# parameter combination that produced it.
best_model = grid.best_estimator_
print(f"Best parameters: {grid.best_params_}")

Best parameters: {'C': 1, 'max_iter': 2000}


In [32]:
# Predict labels for the vectorized test split with the selected model and
# print the per-class precision/recall/F1 report against the true labels.
y_pred = best_model.predict(X_test_vec)
test_report = classification_report(y_true=y_test, y_pred=y_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.80      0.78      0.79    159790
           4       0.79      0.80      0.79    160210

    accuracy                           0.79    320000
   macro avg       0.79      0.79      0.79    320000
weighted avg       0.79      0.79      0.79    320000

