In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import re

# Step 1: Load and preprocess data
# Note: Adjust the file path as necessary
print("Step 1: load data")
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
# Column names are supplied via names=, so every row of the file is read as data.
# usecols restricts parsing to the two columns used below, so the four unused
# columns are never materialised in memory.
df = pd.read_csv(
    'data/sentiment140.csv',
    encoding='latin-1',
    names=column_names,
    usecols=['text', 'target'],
)
# usecols keeps the file's column order; re-select to get (text, target).
df = df[['text', 'target']]

# Numeric label -> human-readable class name.
label_names = {0: 'negative', 2: 'neutral', 4: 'positive'}
# Series.map turns any value missing from the mapping into NaN; fail here with
# a clear message instead of letting NaN labels reach model fitting.
unknown_labels = set(df['target'].unique()) - set(label_names)
if unknown_labels:
    raise ValueError(
        f"Unexpected values in 'target' column: {unknown_labels!r}; "
        f"expected only {sorted(label_names)}"
    )
df['target'] = df['target'].map(label_names)

# Compiled once at import time: preprocess_text is applied to every row of the
# dataset, so the patterns are hoisted out of the per-call path.
_URL_RE = re.compile(r'http\S+|www\S+')      # "https..." is already covered by "http\S+"
_MENTION_OR_HASH_RE = re.compile(r'@\w+|#')  # whole @mention, or just the '#' of a hashtag
_NON_WORD_RE = re.compile(r'[^\w\s]')        # anything that is neither word char nor whitespace


def preprocess_text(text):
    """Strip URLs, @mentions, '#' signs and punctuation from a piece of text.

    Steps, in order:
      1. remove runs starting with ``http`` or ``www`` up to the next whitespace;
      2. remove ``@name`` mentions entirely and the ``#`` of hashtags (the tag
         word itself is kept);
      3. remove every remaining character that is not a word character
         (letters, digits, underscore) or whitespace.

    Whitespace is left untouched, so removed tokens may leave consecutive spaces.

    Args:
        text: The raw text. ``None`` and NaN are treated as an empty document;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        # pandas represents missing cells as None / float NaN (NaN != NaN).
        if text is None or text != text:
            return ''
        text = str(text)

    text = _URL_RE.sub('', text)
    text = _MENTION_OR_HASH_RE.sub('', text)
    return _NON_WORD_RE.sub('', text)

# Clean every document once, before splitting, so train and test share the
# same preprocessing.
df['text'] = df['text'].apply(preprocess_text)

# Step 2: Split the data
print("Step 2: split data")
# 80/20 split; random_state fixes which rows land in each part.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

# Step 3: Feature Extraction and Kernel Approximation
print("Step 3: feature extraction")
tfidf = TfidfVectorizer(max_features=10000)  # You can adjust max_features according to your needs
# NOTE(review): the run recorded with this notebook scored ~0.56 accuracy on
# two classes. n_components is left at the library default here; it and gamma
# are the first knobs to tune - confirm against a fresh run.
rbf_feature = RBFSampler(gamma=1, random_state=1)

# Step 4: Model Training
print("Step 4: model training")
# random_state makes the SGD run repeatable, in line with the seeded split and
# the seeded RBFSampler above; without it the reported metrics vary per run.
sgd = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)

# Steps run in order: text -> TF-IDF -> random Fourier features -> scaling -> linear model.
pipeline = Pipeline([
    ('tfidf', tfidf),
    ('rbf_feature', rbf_feature),
    ('scaler', StandardScaler(with_mean=False)),  # StandardScaler to scale RBF features
    ('sgd', sgd),
])

# All fitting (vectoriser, sampler, scaler, classifier) happens here, on the
# training split only.
pipeline.fit(X_train, y_train)

# Step 5: Evaluation
print("Step 5: eval")
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


starting
Accuracy: 0.557090625
              precision    recall  f1-score   support

    negative       0.55      0.63      0.59    159494
    positive       0.57      0.48      0.52    160506

    accuracy                           0.56    320000
   macro avg       0.56      0.56      0.55    320000
weighted avg       0.56      0.56      0.55    320000

