In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score

# Part 1: Load and preprocess the data
data = pd.read_csv("data/Corona_NLP_train.csv", encoding='latin1')
# Neutral tweets are dropped so that only two polarities remain.
data = data[data['Sentiment'] != 'Neutral']

# Collapse the four remaining sentiment labels into a binary target:
# 1 for any positive label, 0 for any negative label.
sentiment_to_label = {
    'Positive': 1,
    'Extremely Positive': 1,
    'Negative': 0,
    'Extremely Negative': 0,
}
data['Sentiment'] = data['Sentiment'].map(sentiment_to_label)

# Raw tweet text is the input, the binary sentiment is the target.
X = data['OriginalTweet'].values
Y = data['Sentiment'].values

# Sequential (unshuffled) split: first 60% train, next 15% test,
# remaining rows (about 25%) validation.
n_samples = len(X)
train_size = int(0.6 * n_samples)
test_size = int(0.15 * n_samples)
valid_size = n_samples - train_size - test_size

train_rows = slice(None, train_size)
test_rows = slice(train_size, train_size + test_size)
valid_rows = slice(train_size + test_size, None)

X_train, Y_train = X[train_rows], Y[train_rows]
X_test, Y_test = X[test_rows], Y[test_rows]
X_valid, Y_valid = X[valid_rows], Y[valid_rows]

# Part 2: Create and train the pipeline model
# Bag-of-words counts feed a logistic regression classifier.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000)),
]
model = Pipeline(pipeline_steps)

model.fit(X_train, Y_train)

# Part 3: Evaluate precision and recall
# Each metric is computed twice on the test split, treating class 0 and
# then class 1 as the "positive" label.
Y_pred_test = model.predict(X_test)
precision_0, precision_1 = [
    precision_score(Y_test, Y_pred_test, pos_label=label) for label in (0, 1)
]
recall_0, recall_1 = [
    recall_score(Y_test, Y_pred_test, pos_label=label) for label in (0, 1)
]

# Part 4: Define cost function
def cost(model, threshold, X, Y, fp_cost=5, fn_cost=1):
    """Return the mean misclassification cost of ``model`` at ``threshold``.

    A sample is predicted as class 1 when the second column of
    ``model.predict_proba(X)`` is at least ``threshold``; otherwise it is
    predicted as class 0. Correct predictions cost nothing, a false
    negative costs ``fn_cost`` and a false positive costs ``fp_cost``.

    Args:
        model: Object exposing ``predict_proba(X)`` that returns a 2-D array
            with at least two columns.
        threshold: Decision threshold applied to the second probability
            column.
        X: Inputs forwarded unchanged to ``model.predict_proba``.
        Y: Binary (0/1) true labels, one per row of ``X``. Any array-like
            is accepted, including plain lists.
        fp_cost: Cost of predicting 1 when the true label is 0.
        fn_cost: Cost of predicting 0 when the true label is 1.

    Returns:
        The average per-sample cost over ``X``.
    """
    # Coerce once so that ``1 - labels`` also works for plain Python lists.
    labels = np.asarray(Y)
    probs = model.predict_proba(X)[:, 1]
    predictions = (probs >= threshold).astype(int)
    false_negatives = (1 - predictions) * labels
    false_positives = predictions * (1 - labels)
    return np.mean(fn_cost * false_negatives + fp_cost * false_positives)

# Part 5: Find optimal threshold
# Only the test split is used to pick the threshold; the validation split is
# first touched in Part 6, so the intervals below are computed on data that
# played no role in the selection.
thresholds = np.linspace(0, 1, 101)
costs = [cost(model, t, X_test, Y_test) for t in thresholds]
best_index = int(np.argmin(costs))  # first minimiser when several tie
optimal_threshold = thresholds[best_index]
cost_at_optimal_threshold = costs[best_index]

# Part 6: Evaluate cost and confidence interval on validation set
cost_at_optimal_threshold_valid = cost(model, optimal_threshold, X_valid, Y_valid)
n_valid = len(X_valid)
alpha = 0.01  # 1 - confidence level, i.e. 99% intervals
# A single sample costs 0 (correct), 1 (false negative) or 5 (false
# positive) -- see the per-sample formula in Part 7 -- so the cost lives in
# an interval of width 5, not 1.
cost_range = 5.0
# Two-sided Hoeffding bound for i.i.d. variables with range R:
#   P(|mean - E| >= eps) <= 2 * exp(-2 * n * eps**2 / R**2)
# Setting the right-hand side to alpha gives
#   eps = R * sqrt(log(2 / alpha) / (2 * n)).
epsilon = cost_range * np.sqrt(np.log(2 / alpha) / (2 * n_valid))
cost_interval_valid = (
    cost_at_optimal_threshold_valid - epsilon,
    cost_at_optimal_threshold_valid + epsilon
)

# Part 7: Compute empirical variance and confidence interval using Bennett's inequality
probs_valid = model.predict_proba(X_valid)[:, 1]
predictions_valid = (probs_valid >= optimal_threshold).astype(int)
# Per-sample cost: 1 for a false negative, 5 for a false positive, else 0.
cost_values = (1 - predictions_valid) * Y_valid + 5 * (1 - Y_valid) * predictions_valid
variance_of_C = np.var(cost_values)
# The closed form with the 7/3 constant is the empirical Bernstein bound
# (Maurer & Pontil, 2009, Thm. 4), the Bennett/Bernstein-type bound that
# remains valid when the variance is estimated from the same sample.
# For variables with range R it reads, one-sided with probability 1 - d:
#   E - mean <= sqrt(2 * V * log(2 / d) / n) + 7 * R * log(2 / d) / (3 * (n - 1))
# where V is the unbiased sample variance. A two-sided interval at level
# alpha applies it to both tails with d = alpha / 2, hence log(4 / alpha).
sample_variance = np.var(cost_values, ddof=1)
log_term = np.log(4 / alpha)
delta = (
    np.sqrt(2 * sample_variance * log_term / n_valid)
    + 7 * cost_range * log_term / (3 * (n_valid - 1))
)
interval_of_C = (
    cost_at_optimal_threshold_valid - delta,
    cost_at_optimal_threshold_valid + delta
)

# Results
results = {
    "precision_0": precision_0,
    "precision_1": precision_1,
    "recall_0": recall_0,
    "recall_1": recall_1,
    "optimal_threshold": optimal_threshold,
    "cost_at_optimal_threshold": cost_at_optimal_threshold,
    "cost_at_optimal_threshold_valid": cost_at_optimal_threshold_valid,
    "cost_interval_valid": cost_interval_valid,
    "variance_of_C": variance_of_C,
    "interval_of_C": interval_of_C,
}

for key, value in results.items():
    print(f"{key}: {value}")


Explanation:
Data Loading and Preprocessing:

Removed rows with Neutral sentiment.
Mapped positive sentiments to 1 and negative sentiments to 0.
Split the data sequentially (in file order, without shuffling) into train (first 60%), test (next 15%), and validation (remaining 25%) sets.
Pipeline Creation:

Combined CountVectorizer and LogisticRegression into a single Pipeline for streamlined training and inference.
Precision and Recall Calculation:

Used precision_score and recall_score from sklearn to evaluate model performance on the test set.
Cost Function Implementation:

Computed the average per-sample cost for a given threshold using the predicted probabilities, where a false negative costs 1 and a false positive costs 5.
Optimal Threshold Selection:

Evaluated the cost on the test set for 101 evenly spaced thresholds in [0, 1] and selected the one minimizing the cost.
Validation Cost and Confidence Interval:

Evaluated the cost at the selected threshold on the validation set and applied Hoeffding's inequality to obtain a 99% confidence interval around it.
Empirical Variance and Bennett's Confidence Interval:

Computed the empirical variance of the per-sample cost on the validation set and used it to build a second, variance-aware 99% confidence interval with Bennett's inequality.
This solution adheres to the problem requirements and includes all steps for evaluation and optimization.