In [None]:
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

In [None]:
# Attach Google Drive so the CSV paths under /content/drive resolve
drive.mount(mountpoint='/content/drive')

In [None]:
# Read the hand-labeled comments and the full comment dump, in that order
k_labeled_df, k_unlabeled_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/labeled_k_comments.csv',
        '/content/drive/MyDrive/kendrick_comment_data.csv',
    )
)

In [None]:
# The labeled file names its text column 'Comments'; rename it to 'comment',
# the key used for the merge below
k_labeled_df = k_labeled_df.rename({'Comments': 'comment'}, axis='columns')

In [None]:
# Right join on the comment text: every row of k_unlabeled_df is kept, and the
# columns from k_labeled_df are filled in wherever the comment text matches
merged_df = pd.merge(k_labeled_df, k_unlabeled_df, how='right', on='comment')

In [None]:
# Partition on whether a label is present, keeping one row per distinct comment
has_label = merged_df['label'].notna()
labeled_df = merged_df.loc[has_label].drop_duplicates(subset='comment', keep='first')
unlabeled_df = merged_df.loc[~has_label].drop_duplicates(subset='comment', keep='first')

In [None]:
# Ensure column order consistency
column_order = ['video_id', 'comment_id', 'comment', 'like_count', 'reply_count', 'published_at', 'label', 'confidence score']
merged_df = merged_df[column_order]
# .copy() detaches these frames from merged_df: later cells add columns to them
# (encoded_label, tsne_1, tsne_2), and assigning onto a selection of a slice is
# the pattern that triggers pandas' SettingWithCopyWarning.
unlabeled_df = unlabeled_df[column_order].copy()
labeled_df = labeled_df[column_order].copy()

In [None]:
# Persist the still-unlabeled comments (no index column) for later labeling rounds
unlabeled_df.to_csv(
    path_or_buf='/content/drive/MyDrive/unlabeled_kendrick_comments.csv',
    index=False,
)

In [None]:
# Function to train the model
def train_model(df):
    """Fit a TF-IDF + logistic-regression classifier on labeled comments.

    Rows missing either the comment text or the label are left out of
    training, mirroring the NaN handling in ``predict_and_filter``.

    Args:
        df: DataFrame with a 'comment' column (text) and a 'label' column.
            An 'encoded_label' column is written onto this frame in place:
            the integer code of each row's label, or <NA> for skipped rows.

    Returns:
        (pipeline, label_encoder): the fitted Pipeline and the LabelEncoder
        that maps the integer codes back to the original labels.

    Raises:
        ValueError: if no row has both a comment and a label.
    """
    # Boolean mask (positional) of rows that can actually be trained on
    usable = (df['comment'].notna() & df['label'].notna()).to_numpy()
    if not usable.any():
        raise ValueError("train_model needs at least one row with both 'comment' and 'label'")

    label_encoder = LabelEncoder()
    X_train = df.loc[usable, 'comment']
    y_train = label_encoder.fit_transform(df.loc[usable, 'label'])

    # Keep the in-place 'encoded_label' column that callers see on their frame.
    if usable.all():
        df['encoded_label'] = y_train
    else:
        # Nullable integers so skipped rows are <NA> without turning codes into floats
        encoded = pd.array([pd.NA] * len(df), dtype='Int64')
        encoded[usable] = y_train
        df['encoded_label'] = encoded

    pipeline = Pipeline([
        # Unigrams and bigrams, capped at 5000 features
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        # class_weight='balanced' reweights classes by inverse frequency
        ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
    ])

    pipeline.fit(X_train, y_train)
    return pipeline, label_encoder

In [None]:
# Function to predict and filter low-confidence predictions
def predict_and_filter(df, pipeline, label_encoder):
    """Score comments with a fitted pipeline and attach label and confidence.

    Despite the name, no confidence filtering happens here; the only rows
    removed are those whose 'comment' is missing. Thresholding is left to
    the caller.

    Args:
        df: DataFrame with a 'comment' column. It is not modified.
        pipeline: fitted classifier exposing ``predict_proba`` and ``classes_``.
        label_encoder: fitted encoder whose ``inverse_transform`` maps class
            codes back to label names.

    Returns:
        A new DataFrame (rows with a missing comment removed) with two added
        columns: 'predicted_label' and 'predicted_confidence' (the highest
        class probability for the row).
    """
    # Copy so the new columns are not written onto a slice of the caller's frame
    scored = df.dropna(subset=['comment']).copy()

    if scored.empty:
        # Nothing to score: return the expected columns, just empty
        scored['predicted_label'] = pd.Series(dtype='object')
        scored['predicted_confidence'] = pd.Series(dtype='float64')
        return scored

    pred_probs = pipeline.predict_proba(scored['comment'])

    max_confidence = pred_probs.max(axis=1)
    # argmax is a column position in pred_probs; classes_ gives the class code
    # that column stands for, which is what the encoder can invert.
    best_columns = pred_probs.argmax(axis=1)
    pred_codes = pipeline.classes_[best_columns]

    scored['predicted_label'] = label_encoder.inverse_transform(pred_codes)
    scored['predicted_confidence'] = max_confidence

    return scored

In [None]:
# Round 1: fit the classifier and its label encoder on the labeled comments
pipeline, label_encoder = train_model(df=labeled_df)

In [None]:
# Score every unlabeled comment; the result replaces unlabeled_df
unlabeled_df = predict_and_filter(
    df=unlabeled_df,
    pipeline=pipeline,
    label_encoder=label_encoder,
)

In [None]:
# Keep predictions whose top probability is below 0.7, least confident first
low_conf_df = (
    unlabeled_df
    .loc[unlabeled_df['predicted_confidence'].lt(0.7)]
    .sort_values(by='predicted_confidence')
)

In [None]:
# Write at most 200 of the least-confident rows (text, predicted label,
# confidence) to Drive for manual labeling
low_conf_df.head(200)[['comment', 'predicted_label', 'predicted_confidence']].to_csv(
    path_or_buf="/content/drive/MyDrive/k_to_label_batch.csv",
    index=False,
)

#### Round 2 Training

In [None]:
# Read the newly labeled batch back in from Drive
labeled_batch = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/new_labeled_batch.csv"
)

In [None]:
# Add to labeled data and remove labeled rows from unlabeled data
# Only rows that carry both a comment and a label count as newly labeled;
# unfinished rows stay in the unlabeled pool instead of reaching the label encoder.
new_labeled_rows = labeled_batch.dropna(subset=['comment', 'label'])
# De-duplicating on 'comment' keeps this cell safe to re-run: a second run
# does not append the same batch again.
labeled_df = pd.concat([labeled_df, new_labeled_rows], ignore_index=True).drop_duplicates(
    subset='comment', keep='first', ignore_index=True
)
unlabeled_df = unlabeled_df[~unlabeled_df['comment'].isin(new_labeled_rows['comment'])]

In [None]:
# Re-encode labels and retrain
# Go through train_model rather than refitting by hand, so round 2 uses the
# same vectorizer and classifier settings as round 1. train_model also rewrites
# labeled_df['encoded_label'] for the enlarged label set.
pipeline, label_encoder = train_model(labeled_df)

In [None]:
# Round 2: re-score the remaining unlabeled comments with the retrained model
unlabeled_df = predict_and_filter(
    df=unlabeled_df,
    pipeline=pipeline,
    label_encoder=label_encoder,
)

#### Visualizations

In [None]:
# Bar chart of how many unlabeled comments received each predicted sentiment
unlabeled_df['predicted_label'].value_counts().plot.bar(
    title='Predicted Sentiment Distribution',
    xlabel='Sentiment',
    ylabel='Count',
    color='skyblue',
    edgecolor='black',
)
plt.grid(axis='y')
plt.show()

In [None]:
# Box plot of the confidence scores, one box per predicted sentiment
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=unlabeled_df, x='predicted_label', y='predicted_confidence', ax=ax)
ax.set_title('Confidence by Predicted Sentiment')
ax.set_xlabel('Predicted Sentiment')
ax.set_ylabel('Confidence Score')
ax.grid(True)
plt.show()

In [None]:
# t-SNE visualization
# Reuse the fitted vectorizer from the pipeline, then embed the dense TF-IDF
# matrix in 2-D (fixed random_state for repeatable layouts)
tfidf = pipeline.named_steps['tfidf']
X_features = tfidf.transform(unlabeled_df['comment'])
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = tsne.fit_transform(X_features.toarray())

# Store the two embedding coordinates next to each comment's prediction
unlabeled_df['tsne_1'] = X_embedded[:, 0]
unlabeled_df['tsne_2'] = X_embedded[:, 1]

# Scatter the embedding, colored by predicted sentiment
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=unlabeled_df,
    x='tsne_1',
    y='tsne_2',
    hue='predicted_label',
    alpha=0.7,
    palette='Set2',
    ax=ax,
)
ax.set_title('t-SNE Visualization of Predicted Sentiments')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.legend(title='Predicted Sentiment')
ax.grid(True)
plt.show()

In [None]:
# Persist the labeled set as it stands after round 2 (no index column)
labeled_df.to_csv(
    path_or_buf='/content/drive/MyDrive/round2_labeled_comments.csv',
    index=False,
)

In [None]:
# Round 2 export: predictions under 0.7 confidence, least confident first,
# with at most 200 rows written out for the next manual labeling pass
low_conf_df_2 = (
    unlabeled_df
    .loc[unlabeled_df['predicted_confidence'].lt(0.7)]
    .sort_values(by='predicted_confidence')
)
low_conf_df_2.head(200)[['comment', 'predicted_label', 'predicted_confidence']].to_csv(
    path_or_buf="/content/drive/MyDrive/round2_to_label_batch.csv",
    index=False,
)