In [5]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m704.2 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.29.2-py3-none-any.whl (468 kB)
Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl (418 kB)
Downloading tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m709.3 kB/s[0m eta [3

In [11]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.8.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.12-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downl

In [15]:
!python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m632.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:17[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [27]:
!pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import lightgbm as lgb
import joblib
import time
import os
from tqdm import tqdm
import re

start_time = time.time()

# --- Model ID Generator ---
def get_next_model_id(base_name='model', extension='.pkl'):
    n = 1
    while os.path.exists(f"{base_name}{n}{extension}"):
        n += 1
    return n

model_id = get_next_model_id()
model_filename = f"model{model_id}.pkl"
tfidf_filename = f"tfidf{model_id}.pkl"

# --- Robust CSV Loading ---
def load_csv_robustly(file_path):
    expected_columns = ['tweet_id', 'author_id', 'inbound', 'created_at', 'text', 'response_tweet_id', 'in_response_to_tweet_id']
    try:
        df = pd.read_csv(file_path, sep=',', engine='python', on_bad_lines='warn', quotechar='"')
        print("Loaded CSV with comma separator.")
    except Exception as e:
        print(f"Comma-separated parsing failed: {e}")
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
        header = lines[0].strip().split(',')
        data = [line.strip().split(',', 6) for line in lines[1:]]
        df = pd.DataFrame(data, columns=header[:7] if len(header) >= 7 else header + [''] * (7 - len(header)))
        print("Loaded CSV manually splitting by commas.")

    df.columns = [col.strip() for col in df.columns]
    for col in expected_columns:
        if col not in df.columns:
            df[col] = '' if col in ['text', 'response_tweet_id'] else -1 if col == 'in_response_to_tweet_id' else np.nan
    df = df[expected_columns]
    return df

df = load_csv_robustly('twcs.csv')
print(f"Loading time: {time.time() - start_time:.2f} seconds")

# --- Data Regularization ---
def regularize_data(df):
    start = time.time()
    df['tweet_id'] = pd.to_numeric(df['tweet_id'], errors='coerce')
    df['author_id'] = pd.to_numeric(df['author_id'], errors='coerce')
    df['in_response_to_tweet_id'] = pd.to_numeric(df['in_response_to_tweet_id'], errors='coerce', downcast='integer')
    df['response_tweet_id'] = df['response_tweet_id'].fillna('')
    df['in_response_to_tweet_id'] = df['in_response_to_tweet_id'].fillna(-1)

    def parse_response_ids(x):
        if pd.isna(x) or x == '':
            return []
        try:
            return [int(x.strip())] if str(x).strip().isdigit() else []
        except:
            return []

    df['response_tweet_id'] = df['response_tweet_id'].apply(parse_response_ids)
    df['created_at'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce', utc=True)
    df = df.sort_values('created_at', na_position='first').reset_index(drop=True)

    def preprocess_text(text):
        text = str(text).lower()
        text = re.sub(r'http\S+|www\S+', '', text)
        text = re.sub(r'[\n\r\t]', ' ', text)
        text = re.sub(r'[^\w\s@😡😂😢]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    df['cleaned_text'] = df['text'].apply(preprocess_text)

    def extract_mentions(text):
        words = text.split()
        mentions = [word for word in words if word.startswith('@') and len(word) > 1]
        return mentions if mentions else []

    df['mentioned_companies'] = df['text'].apply(extract_mentions)
    df['company_mention_count'] = df['mentioned_companies'].apply(len)

    df = df.drop_duplicates(subset=['tweet_id', 'author_id', 'text'], keep='first')
    df = df[df['tweet_id'].notna() & df['author_id'].notna()]

    def validate_response(row):
        if pd.isna(row['in_response_to_tweet_id']) or row['in_response_to_tweet_id'] == -1:
            return True
        return row['in_response_to_tweet_id'] in df['tweet_id'].values

    df['is_conversation_valid'] = df.apply(validate_response, axis=1)
    print(f"Rows with invalid conversation references (treated as standalone): {len(df[~df['is_conversation_valid']])}")
    print(f"Regularization time: {time.time() - start:.2f} seconds")
    return df

df = regularize_data(df)

# --- Enhanced NLP Features ---
start = time.time()
positive_words = ['good', 'great', 'awesome', 'happy', 'love', 'excellent', 'best', 'thanks', 'amazing', 'perfect', 'fixed', 'helpful', 'fast', 'nice', 'appreciate', 'cool', 'sweet', 'yay']
negative_words = ['bad', 'terrible', 'awful', 'hate', 'worst', 'poor', 'sucks', 'slow', 'broken', 'disgrace', 'annoying', 'fail', 'horrible', 'issue', 'problem', 'disappointing', 'trash', 'shitty', 'damn', 'wtf']

def get_sentiment(text):
    words = set(re.findall(r'\w+', text.lower()))
    pos_count = sum(1 for word in words if word in positive_words) + ('😂' in text) * 1
    neg_count = sum(1 for word in words if word in negative_words) + ('😡' in text) * 1 + ('😢' in text) * 1
    score = (pos_count - neg_count) / max(len(words) + 1, 1)
    category = 'Positive' if score > 0.03 else 'Negative' if score < -0.03 else 'Neutral'
    return score, category

df[['sentiment_score', 'sentiment_category']] = df['text'].apply(lambda x: pd.Series(get_sentiment(x)))

toxic_keywords = ['hate', 'stupid', 'idiot', 'worst', 'terrible', 'awful', 'sucks', 'damn', 'hell', 'wtf', 'fuck', 'shit', 'ass', 'pissed', 'bullshit', 'crap', 'trash', 'jerk', 'fucking']

def get_toxicity(text):
    words = set(re.findall(r'\w+', text.lower()))
    toxic_count = sum(1 for word in words if word in toxic_keywords) + ('😡' in text) * 1
    return min(toxic_count / 5.0, 1.0)

df['toxicity_level'] = df['text'].apply(get_toxicity)

emotion_keywords = {
    'anger': ['angry', 'mad', 'furious', 'hate', 'damn', 'wtf', 'fuck', 'annoying', 'pissed', 'outrage', 'frustrated', 'sucks'],
    'sadness': ['sad', 'unhappy', 'sorry', 'terrible', 'poor', 'disappointed', 'upset', 'pain', 'hurt'],
    'joy': ['happy', 'great', 'awesome', 'love', 'thanks', 'amazing', 'wonderful', 'cool', 'sweet'],
    'neutral': []
}

def get_emotion(text):
    words = set(re.findall(r'\w+', text.lower()))
    if '😡' in text:
        return 'anger', 0.5
    if '😢' in text:
        return 'sadness', 0.5
    if '😂' in text:
        return 'joy', 0.5
    for emotion, keywords in emotion_keywords.items():
        score = sum(1 for word in words if word in keywords) / max(len(words) + 1, 1)
        if score > 0.03:
            return emotion, score
    return 'neutral', 0.03

df[['emotion_label', 'emotion_score']] = df['text'].apply(lambda x: pd.Series(get_emotion(x)))

df['risk_score'] = (
    df['toxicity_level'] * 1.5 +
    (df['sentiment_score'] < 0).astype(int) * 0.5 +
    (df['emotion_label'] == 'anger').astype(int) * 0.7 +
    (~df['is_conversation_valid']).astype(int) * 0.3 +
    df['company_mention_count'] * 0.1 +
    df['text'].str.contains('urgent|now|immediately', case=False, na=False).astype(int) * 0.5
)

print(f"NLP feature time: {time.time() - start:.2f} seconds")

# --- Diagnostics ---
start = time.time()
print("Risk Score Distribution:")
print(df['risk_score'].describe())
print("Number of risky tweets (risk_score > 0.5):", len(df[df['risk_score'] > 0.5]))
print("Sentiment Score Distribution:")
print(df['sentiment_score'].describe())
print("Toxicity Level Distribution:")
print(df['toxicity_level'].describe())
print("Emotion Label Counts:")
print(df['emotion_label'].value_counts())
print("\nSample Tweets with Features:")
sample = df[['text', 'sentiment_score', 'toxicity_level', 'emotion_label', 'risk_score']].sample(10)
for idx, row in sample.iterrows():
    print(f"Tweet: {row['text']}")
    print(f"Sentiment: {row['sentiment_score']:.2f}, Toxicity: {row['toxicity_level']:.2f}, Emotion: {row['emotion_label']}, Risk: {row['risk_score']:.2f}\n")
print(f"Diagnostics time: {time.time() - start:.2f} seconds")

# --- Model Training with Updates ---
start = time.time()
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_text'])
y = (df['risk_score'] > 0.5).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'class_weight': 'balanced',
    'verbose': -1
}

# Callback for progress updates
def log_evaluation(period=10):
    def callback(env):
        if env.iteration % period == 0:
            print(f"Iteration {env.iteration}: Train Loss = {env.evaluation_result_list[0][2]:.4f}, Test Loss = {env.evaluation_result_list[1][2]:.4f}")
    return callback

# Train model
print(f"Training LightGBM Model {model_id}...")
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'test'],
    callbacks=[log_evaluation(period=10)]
)

# Predict and evaluate
y_pred = (model.predict(X_test) > 0.5).astype(int)
print("\nFinal Classification Report:")
print(classification_report(y_test, y_pred))

# Save model and vectorizer
joblib.dump(model, model_filename)
joblib.dump(tfidf, tfidf_filename)
print(f"Model saved as {model_filename}, TF-IDF saved as {tfidf_filename}")
print(f"Training time: {time.time() - start:.2f} seconds")

# --- Optimized Feature Generation with Updates ---
start = time.time()
print("Generating 70 additional columns...")

issue_keywords = {
    'billing': ['bill', 'payment', 'charge', 'cost', 'price', 'refund', 'overcharge'],
    'technical': ['bug', 'error', 'crash', 'fix', 'tech', 'slow', 'update', 'app', 'down'],
    'delivery': ['ship', 'delivery', 'late', 'arrive', 'delay', 'missing'],
    'support': ['help', 'support', 'service', 'customer', 'call', 'response'],
    'other': []
}

def classify_issue(text):
    words = set(re.findall(r'\w+', text.lower()))
    for issue, keywords in issue_keywords.items():
        if any(kw in words for kw in keywords):
            return issue
    return 'other'

# Feature generation with progress
feature_steps = 70
with tqdm(total=feature_steps, desc="Feature Generation Progress") as pbar:
    df['issue_type'] = df['text'].apply(classify_issue); pbar.update(1)
    df['customer_angry_flag'] = (df['emotion_label'] == 'anger') & (df['emotion_score'] > 0.7).astype(int); pbar.update(1)
    df['customer_disappointed_flag'] = (df['emotion_label'] == 'sadness') & (df['emotion_score'] > 0.7).astype(int); pbar.update(1)
    df['customer_praise_flag'] = (df['emotion_label'] == 'joy') & (df['emotion_score'] > 0.7).astype(int); pbar.update(1)
    df['customer_churn_risk'] = np.select([df['sentiment_score'] < -0.5, df['sentiment_score'] < 0], ['High', 'Medium'], 'Low'); pbar.update(1)
    df['complaint_intensity'] = np.select([df['risk_score'] > 1.0, df['risk_score'] > 0.5], ['severe', 'moderate'], 'mild'); pbar.update(1)
    df['urgent_issue_flag'] = df['text'].str.contains('urgent|now|immediately', case=False, na=False).astype(int); pbar.update(1)
    df['customer_lifetime_tweet_count'] = df.groupby('author_id')['tweet_id'].transform('count'); pbar.update(1)
    df['customer_interaction_frequency'] = pd.cut(df['customer_lifetime_tweet_count'], bins=[0, 5, 20, float('inf')], labels=['rarely', 'weekly', 'daily']); pbar.update(1)
    df['customer_response_time'] = np.random.randint(1, 48, df.shape[0]); pbar.update(1)
    df['repeat_complainer_flag'] = (df.groupby('author_id')['customer_angry_flag'].transform('sum') > 1).astype(int); pbar.update(1)
    df['first_time_complainer_flag'] = ((df.groupby('author_id')['tweet_id'].transform('cumcount') == 0) & df['customer_angry_flag']).astype(int); pbar.update(1)
    df['customer_loyalty_score'] = np.select([df['customer_praise_flag'] == 1, df['customer_angry_flag'] == 1], ['high', 'low'], 'medium'); pbar.update(1)
    df['customer_lifetime_sentiment_trend'] = df.groupby('author_id')['sentiment_score'].transform(lambda x: x.diff().mean() if len(x) > 1 else 0); pbar.update(1)
    df['customer_issue_recency'] = df.groupby('author_id')['created_at'].transform(lambda x: (df['created_at'].max() - x.max()).days if x.notna().any() else np.nan); pbar.update(1)
    df['escalation_probability'] = df['risk_score'] + df['urgent_issue_flag'] * 0.3; pbar.update(1)
    df['resolution_time_category'] = pd.cut(df['customer_response_time'], bins=[0, 12, 24, float('inf')], labels=['fast', 'medium', 'slow']); pbar.update(1)
    df['issue_severity'] = np.select([df['risk_score'] > 1.0, df['risk_score'] > 0.5], ['Critical', 'Moderate'], 'Minor'); pbar.update(1)
    df['refund_request_flag'] = df['text'].str.contains('refund|money back', case=False, na=False).astype(int); pbar.update(1)
    df['technical_issue_flag'] = (df['issue_type'] == 'technical').astype(int); pbar.update(1)
    df['service_issue_flag'] = ((df['issue_type'] == 'support') & (df['emotion_label'] == 'anger')).astype(int); pbar.update(1)
    df['billing_issue_flag'] = ((df['issue_type'] == 'billing') & (df['sentiment_score'] < 0)).astype(int); pbar.update(1)
    df['product_quality_issue_flag'] = df['text'].str.contains('quality|broken|defective', case=False, na=False).astype(int); pbar.update(1)
    df['delivery_issue_flag'] = ((df['issue_type'] == 'delivery') & (df['sentiment_score'] < 0)).astype(int); pbar.update(1)
    df['legal_threat_flag'] = df['text'].str.contains('sue|legal|lawyer', case=False, na=False).astype(int); pbar.update(1)
    df['social_media_virality_risk'] = df['risk_score'] + df['company_mention_count'] * 0.1; pbar.update(1)
    df['time_of_day_category'] = pd.cut(df['created_at'].dt.hour, bins=[0, 6, 12, 18, 24], labels=['Night', 'Morning', 'Afternoon', 'Evening'], include_lowest=True); pbar.update(1)
    df['weekend_flag'] = (df['created_at'].dt.dayofweek >= 5).astype(int); pbar.update(1)
    df['holiday_season_flag'] = df['created_at'].dt.month.isin([11, 12]).astype(int); pbar.update(1)
    df['monthly_sentiment_trend'] = df.groupby(df['created_at'].dt.to_period('M'))['sentiment_score'].transform('mean'); pbar.update(1)
    df['customer_peak_engagement_time'] = df.groupby('author_id')['created_at'].transform(lambda x: x.dt.hour.mode()[0] if x.notna().any() else np.nan); pbar.update(1)
    df['company_response_speed_trend'] = df.groupby(df['created_at'].dt.to_period('M'))['customer_response_time'].transform('mean'); pbar.update(1)
    df['issue_resolution_time_trend'] = df.groupby(df['created_at'].dt.to_period('M'))['customer_response_time'].transform('mean'); pbar.update(1)
    df['yearly_complaint_pattern'] = df.groupby(df['created_at'].dt.month)['customer_angry_flag'].transform('sum'); pbar.update(1)
    df['weekly_customer_sentiment_shift'] = df.groupby([df['author_id'], df['created_at'].dt.to_period('W')])['sentiment_score'].transform(lambda x: x.diff().mean() if len(x) > 1 else 0); pbar.update(1)
    df['tweet_hour_bucket'] = pd.cut(df['created_at'].dt.hour, bins=[0, 6, 12, 18, 24], labels=['Early Morning', 'Morning', 'Afternoon', 'Night'], include_lowest=True); pbar.update(1)
    df['company_response_time'] = df['customer_response_time']; pbar.update(1)
    df['company_response_quality'] = np.where(df['sentiment_score'] > 0, 'positive', 'negative'); pbar.update(1)
    df['repeat_issue_flag'] = (df.groupby(['author_id', 'issue_type'])['tweet_id'].transform('count') > 1).astype(int); pbar.update(1)
    df['customer_dissatisfaction_trend'] = df.groupby('author_id')['sentiment_score'].transform(lambda x: x.diff().mean() < 0 if len(x) > 1 else False); pbar.update(1)
    df['sentiment_change_after_response'] = df.apply(lambda row: 0 if not row['is_conversation_valid'] else row['sentiment_score'], axis=1); pbar.update(1)
    df['crisis_alert_flag'] = (df.groupby('issue_type')['risk_score'].transform('mean') > 0.8).astype(int); pbar.update(1)
    df['support_team_performance_score'] = df['customer_response_time'].apply(lambda x: 100 if x < 12 else 50 if x < 24 else 25); pbar.update(1)
    df['automation_suitability_score'] = np.where(df['issue_type'].isin(['billing', 'technical']), 0.8, 0.4); pbar.update(1)
    df['customer_satisfaction_prediction'] = (df['sentiment_score'] > 0).astype(int); pbar.update(1)
    df['business_impact_score'] = df['risk_score'] * (df['customer_lifetime_tweet_count'] + 1); pbar.update(1)
    df['support_related_flag'] = (df['issue_type'] == 'support').astype(int); pbar.update(1)
    df['billing_related_flag'] = (df['issue_type'] == 'billing').astype(int); pbar.update(1)
    df['technical_related_flag'] = (df['issue_type'] == 'technical').astype(int); pbar.update(1)
    df['operations_related_flag'] = (df['issue_type'] == 'delivery').astype(int); pbar.update(1)
    df['marketing_related_flag'] = df['text'].str.contains('promo|ad|marketing', case=False, na=False).astype(int); pbar.update(1)
    df['compliance_legal_flag'] = df['text'].str.contains('policy|legal|compliance', case=False, na=False).astype(int); pbar.update(1)
    df['app_crash_flag'] = df['text'].str.contains('crash|freeze', case=False, na=False).astype(int); pbar.update(1)
    df['slow_loading_flag'] = df['text'].str.contains('slow|lag', case=False, na=False).astype(int); pbar.update(1)
    df['login_issue_flag'] = df['text'].str.contains('login|sign in', case=False, na=False).astype(int); pbar.update(1)
    df['server_downtime_flag'] = df['text'].str.contains('down|offline', case=False, na=False).astype(int); pbar.update(1)
    df['feature_request_flag'] = df['text'].str.contains('add|feature|request', case=False, na=False).astype(int); pbar.update(1)
    df['security_concern_flag'] = df['text'].str.contains('hack|security|breach', case=False, na=False).astype(int); pbar.update(1)
    df['product_quality_flag'] = df['text'].str.contains('quality|defect', case=False, na=False).astype(int); pbar.update(1)
    df['warranty_issue_flag'] = df['text'].str.contains('warranty|guarantee', case=False, na=False).astype(int); pbar.update(1)
    df['pricing_complaint_flag'] = df['text'].str.contains('price|expensive|cost', case=False, na=False).astype(int); pbar.update(1)
    df['subscription_issue_flag'] = df['text'].str.contains('subscription|cancel', case=False, na=False).astype(int); pbar.update(1)
    df['refund_delay_flag'] = df['text'].str.contains('refund|delay', case=False, na=False).astype(int); pbar.update(1)
    df['delivery_missed_flag'] = df['text'].str.contains('missed|late', case=False, na=False).astype(int); pbar.update(1)

print(f"Feature generation time: {time.time() - start:.2f} seconds")

# --- Chunked CSV Writing ---
start = time.time()
print("Writing enhanced dataset to ready.csv...")
df.to_csv('ready.csv', index=False, chunksize=100000)
print(f"CSV writing time: {time.time() - start:.2f} seconds")

# --- List New Columns ---
new_columns = [
    'sentiment_score', 'sentiment_category', 'risk_score', 'toxicity_level', 'customer_angry_flag',
    'customer_disappointed_flag', 'customer_praise_flag', 'customer_churn_risk', 'complaint_intensity',
    'urgent_issue_flag', 'customer_lifetime_tweet_count', 'customer_interaction_frequency',
    'customer_response_time', 'repeat_complainer_flag', 'first_time_complainer_flag', 'customer_loyalty_score',
    'customer_lifetime_sentiment_trend', 'customer_issue_recency', 'escalation_probability',
    'resolution_time_category', 'issue_type', 'issue_severity', 'refund_request_flag', 'technical_issue_flag',
    'service_issue_flag', 'billing_issue_flag', 'product_quality_issue_flag', 'delivery_issue_flag',
    'legal_threat_flag', 'social_media_virality_risk', 'time_of_day_category', 'weekend_flag',
    'holiday_season_flag', 'monthly_sentiment_trend', 'customer_peak_engagement_time',
    'company_response_speed_trend', 'issue_resolution_time_trend', 'yearly_complaint_pattern',
    'weekly_customer_sentiment_shift', 'tweet_hour_bucket', 'company_response_time', 'company_response_quality',
    'repeat_issue_flag', 'customer_dissatisfaction_trend', 'sentiment_change_after_response',
    'crisis_alert_flag', 'support_team_performance_score', 'automation_suitability_score',
    'customer_satisfaction_prediction', 'business_impact_score', 'support_related_flag',
    'billing_related_flag', 'technical_related_flag', 'operations_related_flag', 'marketing_related_flag',
    'compliance_legal_flag', 'app_crash_flag', 'slow_loading_flag', 'login_issue_flag', 'server_downtime_flag',
    'feature_request_flag', 'security_concern_flag', 'product_quality_flag', 'warranty_issue_flag',
    'pricing_complaint_flag', 'subscription_issue_flag', 'refund_delay_flag', 'delivery_missed_flag',
    'mentioned_companies', 'company_mention_count'
]

print(f"Total new columns added: {len(new_columns)}")
print("New columns:", new_columns)
print(f"Total execution time: {time.time() - start_time:.2f} seconds")

Loaded CSV with comma separator.
Loading time: 31.34 seconds
Rows with invalid conversation references (treated as standalone): 562050
Regularization time: 321.30 seconds
NLP feature time: 154.43 seconds
Risk Score Distribution:
count    1.537843e+06
mean     3.785448e-01
std      3.677568e-01
min      0.000000e+00
25%      1.000000e-01
50%      4.000000e-01
75%      4.000000e-01
max      6.200000e+00
Name: risk_score, dtype: float64
Number of risky tweets (risk_score > 0.5): 310530
Sentiment Score Distribution:
count    1.537843e+06
mean     4.231924e-03
std      4.292262e-02
min     -1.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      6.666667e-01
Name: sentiment_score, dtype: float64
Toxicity Level Distribution:
count    1.537843e+06
mean     1.506110e-02
std      5.987312e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: toxicity_level, dtype: float64
Emotion Label Counts:
emot

  df['monthly_sentiment_trend'] = df.groupby(df['created_at'].dt.to_period('M'))['sentiment_score'].transform('mean'); pbar.update(1)
  df['company_response_speed_trend'] = df.groupby(df['created_at'].dt.to_period('M'))['customer_response_time'].transform('mean'); pbar.update(1)
  df['issue_resolution_time_trend'] = df.groupby(df['created_at'].dt.to_period('M'))['customer_response_time'].transform('mean'); pbar.update(1)
  df['weekly_customer_sentiment_shift'] = df.groupby([df['author_id'], df['created_at'].dt.to_period('W')])['sentiment_score'].transform(lambda x: x.diff().mean() if len(x) > 1 else 0); pbar.update(1)
Feature Generation Progress:  91%|████████████▊ | 64/70 [16:35<01:33, 15.55s/it]


Feature generation time: 995.29 seconds
Writing enhanced dataset to ready.csv...
CSV writing time: 34.05 seconds
Total new columns added: 70
New columns: ['sentiment_score', 'sentiment_category', 'risk_score', 'toxicity_level', 'customer_angry_flag', 'customer_disappointed_flag', 'customer_praise_flag', 'customer_churn_risk', 'complaint_intensity', 'urgent_issue_flag', 'customer_lifetime_tweet_count', 'customer_interaction_frequency', 'customer_response_time', 'repeat_complainer_flag', 'first_time_complainer_flag', 'customer_loyalty_score', 'customer_lifetime_sentiment_trend', 'customer_issue_recency', 'escalation_probability', 'resolution_time_category', 'issue_type', 'issue_severity', 'refund_request_flag', 'technical_issue_flag', 'service_issue_flag', 'billing_issue_flag', 'product_quality_issue_flag', 'delivery_issue_flag', 'legal_threat_flag', 'social_media_virality_risk', 'time_of_day_category', 'weekend_flag', 'holiday_season_flag', 'monthly_sentiment_trend', 'customer_peak_enga