In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


# Load the tweet text only, capped at 20k rows for speed.
# `usecols` restricts parsing to the single column that is actually used,
# instead of reading every column and discarding the rest afterwards.
df = pd.read_csv('sample.csv', nrows=20000, usecols=['text'])

# 2️⃣ AUTOMATICALLY LABEL CATEGORIES
def label_category(text):
    """Assign a coarse support category to a message via keyword lookup.

    The input is converted to a lowercase string and checked against each
    keyword group in priority order (Billing, then Technical, then Account).
    Matching is by substring, and the first group with any hit wins.

    Returns the category name, or None when no keyword is found so the
    caller can discard the message.
    """
    # Order matters: earlier groups take precedence over later ones.
    rules = (
        ('Billing', ('invoice', 'bill', 'payment', 'charge', 'refund', 'money')),
        ('Technical', ('error', 'bug', 'crash', 'loading', 'website', 'screen',
                       'wifi', 'connection')),
        ('Account', ('account', 'password', 'login', 'profile', 'username',
                     'email', 'reset', 'delete')),
    )

    lowered = str(text).lower()
    for label, keywords in rules:
        for keyword in keywords:
            if keyword in lowered:
                return label

    # No keyword from any group appears in the message.
    return None

# Tag every tweet with a category, then keep only the tweets that got one
# (label_category returns None for messages with no keyword hit).
df['category'] = df['text'].map(label_category)
df = df.dropna(subset=['category']).reset_index(drop=True)

# 3️⃣ TF-IDF FEATURE EXTRACTION
# Vectorizer: at most 30,000 terms, English stop words removed.
tfidf = TfidfVectorizer(max_features=30000, stop_words='english')
y = df['category']
X = tfidf.fit_transform(df['text'])

# 4️⃣ TRAIN LOGISTIC REGRESSION MODEL
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=200).fit(X, y)

# Vocabulary learned by the vectorizer; position i names TF-IDF column i.
feature_names = np.array(tfidf.get_feature_names_out())

# 5️⃣ INTERACTIVE PREDICTION FUNCTION
def analyze_message():
    """Prompt for one support message, classify it, and print the result.

    Reads a line from standard input, vectorizes it with the module-level
    `tfidf`, predicts a category with the module-level `model`, and prints
    the prediction together with up to three words that pushed the model
    toward that category.

    Returns:
        False when the user enters a blank line or 'exit' (any case,
        surrounding whitespace ignored), signalling the caller to stop;
        True otherwise.
    """
    print("\n" + "="*50)
    user_input = input("Enter your support message (or type 'exit' to quit): ")

    # Compare the stripped text so inputs like "exit " or " EXIT" also quit.
    command = user_input.strip()
    if not command or command.lower() == 'exit':
        return False  # stop the loop

    input_tfidf = tfidf.transform([user_input])

    # No stored entries means no word of the message is in the vocabulary.
    if input_tfidf.nnz == 0:
        print("RESULT: Unknown")
        print("REASON: None of these words were in the training data.")
        return True

    # Make prediction
    prediction = model.predict(input_tfidf)[0]

    # Pick the coefficient row that belongs to the predicted class.
    class_index = list(model.classes_).index(prediction)
    if model.coef_.shape[0] == 1:
        # Two-class models keep a single coefficient row, which describes
        # classes_[1]; the weights for classes_[0] are its negation.
        # Indexing coef_ by class_index would raise IndexError here.
        weights = model.coef_[0] if class_index == 1 else -model.coef_[0]
    else:
        weights = model.coef_[class_index]

    # Per-word contribution: TF-IDF value times the class weight.
    row_data = input_tfidf.toarray()[0]
    impact_scores = row_data * weights
    important_indices = np.argsort(impact_scores)[::-1]  # highest first

    # Only words with a positive contribution supported the prediction.
    keywords = [
        f"{feature_names[i]} (score: {impact_scores[i]:.2f})"
        for i in important_indices
        if impact_scores[i] > 0
    ]

    # Output
    print(f"\nPREDICTED CATEGORY: {prediction}")
    print(f"INFLUENTIAL KEYWORDS: {', '.join(keywords[:3])}")  # top 3 words
    print("="*50)
    return True

# 6️⃣ RUN INTERACTIVE LOOP
if __name__ == "__main__":
    print("System Ready. Type 'exit' to stop.")
    # analyze_message() handles one prompt per call and returns a falsy
    # value once the user asks to quit, which ends the loop.
    while analyze_message():
        pass



FileNotFoundError: [Errno 2] No such file or directory: 'pretrained_pipeline.pkl'