In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE


In [6]:
# === 1. Load Dataset ===
# Read the combined sarcasm CSV into a DataFrame; every later cell works from `df`.
dataset_path = 'datasets/combined_sarcasm1.csv'
df = pd.read_csv(dataset_path)

In [7]:
# Drop missing values
# Keep only rows where both the tweet text and its sarcasm label are present.
required_columns = ['tweet', 'sarcasm']
df = df.dropna(subset=required_columns)

In [8]:
# === 2. Feature and Label Extraction ===
# X: the 'tweet' column (model input); y: the 'sarcasm' column (target label).
X, y = df['tweet'], df['sarcasm']

In [9]:
# === 3. TF-IDF Vectorization ===
vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = vectorizer.fit_transform(X)

In [10]:
# === 4. Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, stratify=y, random_state=42
)


In [11]:
# Class balance of the data actually used for modelling.
# Reuses the `df` that was loaded and cleaned in the earlier cells rather than
# re-reading the CSV: reloading here would rebind `df` to the raw frame, silently
# undoing the dropna for every later cell and reporting counts on different rows.
label_counts = df['sarcasm'].value_counts()

# Display the distribution of sarcasm labels
print("Class distribution:")
print(label_counts)

# Optional: Display percentage
print("\nClass distribution in percentage:")
print(df['sarcasm'].value_counts(normalize=True) * 100)


Class distribution:
sarcasm
1.0    506081
0.0    505557
Name: count, dtype: int64

Class distribution in percentage:
sarcasm
1.0    50.025899
0.0    49.974101
Name: proportion, dtype: float64


In [12]:
# Initialize and train the model
# Logistic regression on the TF-IDF training features; the solver is allowed
# up to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)


In [13]:
import os
import joblib

# Make sure both output folders exist (no error if they are already there)
for output_dir in ('models', 'vectorizers'):
    os.makedirs(output_dir, exist_ok=True)

# Persist the fitted classifier and the fitted vectorizer side by side so the
# same text -> features -> label pipeline can be reloaded later.
joblib.dump(model, 'models/sarcasm_model.pkl')
joblib.dump(vectorizer, 'vectorizers/sarcasm_vectorizer.pkl')



['vectorizers/sarcasm_vectorizer.pkl']

In [14]:

# Predictions and evaluation
# Predict labels for the held-out split and print the classification report
# comparing them with the true labels.
y_pred = model.predict(X_test)
print("=== Classification Report ===")
report_text = classification_report(y_test, y_pred)
print(report_text)



=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.68      0.73      0.70    101112
         1.0       0.71      0.65      0.68    101216

    accuracy                           0.69    202328
   macro avg       0.69      0.69      0.69    202328
weighted avg       0.69      0.69      0.69    202328



In [None]:
# --- Prediction Interface ---
def predict_sarcasm(text):
    """Predict the sarcasm label for a single piece of text.

    The text is wrapped in a one-element list, transformed with the
    module-level ``vectorizer`` and passed to the module-level ``model``.

    Args:
        text: The tweet (string) to classify.

    Returns:
        The first (and only) element of ``model.predict`` for this input;
        the caller treats a value equal to 1 as "Sarcastic".
    """
    features = vectorizer.transform([text])
    return model.predict(features)[0]

# Interactive mode
# Repeatedly prompt for a tweet and print the predicted label until the user
# types 'exit'. NOTE: under Jupyter/IPython this guard is normally true, so
# running the cell blocks on input() until the loop is exited.
if __name__ == "__main__":
    prompt = "\nEnter a tweet to classify as sarcastic (1) or not (0) (type 'exit' to quit):\n> "
    while True:
        try:
            # Strip surrounding whitespace so "exit " / " EXIT" still quit.
            user_input = input(prompt).strip()
        except (EOFError, KeyboardInterrupt):
            # stdin closed or the kernel was interrupted: leave the loop
            # cleanly instead of surfacing a traceback.
            break
        if user_input.lower() == 'exit':
            break
        if not user_input:
            # Nothing to classify; ask again rather than scoring an empty string.
            continue
        label = predict_sarcasm(user_input)
        print(f"Predicted Label: {'Sarcastic' if label == 1 else 'Not Sarcastic'}")

Predicted Label: Not Sarcastic


: 