# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensorflow.keras import layers
from emoji import demojize


# Location of the training CSV. This is an absolute, machine-specific path;
# adjust it when running on a different machine.
train_file = r"C:\Users\Swapnil\Downloads\train.En (1).csv"

# Lowercase English contractions mapped to their expanded forms.
#
# Ordering rule: when a key contains another key (e.g. "can't've" contains
# "can't"), the LONGER key must be listed first. A consumer that walks this
# dict in insertion order and applies str.replace would otherwise rewrite
# the shorter key first and the longer entry could never match.
contractions = {
    "ain't": "is not",
    "aren't": "are not",
    "can't've": "cannot have",  # must precede "can't" (see ordering rule)
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there would",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
}

# Load and preprocess the data
def load_data(filename):
    """Load the tweet CSV, clean the text, and build bag-of-words splits.

    The "tweet" column is lowercased, emojis are converted to their text
    names, and contractions are expanded. The rows are then split 80/20
    (fixed seed) and turned into dense token-count matrices.

    Args:
        filename: Path of a CSV file containing a "tweet" text column and a
            "sarcastic" label column.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, vectorizer)`` where the
        X values are dense count arrays, the y values are label arrays, and
        ``vectorizer`` is the CountVectorizer fitted on the training split.
    """
    df = pd.read_csv(filename)

    # Fill missing tweets before the cast: astype(str) alone would turn NaN
    # into the literal word "nan", which would then be counted as a token.
    X = df["tweet"].fillna("").astype(str).str.lower()
    X = X.apply(demojize)  # Convert emojis to text
    X = X.apply(lambda tweet: expand_contractions(tweet, contractions))
    y = df["sarcastic"].values

    # Split BEFORE fitting the vectorizer so the vocabulary is learned from
    # the training rows only; fitting on the full corpus would leak test-set
    # vocabulary into the feature space.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train).toarray()
    # Tokens that appear only in the test rows are not in the vocabulary and
    # are therefore ignored by transform().
    X_test = vectorizer.transform(X_test).toarray()

    return X_train, X_test, y_train, y_test, vectorizer

def expand_contractions(text, contractions_dict):
    """Replace contractions in *text* with their expanded forms.

    Matching is case-sensitive, happens in a single pass, and only fires on
    whole tokens (a key is not matched inside a longer word). Both the ASCII
    apostrophe and the typographic apostrophe (U+2019) are accepted where a
    key contains "'".

    Args:
        text: A string, or a NumPy array whose string elements are expanded
            element-wise. Any other value is returned unchanged.
        contractions_dict: Mapping of contraction -> expansion.

    Returns:
        The expanded string for a string input, a new object-dtype array for
        an array input, or *text* itself for any other type.
    """
    import re  # local import: the block does not rely on a file-level one

    if not isinstance(text, (str, np.ndarray)):
        return text

    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "can't've" has to be tried before its prefix "can't".
    ordered_keys = sorted((key for key in contractions_dict if key), key=len, reverse=True)

    pattern = None
    if ordered_keys:
        # Wherever a key has "'", also accept the curly apostrophe.
        alternatives = (
            "['\u2019]".join(re.escape(part) for part in key.split("'"))
            for key in ordered_keys
        )
        # The look-arounds stop a key from matching inside a longer word
        # (e.g. "it's" inside "wit's").
        pattern = re.compile(r"(?<!\w)(?:" + "|".join(alternatives) + r")(?!\w)")

    def _substitute(match):
        found = match.group(0)
        if found in contractions_dict:
            return contractions_dict[found]
        # The text used a curly apostrophe; look up the ASCII spelling and
        # leave the text untouched if that is not a key either.
        return contractions_dict.get(found.replace("\u2019", "'"), found)

    def _expand_one(value):
        # Non-string array elements (e.g. NaN) pass through unchanged.
        if pattern is None or not isinstance(value, str):
            return value
        return pattern.sub(_substitute, value)

    if isinstance(text, str):
        return _expand_one(text)

    # Array input: apply element-wise, reusing the pattern compiled above.
    return np.vectorize(_expand_one, otypes=[object])(text)

# Define the LSTM model
def create_model(input_shape, hidden_units):
    """Build a single-timestep LSTM binary classifier.

    Args:
        input_shape: Shape of one sample without the batch dimension; its
            first entry is used as the feature count.
        hidden_units: Number of units in the LSTM layer.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model that outputs one
        probability per sample. Compile it with a loss that expects
        probabilities (e.g. ``'binary_crossentropy'``), not logits.
    """
    model = tf.keras.Sequential([
        # Each flat feature vector becomes a sequence of length 1, so the
        # LSTM processes a single timestep per sample.
        layers.Reshape((1, input_shape[0]), input_shape=input_shape),
        layers.LSTM(hidden_units),
        # Sigmoid keeps the output in (0, 1). A linear output here would be
        # misread as a probability by binary cross-entropy, whose default is
        # from_logits=False.
        layers.Dense(1, activation='sigmoid'),
    ])
    return model

# Load the CSV and build the dense train/test count matrices, label arrays,
# and the fitted vectorizer.
X_train, X_test, y_train, y_test, vectorizer = load_data(train_file)

# Build the model for the per-sample feature shape, then attach the Adam
# optimizer, the binary cross-entropy loss, and the accuracy metric.
# NOTE(review): the string loss 'binary_crossentropy' treats model outputs as
# probabilities, so the model's final layer is expected to emit values in [0, 1].
model = create_model(input_shape=X_train.shape[1:], hidden_units=128)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for 10 epochs with mini-batches of 32 samples.
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate on the held-out split; with one metric configured, evaluate()
# returns the loss followed by the accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Save the trained model to a path relative to the current working directory.
# NOTE(review): the fitted vectorizer is not persisted in this section; later
# inference would need the same vocabulary - confirm it is saved elsewhere.
model.save("lstm_model.h5")
