# ***Importing Dependencies***

In [137]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.linear_model import LogisticRegression     # Logistic Regression
from sklearn.svm import SVC                             # Support Vector Machine Algorithm
from sklearn.ensemble import RandomForestClassifier     # Random Forest Classifier
from sklearn.neighbors import KNeighborsClassifier      # K-Nearest Classifier
from sklearn.naive_bayes import GaussianNB              # Naives Bayes Classifier
from sklearn.tree import DecisionTreeClassifier         # Decision Tree Classifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score

# ***Text Preprocessing***

## _Removing URL, Special Characters, Lowering text, Punctuation_

In [138]:
#
# Removing URL, Special Characters, Lowering text, Punctuation, Stopwords remove
import re #regular expression
import string

def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers,
    Removing URL's, hashtags, mentions, and special characters.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub("[0-9" "]+"," ",text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\@\w+|\#", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text

clean = lambda x: clean_text(x)

In [139]:
df=pd.read_csv("tweet.csv")
df['tweets']=df.tweets.apply(clean)
df['class'].unique()

array(['figurative', 'irony', 'regular', 'sarcasm'], dtype=object)

## _Remove Stopwords_

In [140]:
# Remove Stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['tweets'] = df['tweets'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['tweets']

0        aware dirty step get money staylight staywhite...
1             sarcasm people dont understand diy artattack
2        iminworkjeremy medsingle dailymail readers sen...
3                      wilw get feeling like games sarcasm
4        teacherarthurg rweingarten probably missed tex...
                               ...                        
81403    photo image via heart childhood cool funny sar...
81404    never knewi better put universe lolmaybe there...
81405    hey wanted say thanks puberty letting apart it...
81406    im sure coverage like fox news special hidden ...
81407                           wont believe see p sarcasm
Name: tweets, Length: 81408, dtype: object

## *Lemmatization*

In [141]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Function to apply Porter stemming to a single word
def apply_lemmatization(word):
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

# Apply lemmatization to the 'text' column of the DataFrame
df['tweets'] = df['tweets'].apply(lambda x: ' '.join([apply_lemmatization(word) for word in word_tokenize(x)]))


_As our Classification is Multi Class classification. The Categorical Class unique replaced with 0,1,2,3_

In [142]:
# replacing for categorical values in class figurative=0, irony=1, regular=2, sarcasm=3
df['class'].replace(['figurative', 'irony', 'regular', 'sarcasm'],[0, 1, 2, 3], inplace=True)
df['class'].unique()

array([0, 1, 2, 3], dtype=int64)

# **Word Embedding With Genism**

## **Pretrained Glove Word2Vec Model**

In [143]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

# Load pre-trained Word2Vec model (GloVe in this case)
glove_model_path = 'glove.6B.100d.txt'  # Provide the path to your GloVe model file
word2vec_output = "word2vec.txt"
glove2word2vec(glove_model_path, word2vec_output)

# Model
glove_model = KeyedVectors.load_word2vec_format(word2vec_output, binary=False)



  glove2word2vec(glove_model_path, word2vec_output)


_Removing of tweets that are not in Glove Vocubulary_

In [144]:
from nltk.tokenize import word_tokenize
# Assuming tweets need to be tokenized
df_tweets = df['tweets'].apply(word_tokenize)

features = []
removed_tweets = []
removed_indices = []

for i, tweet_tokens in enumerate(df_tweets):
    # Preprocess tweet: remove out-of-vocabulary words
    preprocessed_tweet = [word for word in tweet_tokens if word in glove_model.key_to_index]
    if preprocessed_tweet:
        tweet_embedding = [glove_model.get_vector(word) for word in preprocessed_tweet]
        tweet_embedding = np.mean(tweet_embedding, axis=0)  # Average word vectors
        features.append(tweet_embedding)
    else:
        removed_tweets.append(df['tweets'][i])
        removed_indices.append(i)

# Convert the feature list to a numpy array
features = np.array(features)

# Create a DataFrame for removed tweets and indices
removed_df = pd.DataFrame({'Removed_Tweets': removed_tweets, 'Indices': removed_indices})

# Print the number of removed tweets
print(f"Number of removed tweets: {len(removed_tweets)}")

Number of removed tweets: 11


### **Vectorization using GloVe**

In [145]:
# Defining Dependant and Inde
X = df['tweets']
y = df['class']

# Tokenize tweets
sentences = [nltk.word_tokenize(tweet) for tweet in X]

# Vectorization using GloVe
def average_word_vectors(tokens, model, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in tokens:
        if word in model:
            nwords += 1
            feature_vector = np.add(feature_vector, model[word][:num_features])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

X_vec = [average_word_vectors(tokens, glove_model, 100) for tokens in sentences]

X_vec = np.array(X_vec)

# Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



## **Classification Algorithms Applying**

### Logistic Regression Algorithm

In [146]:
# Logistic Regression Classifier
logreg_classifier = LogisticRegression(max_iter=10000)

# Train the model
logreg_classifier.fit(X_train_scaled, y_train)

# Predictions
y_pred = logreg_classifier.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy of Logistic Regression is: {accuracy*100}")
print("Classification Report of Logistic Regression:\n", report)

Accuracy of Logistic Regression is: 63.19248249600786
Classification Report of Logistic Regression:
               precision    recall  f1-score   support

           0       0.33      0.23      0.27      4179
           1       0.64      0.73      0.68      4276
           2       0.86      0.91      0.88      3696
           3       0.63      0.70      0.66      4131

    accuracy                           0.63     16282
   macro avg       0.61      0.64      0.62     16282
weighted avg       0.60      0.63      0.61     16282



### Support Vector Classfier Algorithm

In [147]:
# SVM Classifier
svm_classifier = SVC(kernel='linear')
# Train the model
svm_classifier.fit(X_train_scaled, y_train)
# Predictions
y_pred = svm_classifier.predict(X_test_scaled)
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy of Support Vector Classifier is: {accuracy*100}")
print("Classification Report Support Vector Classifier:\n", report)


Accuracy of Support Vector Classifier is: 65.02272448102198
Classification Report Support Vector Classifier:
               precision    recall  f1-score   support

           0       0.32      0.12      0.18      4179
           1       0.62      0.82      0.71      4276
           2       0.87      0.90      0.88      3696
           3       0.62      0.79      0.69      4131

    accuracy                           0.65     16282
   macro avg       0.61      0.66      0.62     16282
weighted avg       0.60      0.65      0.61     16282



### Random Forest Classifier Algorithm

In [148]:
#Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train_scaled, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy of Random Forest Classifier: {accuracy*100}")
print("Classification Report of Random Forest Classifier:\n", report)


Accuracy of Random Forest Classifier: 49.50865986979486
Classification Report of Random Forest Classifier:
               precision    recall  f1-score   support

           0       0.08      0.07      0.07      4179
           1       0.45      0.48      0.47      4276
           2       0.86      0.90      0.88      3696
           3       0.52      0.59      0.55      4131

    accuracy                           0.50     16282
   macro avg       0.48      0.51      0.49     16282
weighted avg       0.47      0.50      0.48     16282



### Decision Tree Classifier Algorithm

In [149]:
# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the model
dt_classifier.fit(X_train_scaled, y_train)

# Predictions
y_pred = dt_classifier.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy of Decision Tree Classifier: {accuracy*100}")
print("Classification Report Decision Tree Classifier:\n", report)


Accuracy of Decision Tree Classifier: 34.332391598083774
Classification Report Decision Tree Classifier:
               precision    recall  f1-score   support

           0       0.13      0.17      0.15      4179
           1       0.28      0.24      0.26      4276
           2       0.74      0.69      0.71      3696
           3       0.36      0.32      0.34      4131

    accuracy                           0.34     16282
   macro avg       0.38      0.35      0.36     16282
weighted avg       0.37      0.34      0.35     16282



### K-Nearest Classifier

In [150]:
# K-Nearest Neighbors (KNN) Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_classifier.fit(X_train_scaled, y_train)

# Predictions
y_pred = knn_classifier.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy of K-Nearest Classifier: {accuracy*100}")
print("Classification Report of K-Nearest Classifier:\n", report)


Accuracy of K-Nearest Classifier: 47.33447979363714
Classification Report of K-Nearest Classifier:
               precision    recall  f1-score   support

           0       0.23      0.30      0.26      4179
           1       0.44      0.40      0.42      4276
           2       0.93      0.73      0.82      3696
           3       0.50      0.50      0.50      4131

    accuracy                           0.47     16282
   macro avg       0.52      0.48      0.50     16282
weighted avg       0.51      0.47      0.49     16282



### Naives Bayes Classifier

In [151]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes Classifier
nb_classifier = MultinomialNB()

# Train the model
nb_classifier.fit(X_train_scaled, y_train)

# Predictions
y_pred = nb_classifier.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy of Naives Bayes Classifier: {accuracy*100}")
print("Classification Report Naives Bayes Classifier:\n", report)


Accuracy of Naives Bayes Classifier: 46.26581501044098
Classification Report Naives Bayes Classifier:
               precision    recall  f1-score   support

           0       0.29      0.67      0.41      4179
           1       0.55      0.30      0.39      4276
           2       0.88      0.58      0.70      3696
           3       0.65      0.32      0.42      4131

    accuracy                           0.46     16282
   macro avg       0.59      0.47      0.48     16282
weighted avg       0.58      0.46      0.47     16282



### Gaussian Naive Bayes

In [152]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes Classifier
gnb_classifier = GaussianNB()

# Train the model
gnb_classifier.fit(X_train_scaled, y_train)

# Predictions
y_pred = gnb_classifier.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy of Gaussian Naive Bayes: {accuracy*100}")
print("Classification Report of Gaussian Naive Bayes:\n", report)


Accuracy of Gaussian Naive Bayes: 55.49072595504237
Classification Report of Gaussian Naive Bayes:
               precision    recall  f1-score   support

           0       0.30      0.21      0.25      4179
           1       0.55      0.57      0.56      4276
           2       0.71      0.79      0.75      3696
           3       0.57      0.68      0.62      4131

    accuracy                           0.55     16282
   macro avg       0.53      0.56      0.54     16282
weighted avg       0.53      0.55      0.54     16282



# **Artificial Neural Network**

_**Model building By using ANN kears(sequencial).**_

In [153]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert labels to one-hot encoding
y_encoded = to_categorical(y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y_encoded, test_size=0.2, random_state=42)

# Convert labels to one-hot encoding
y_encoded = to_categorical(y)

# Define the Neural Network model
model = Sequential()
model.add(Dense(128, input_dim=100, activation='sigmoid'))
model.add(Dense(64, activation='relu'))
model.add(Dense(4, activation='softmax'))  # Adjust the number of output nodes based on your classes

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, validation_split=0.1)

# Convert one-hot encoded labels back to original labels
y_test_classes = np.argmax(y_test, axis=1)

# Evaluate the model
_, accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Accuracy: {accuracy}")

# Predictions
y_pred = model.predict(X_test_scaled)
y_pred_classes = np.argmax(y_pred, axis=1)

# Convert one-hot encoded labels back to original labels
y_test_classes = np.argmax(y_test, axis=1)

# Evaluation
accuracy = accuracy_score(y_test_classes, y_pred_classes)
report = classification_report(y_test_classes, y_pred_classes)

print(f"Accuracy of Sequential Model is: {accuracy*100}")
print("Classification Report:\n", report)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 0.6669942140579224
Accuracy of Sequential Model is: 66.69942267534701
Classification Report:
               precision    recall  f1-score   support

           0       0.34      0.08      0.13      4179
           1       0.62      0.85      0.72      4276
           2       0.91      0.91      0.91      3696
           3       0.62      0.85      0.72      4131

    accuracy                           0.67     16282
   macro avg       0.62      0.67      0.62     16282
weighted avg       0.61      0.67      0.61     16282



# Predict using Custom Statement

In [154]:
import nltk
import numpy as np

def preprocess_and_predict(text, model, tokenizer, word_vectorizer):
    # Step 1: Preprocess the Text
    cleaned_text = clean_text(text)
    # Step 2: Tokenize and Vectorize the Text
    tokens = tokenizer(cleaned_text)
    vector = word_vectorizer(tokens, glove_model, 100)  # Adjust the dimensions based on your GloVe model
    vector = np.array(vector).reshape(1, -1)  # Reshape to match the expected input shape (1, 200)
    # Step 3: Make Predictions
    prediction = model.predict(vector)
    predicted_class = np.argmax(prediction)
    class_labels = {0: "Figurative",1: "Irony",2: "Regular",3: "Sarcasm"}
    predicted_label = class_labels.get(predicted_class, "Unknown")
    # Print the predicted label
    print(f"Predicted Class of Tweet: {predicted_label}")
    return predicted_class

# Example Usage
text_input = "people who call me crazy are usually some of the most #brainwashed #addicted to #propaganda #narratives #irony http://t.co/5diHQRA52G"
predicted_class = preprocess_and_predict(text_input, model, nltk.word_tokenize, average_word_vectors)

Predicted Class of Tweet: Irony


# Pickle File For Depoyment

In [155]:
import pickle
pickle.dump(model, open('model.pkl','wb'))
