<a href="https://colab.research.google.com/github/loki20051267/NLP/blob/main/12th_sep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Conv1D, GlobalMaxPooling1D, LSTM, Dropout


df = pd.read_csv("tweets.csv")
print(df.head())


import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'@[A-Za-z0-9_]+','', text)
    text = re.sub(r'#','', text)
    text = re.sub(r'http\S+|www.\S+','', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)

df['clean'] = df['text'].apply(clean_text)

X = df['clean']
y = df['target']


count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(X)

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_test_tfidf)

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)

def evaluate_model(y_true, y_pred, name="Model"):
    print(f"\n{name} Report:")
    print(classification_report(y_true, y_pred, digits=4))

evaluate_model(y_test, y_pred_lr, "Logistic Regression (TF-IDF)")
evaluate_model(y_test, y_pred_svm, "SVM (TF-IDF)")

# =========================
# 6. WORD EMBEDDINGS (Tokenizer + Padding)
# =========================
max_words = 10000
max_len = 50
embedding_dim = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(sequences, maxlen=max_len, padding='post')

X_train_pad, X_test_pad, y_train_pad, y_test_pad = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# =========================
# 7. DEEP LEARNING MODELS
# =========================

# ---- (a) MLP on averaged embeddings ----
mlp = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
mlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
mlp.fit(X_train_pad, y_train_pad, epochs=5, batch_size=32, validation_split=0.2, verbose=1)
mlp_pred = (mlp.predict(X_test_pad) > 0.5).astype(int)
evaluate_model(y_test_pad, mlp_pred, "MLP (Embeddings)")

# ---- (b) 1D CNN ----
cnn = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn.fit(X_train_pad, y_train_pad, epochs=5, batch_size=32, validation_split=0.2, verbose=1)
cnn_pred = (cnn.predict(X_test_pad) > 0.5).astype(int)
evaluate_model(y_test_pad, cnn_pred, "CNN (Embeddings)")

# ---- (c) LSTM ----
lstm = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])
lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm.fit(X_train_pad, y_train_pad, epochs=5, batch_size=32, validation_split=0.2, verbose=1)
lstm_pred = (lstm.predict(X_test_pad) > 0.5).astype(int)
evaluate_model(y_test_pad, lstm_pred, "LSTM (Embeddings)")


   id keyword        location  \
0   0  ablaze             NaN   
1   1  ablaze             NaN   
2   2  ablaze   New York City   
3   3  ablaze  Morgantown, WV   
4   4  ablaze             NaN   

                                                text  target  
0  Communal violence in Bhainsa, Telangana. "Ston...       1  
1  Telangana: Section 144 has been imposed in Bha...       1  
2  Arsonist sets cars ablaze at dealership https:...       1  
3  Arsonist sets cars ablaze at dealership https:...       1  
4  "Lord Jesus, your love brings freedom and pard...       0  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Logistic Regression (TF-IDF) Report:
              precision    recall  f1-score   support

           0     0.8804    0.9920    0.9329      1878
           1     0.9051    0.3611    0.5162       396

    accuracy                         0.8821      2274
   macro avg     0.8927    0.6766    0.7246      2274
weighted avg     0.8847    0.8821    0.8603      2274


SVM (TF-IDF) Report:
              precision    recall  f1-score   support

           0     0.8933    0.9941    0.9410      1878
           1     0.9402    0.4369    0.5966       396

    accuracy                         0.8971      2274
   macro avg     0.9168    0.7155    0.7688      2274
weighted avg     0.9015    0.8971    0.8810      2274

Epoch 1/5




[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.8039 - loss: 0.4748 - val_accuracy: 0.8863 - val_loss: 0.3242
Epoch 2/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9190 - loss: 0.2170 - val_accuracy: 0.8885 - val_loss: 0.3234
Epoch 3/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9772 - loss: 0.1144 - val_accuracy: 0.8857 - val_loss: 0.3915
Epoch 4/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.9926 - loss: 0.0408 - val_accuracy: 0.8802 - val_loss: 0.4619
Epoch 5/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9958 - loss: 0.0205 - val_accuracy: 0.8654 - val_loss: 0.5014
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

MLP (Embeddings) Report:
              precision    recall  f1-score   support

           0     0.9356    0.9207    0.9281   



[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.7941 - loss: 0.4880 - val_accuracy: 0.8863 - val_loss: 0.3051
Epoch 2/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.9355 - loss: 0.1936 - val_accuracy: 0.8901 - val_loss: 0.3148
Epoch 3/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9795 - loss: 0.0755 - val_accuracy: 0.8709 - val_loss: 0.3927
Epoch 4/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.9924 - loss: 0.0294 - val_accuracy: 0.8747 - val_loss: 0.5220
Epoch 5/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.9943 - loss: 0.0194 - val_accuracy: 0.8736 - val_loss: 0.5395
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step

CNN (Embeddings) Report:
              precision    recall  f1-score   support

           0     0.9232    0.9286    0.9259   



[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 140ms/step - accuracy: 0.8083 - loss: 0.5064 - val_accuracy: 0.8121 - val_loss: 0.4849
Epoch 2/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 117ms/step - accuracy: 0.8088 - loss: 0.4901 - val_accuracy: 0.8121 - val_loss: 0.4836
Epoch 3/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 117ms/step - accuracy: 0.8103 - loss: 0.4869 - val_accuracy: 0.8121 - val_loss: 0.4869
Epoch 4/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 118ms/step - accuracy: 0.8143 - loss: 0.4810 - val_accuracy: 0.8121 - val_loss: 0.4853
Epoch 5/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 124ms/step - accuracy: 0.8129 - loss: 0.4838 - val_accuracy: 0.8121 - val_loss: 0.4833
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step

LSTM (Embeddings) Report:
              precision    recall  f1-score   support

           0     0.8259    1.0000 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
