In [40]:
import pandas as pd
import nltk
import spacy
import re
from tqdm import tqdm
import numpy as np
from nltk.corpus import stopwords
nlp=spacy.load('en_core_web_lg')
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split

# Data Cleaning

In [41]:
df=pd.read_csv('train.csv')

In [42]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [43]:
# Keep only the tweet text and its label.
# Reassigning with errors='ignore' (instead of inplace=True) makes this cell
# safe to re-run: a second execution no longer raises KeyError because the
# columns have already been removed.
df = df.drop(columns=['id', 'keyword', 'location'], errors='ignore')

In [44]:
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [45]:
df.isnull().sum()

text      0
target    0
dtype: int64

In [46]:
df['target'].unique()

array([1, 0], dtype=int64)

In [47]:
# Call the raw text column 'tweets' from here on.
df = df.rename(columns={'text': 'tweets'})

In [48]:
# Built once, as a set, instead of calling stopwords.words('english') again
# for every single token of every tweet.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def processed_tweets(tweet):
    """Normalise one raw tweet into a space-joined string of lemmas.

    Steps, in order: lowercase; remove URLs, @mentions and #hashtags
    (the hashtag word is removed too); remove punctuation and digits;
    lemmatise with the notebook-level spaCy pipeline ``nlp``; drop NLTK
    English stopwords and single-character tokens.

    Args:
        tweet: The raw tweet. It is passed through ``str()`` first, so
            non-string values do not raise.

    Returns:
        str: The cleaned tweet. May be an empty string when no token
        survives the filters.
    """
    tweet = str(tweet).lower()
    # The earlier pattern r"http\S+|www|S+|https\S+" had two slips: "www"
    # deleted only those three letters and left the rest of the address in
    # the text, and "S+" (missing backslash) could never match lowercased
    # input. "https\S+" was already covered by "http\S+".
    tweet = re.sub(r"http\S+|www\.\S+", '', tweet)
    tweet = re.sub(r"@\w+|#\w+", '', tweet)
    tweet = re.sub(r"[^\w\s]", '', tweet)
    tweet = re.sub(r"\d+", '', tweet)
    lemmas = (token.lemma_ for token in nlp(tweet))
    kept = [
        lemma for lemma in lemmas
        if lemma not in _ENGLISH_STOPWORDS and len(lemma) > 1
    ]
    return ' '.join(kept)
    
# Clean every tweet, showing a progress bar while the pass runs.
cleaned_rows = []
for raw_tweet in tqdm(df['tweets'], position=0, leave=True):
    cleaned_rows.append(processed_tweets(raw_tweet))
df['clean_tweets'] = cleaned_rows


100%|██████████████████████████████████████████████████████████████████████████████| 7613/7613 [02:01<00:00, 62.75it/s]


In [49]:
# The raw text is no longer needed once 'clean_tweets' exists.
# Reassigning with errors='ignore' keeps the cell re-runnable: the original
# in-place drop raised KeyError on a second execution.
df = df.drop(columns='tweets', errors='ignore')

In [50]:
df.to_csv('C:/Users/User/OneDrive/Desktop/Ai-Disaster-Alert-System/Data/Processed/cleaned_tweets.csv', index=False)

# Performing EDA

In [51]:
# Reload the cleaned tweets from the CSV written by the cell above, so the
# steps below work from the saved file.
# NOTE(review): tweets whose cleaned text is empty presumably come back as
# NaN here (pandas' default NA parsing) - confirm before coercing with str().
# NOTE(review): absolute, machine-specific path; a configurable data
# directory would make the notebook portable.
df=pd.read_csv('C:/Users/User/OneDrive/Desktop/Ai-Disaster-Alert-System/Data/Processed/cleaned_tweets.csv')

In [13]:

#pos_count=len(df[df['target']==1])
#neg_count=len(df[df['target']==0])
#fig=px.pie(values=[pos_count,neg_count],title='Distribution of target',names=['positive','negative'],opacity=.9)
#fig.show()

In [52]:
df.head()


Unnamed: 0,target,clean_tweets
0,1,deed reason may allah forgive
1,1,forest fire near la ronge sask canada
2,1,resident ask shelter place notify officer evac...
3,1,people receive evacuation order california
4,1,get send photo ruby smoke pour school


In [15]:
#plt.figure(figsize=(20,8))
#plt.subplot(1,2,1)
#wordcloud1=WordCloud(width=1200,height=800,max_words=1000,contour_width=2,background_color='white',max_font_size=180,
                     #colormap='viridis').generate(' '.join(df[df['target']==1]['clean_tweets'].dropna().astype(str)))
#plt.imshow(wordcloud1,interpolation='bilinear')
#plt.axis('off')
#plt.title('positive')

#plt.subplot(1,2,2)
#wordcloud2=WordCloud(width=1200,height=800,max_words=1000,contour_width=2,background_color='black',max_font_size=180,
                     #colormap='viridis').generate(' '.join(df[df['target']==0]['clean_tweets'].dropna().astype(str)))
#plt.imshow(wordcloud2,interpolation='bilinear')
#plt.axis('off')
#plt.title('negative')
#plt.tight_layout()
#plt.show()

# Splitting the Dataset

In [53]:
## Preprocessing the Data
from datasets import Dataset

# Tweets that were cleaned down to nothing are read back from the CSV as NaN.
# Replace them with '' before splitting: the previous str() coercion turned
# each of them into the literal word "nan", which the tokenizer would then
# treat as real text.
clean_texts = df['clean_tweets'].fillna('').astype(str).tolist()
targets = df['target'].tolist()

# Stratified 80/20 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    clean_texts,
    targets,
    test_size=0.2,
    stratify=df['target'],
    random_state=42,
)

train_df = pd.DataFrame({'Clean_tweets': x_train, 'target': y_train})
test_df = pd.DataFrame({'Clean_tweets': x_test, 'target': y_test})

# Wrap both splits as Hugging Face datasets for the tokenization step.
x_train_dataset = Dataset.from_pandas(train_df)
x_test_dataset = Dataset.from_pandas(test_df)


In [54]:
## Tokenizing with Hugging Face
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(dataset):
    """Tokenize a batch of tweets and attach their labels.

    Args:
        dataset: A batch from ``Dataset.map(..., batched=True)`` holding the
            'Clean_tweets' and 'target' columns.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        "labels" entry copied from the batch's 'target' column.
    """
    encoded = tokenizer(
        dataset['Clean_tweets'],
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    # The Trainer reads the training target from the "labels" key.
    encoded["labels"] = dataset["target"]
    return encoded


x_train_dataset = x_train_dataset.map(tokenize_function, batched=True)
x_test_dataset = x_test_dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [18]:
## Setting format to pytorch
# Apply the same torch formatting (same three columns) to both splits.
for _split in (x_train_dataset, x_test_dataset):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])








# Building and Training the model

In [19]:
# Fine-tune bert-base-uncased as a 2-class tweet classifier and save the
# final weights to `output_dir`.
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import os

# Output directory for the saved model.
# NOTE(review): absolute, machine-specific Windows path.
output_dir = r"C:\MLTraining\bert_results"
os.makedirs(output_dir, exist_ok=True)

# Logging directory, created as a "logs" sub-folder of output_dir
logging_dir = os.path.join(output_dir, "logs")
os.makedirs(logging_dir, exist_ok=True)

# Load pre-trained BERT with a 2-label classification head (per the recorded
# log below, the classifier weights are newly initialized)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Training arguments: evaluate once per epoch, 3 epochs, no intermediate
# checkpoints, log every 10 steps.
# NOTE(review): in the recorded run the validation loss went
# 0.456 -> 0.422 -> 0.506, i.e. it rose in epoch 3 - consider fewer epochs.
trainingArgs = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="no",  # Disable checkpoint saving
    logging_dir=logging_dir,
    logging_steps=10,
)

# Trainer: trains on the train split and evaluates on the test split
trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=x_train_dataset,
    eval_dataset=x_test_dataset,
)

# Train from scratch (explicitly not resuming from a checkpoint)
trainer.train(resume_from_checkpoint=False)

# Save the final model into output_dir; a later cell reloads it from there
trainer.save_model(output_dir)
print("✅ Training complete. Model saved successfully.")


W0710 06:32:06.972000 10072 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3597,0.45554
2,0.4107,0.422375
3,0.2302,0.506053


✅ Training complete. Model saved successfully.


In [33]:
from transformers import BertForSequenceClassification

# Reload the fine-tuned classifier from the directory that the training cell
# passed to trainer.save_model(), so evaluation can run without retraining.
model = BertForSequenceClassification.from_pretrained(r"C:\MLTraining\bert_results")


W0710 11:40:34.203000 20064 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [56]:
from transformers import TrainingArguments, Trainer

# Inference-only Trainer: in the cells shown it is used solely for
# trainer.predict(), so only the arguments that matter for prediction remain.
# The copied training settings (epochs, weight decay, train batch size, ...)
# had no effect here, and evaluation_strategy="epoch" without an eval_dataset
# is rejected with a ValueError by recent transformers releases.
trainingArgs = TrainingArguments(
    output_dir=r"C:\Users\User\OneDrive\Desktop\Ai-Disaster-Alert-System\Models\MLTraining",
    per_device_eval_batch_size=64,
    logging_dir=r"C:\Users\User\OneDrive\Desktop\Ai-Disaster-Alert-System\Models\MLTraining\logs",
)

trainer = Trainer(
    model=model,
    args=trainingArgs,
)


In [57]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import torch

# Step 1: Predict on x_test_dataset
predictions = trainer.predict(x_test_dataset)

# Step 2: Predicted class = index of the largest logit
y_pred = np.argmax(predictions.predictions, axis=1)

# Step 3: True labels. Prefer the labels returned with the predictions: they
# come from the same batches as the logits, so they cannot drift out of sync
# with a guessed column name or the dataset's current format. Fall back to
# the raw column only if the Trainer returned no labels.
y_true = predictions.label_ids
if y_true is None:
    y_true = x_test_dataset["target"]
y_true = np.asarray(y_true)

# Step 4: Evaluation
print("🔍 Classification Report:")
print(classification_report(y_true, y_pred))

print("📊 Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

acc = accuracy_score(y_true, y_pred)
print(f"✅ Accuracy: {acc * 100:.2f}%")




🔍 Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       869
           1       0.81      0.77      0.79       654

    accuracy                           0.83      1523
   macro avg       0.82      0.82      0.82      1523
weighted avg       0.82      0.83      0.82      1523

📊 Confusion Matrix:
[[751 118]
 [148 506]]
✅ Accuracy: 82.53%


In [58]:
!pip install fpdf


Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py): started
  Building wheel for fpdf (setup.py): finished with status 'done'
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40712 sha256=06b2d0adbebd052073d4363ad77f1d496875245268fb64a3e7c80a836f06f0a9
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\65\4f\66\bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


  DEPRECATION: Building 'fpdf' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'fpdf'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [60]:
from fpdf import FPDF
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
# Imported here as well so this cell does not rely on the evaluation cell's
# imports still being in the kernel.
from sklearn.metrics import classification_report, confusion_matrix

# === Create output dir for report ===
# NOTE(review): absolute, machine-specific path.
report_dir = r"C:\Users\User\OneDrive\Desktop\Ai-Disaster-Alert-System\Reports"
os.makedirs(report_dir, exist_ok=True)

# === Save confusion matrix image ===
conf_matrix = confusion_matrix(y_true, y_pred)
conf_df = pd.DataFrame(conf_matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
plt.figure(figsize=(6, 5))
sns.heatmap(conf_df, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
conf_path = os.path.join(report_dir, "conf_matrix.png")
plt.savefig(conf_path)
plt.close()  # release the figure; it is only needed as a file

# === Prepare PDF ===
pdf = FPDF()
pdf.add_page()

# Title
# Width 0 means "extend to the right margin". The previous fixed width of
# 200 ran past the margin, which also pushed the centred title off-centre.
pdf.set_font("Arial", "B", 16)
pdf.cell(0, 10, "AI Disaster Alert System - Model Evaluation Report", ln=True, align='C')

# Accuracy
pdf.ln(10)
pdf.set_font("Arial", "", 12)
pdf.cell(0, 10, f"Accuracy: {acc * 100:.2f}%", ln=True)

# Classification Report
report_text = classification_report(y_true, y_pred)
pdf.ln(10)
pdf.set_font("Arial", "B", 14)
pdf.cell(0, 10, "Classification Report:", ln=True)
pdf.set_font("Courier", "", 10)
for line in report_text.splitlines():
    # Only trailing whitespace is removed: the report is a fixed-width table,
    # and strip() used to delete the leading spaces that line the header and
    # rows up under the monospace font.
    # Characters outside latin-1 are dropped (encode with "ignore").
    printable = line.rstrip().encode("latin-1", "ignore").decode("latin-1")
    pdf.cell(0, 6, printable, ln=True)

# Confusion Matrix Image
pdf.add_page()
pdf.set_font("Arial", "B", 14)
pdf.cell(0, 10, "Confusion Matrix:", ln=True)
pdf.image(conf_path, x=40, y=30, w=130)

# Save PDF
pdf_path = os.path.join(report_dir, "model_evaluation_report.pdf")
pdf.output(pdf_path)

print(f"📄 PDF saved to: {pdf_path}")


📄 PDF saved to: C:\Users\User\OneDrive\Desktop\Ai-Disaster-Alert-System\Reports\model_evaluation_report.pdf
