# Disaster Tweet Analyzer: NLP for Crisis Communication
This notebook focuses on preprocessing the dataset of disaster-related tweets to clean up the text data for further analysis.

In [1]:
# Import necessary libraries
import html
import re

import pandas as pd

In [2]:
# Load the dataset
# file_path is the single place the dataset location is defined; replace it with
# your dataset file path. NOTE: the value below is a machine-specific absolute path.
file_path = '/Users/rithvik/Downloads/infosys/tweets.csv'
tweets_df = pd.read_csv(file_path)

# Display the first 1134 rows of the dataset (pandas abbreviates long frames when rendering)
tweets_df.head(1134)

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0
...,...,...,...,...,...
1129,1129,blizzard,,"ill drink iced coffee in a snow blizzard, idc lol",0
1130,1130,blizzard,젤나가..맙소사..,Overwatch Toxicity and Cheating Sanction Janua...,0
1131,1131,blizzard,"New York, USA",Blizzard is pushing World of Warcraft into the...,0
1132,1132,blizzard,Australia,Found that out when it was still on the blizza...,0


In [3]:
# Count the nulls in every column so gaps are visible before any cleaning
missing_values = tweets_df.isnull().sum()
print("Missing values per column:\n", missing_values)

# Fill missing text values with an empty string (if any).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the column selection: that chained in-place form
# emits a FutureWarning on pandas 2.x and, with Copy-on-Write enabled, updates
# only a temporary object and leaves tweets_df unchanged.
tweets_df['text'] = tweets_df['text'].fillna('')

Missing values per column:
 id             0
keyword        0
location    3418
text           0
target         0
dtype: int64


In [6]:
# Improved text cleanup function
def clean_tweet(text):
    """Normalize a raw tweet into lowercase, letters-only text.

    Steps, in order: decode HTML entities, drop URLs, drop every character
    that is not an ASCII letter or whitespace, lowercase, and collapse
    whitespace runs into single spaces.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example
            ``None`` or a NaN left by a missing value) is treated as empty.

    Returns:
        The cleaned tweet as a ``str``; ``''`` when nothing survives cleaning.
    """
    # Missing values reach this function as None/NaN; re.sub would raise on them
    if not isinstance(text, str):
        return ''
    # Decode entities first so '&amp;' becomes '&' (dropped below) instead of
    # leaving a stray 'amp' token in the cleaned text
    text = html.unescape(text)
    # Remove URLs; the word boundary and the literal dot after 'www' keep
    # ordinary words such as 'awwww' from being eaten by the pattern
    text = re.sub(r'\bhttp\S+|\bwww\.\S+', '', text)
    # Remove all non-alphabetic characters and numbers, keeping only words
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Convert text to lowercase for uniformity
    text = text.lower()
    # Collapse internal whitespace runs (left behind by removed URLs, digits
    # and punctuation) and trim both ends in one pass
    return ' '.join(text.split())

In [5]:
# Run clean_tweet over every tweet and store the result in a new column
tweets_df['cleaned_text'] = tweets_df['text'].map(clean_tweet)

# Print the first 15 raw/cleaned pairs for a side-by-side comparison
print(tweets_df.loc[:, ['text', 'cleaned_text']].head(15))

# Write the frame, including the new column, to CSV without the index
tweets_df.to_csv(path_or_buf='cleaned_disaster_tweets.csv', index=False)

                                                 text  \
0   Communal violence in Bhainsa, Telangana. "Ston...   
1   Telangana: Section 144 has been imposed in Bha...   
2   Arsonist sets cars ablaze at dealership https:...   
3   Arsonist sets cars ablaze at dealership https:...   
4   "Lord Jesus, your love brings freedom and pard...   
5   If this child was Chinese, this tweet would ha...   
6   Several houses have been set ablaze in Ngemsib...   
7   Asansol: A BJP office in Salanpur village was ...   
8   National Security Minister, Kan Dapaah's side ...   
9   This creature who’s soul is no longer clarent ...   
10  Images showing the havoc caused by the #Camero...   
11  Social media went bananas after Chuba Hubbard ...   
12  Hausa youths set Area Office of Apapa-Iganmu L...   
13  Under #MamataBanerjee political violence &amp;...   
14  AMEN! Set the whole system ablaze, man. https:...   

                                         cleaned_text  
0   communal violence in bhains