In [None]:
import pandas as pd

Loading training data

In [None]:
# Read the labelled training set from the mounted Google Drive folder.
train = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/train.csv')

Filling missing values

In [None]:
# Count the null entries in each column (summing the boolean mask down the rows).
missing_values = train.isnull().sum(axis=0)
print(missing_values)

id             0
keyword       61
place       2533
tweet          0
disaster       0
dtype: int64


In [None]:
# Replace missing keyword/place entries with explicit placeholder labels.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on train[col]: that form mutates a temporary
# Series, which raises a FutureWarning in pandas 2.x and leaves `train`
# unchanged once Copy-on-Write is enabled.
train['keyword'] = train['keyword'].fillna('missing_keyword')
train['place'] = train['place'].fillna('unknown')

In [None]:
# Re-count nulls per column to confirm nothing is left unfilled.
missing_values = train.isnull().sum(axis=0)
print(missing_values)

id          0
keyword     0
place       0
tweet       0
disaster    0
dtype: int64


Define stopwords and preprocessing functions

In [None]:
# Lower-case English stopwords, kept as an ordered list.
# Contractions are stored without apostrophes (e.g. "shes", "youll"), and
# single-letter fragments such as "s", "t", "ll" are included as well.
stopwordlist = """
    a about above after again ain all am an
    and any are as at be because been before
    being below between both by can d did do
    does doing down during each few for from
    further had has have having he her here
    hers herself him himself his how i if in
    into is it its itself just ll m ma
    me more most my myself now o of on once
    only or other our ours ourselves out own re
    s same she shes should shouldve so some such
    t than that thatll the their theirs them
    themselves then there these they this those
    through to too under until up ve very was
    we were what when where which while who whom
    why will with won y you youd youll youre
    youve your yours yourself yourselves
""".split()

In [None]:
# Set form of the stopword list, used for membership tests.
STOPWORDS = {*stopwordlist}

In [None]:
def cleaning_stopwords(tweet):
    """Drop stopwords from a tweet and re-join the remaining words.

    The input is coerced with ``str`` and split on whitespace. A word is
    dropped when its lower-cased form is in ``STOPWORDS``; the words that
    are kept retain their original casing.

    Args:
        tweet: The tweet text (any value; it is converted with ``str``).

    Returns:
        The kept words joined by single spaces.
    """
    # STOPWORDS holds lower-case entries only, so compare case-insensitively;
    # otherwise capitalised stopwords such as "Our" or "Just" slip through.
    kept_words = [
        word for word in str(tweet).split()
        if word.lower() not in STOPWORDS
    ]
    return " ".join(kept_words)

In [None]:
# Show the first tweet as a quick sanity check of the text column.
train.iloc[0]['tweet']

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [None]:
import string
def cleaning_punctuations(tweet):
    """Return ``tweet`` with every character in ``string.punctuation`` removed."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    stripped = tweet.translate(deletion_table)
    return stripped

In [None]:
import re
def cleaning_repeating_char(tweet):
    """Squeeze exaggerated character runs without damaging normal spelling.

    Any character repeated three or more times in a row is reduced to two
    occurrences ("soooo" -> "soo"). Runs of exactly two are left untouched,
    so ordinary double letters ("happened", "terrible") survive and can
    still be matched by dictionary-based steps such as lemmatization.

    Args:
        tweet: The tweet text.

    Returns:
        The text with runs of 3+ identical characters shortened to 2.
    """
    # \1{2,} requires at least two further copies of the captured character,
    # i.e. a run of three or more in total.
    return re.sub(r'(.)\1{2,}', r'\1\1', tweet)

In [None]:
def cleaning_URLs(tweet):
    """Replace every URL in ``tweet`` with a single space.

    A URL is a run of non-whitespace characters that starts with ``www.``,
    ``http://`` or ``https://``.

    Args:
        tweet: The tweet text.

    Returns:
        The text with each matched URL replaced by ``' '``.
    """
    # Raw string: "\." and "\S" are regex escapes, not string escapes, and a
    # non-raw literal triggers an invalid-escape-sequence warning.
    return re.sub(r'(?:www\.|https?://)\S+', ' ', tweet)

In [None]:
def cleaning_numbers(tweet):
    """Return ``tweet`` with every run of ASCII digits (0-9) deleted."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', tweet)

Apply preprocessing to tweets

In [None]:
# Clean the tweet text in place, one step after another.
# URL stripping has to run first: punctuation removal deletes the '.', ':'
# and '/' characters that the URL pattern depends on, so a later URL pass
# would never match and links would stay in the text as junk tokens.
train['tweet'] = (
    train['tweet']
    .apply(cleaning_URLs)
    .apply(cleaning_stopwords)
    .apply(cleaning_punctuations)
    .apply(cleaning_repeating_char)
    .apply(cleaning_numbers)
)

Lemmatization

In [None]:
import nltk

In [None]:
# Fetch the NLTK resources requested for this notebook: the averaged-perceptron
# POS tagger, the punkt tokenizer models and the WordNet corpus.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Single shared WordNet lemmatizer instance, reused for every token.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased
    first letter of the resulting tag selects adjective (J), noun (N),
    verb (V) or adverb (R). Any other tag falls back to noun.
    """
    wordnet = nltk.corpus.wordnet
    penn_tag = nltk.pos_tag([word])[0][1]
    initial = penn_tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun.
    return wordnet.NOUN

In [None]:
def lemmatize_words(text):
    """Tokenize ``text``, lemmatize each token, and join with single spaces.

    Tokens come from ``nltk.word_tokenize``; each one is lemmatized with the
    POS returned by ``get_wordnet_pos`` for that token.
    """
    lemmas = []
    for token in nltk.word_tokenize(text):
        wordnet_pos = get_wordnet_pos(token)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return ' '.join(lemmas)

In [None]:
# Store the lemmatized form of each cleaned tweet in its own column.
train['lemmatized_tweet'] = train['tweet'].apply(func=lemmatize_words)

Define preprocessor and pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [None]:
# TF-IDF features are built from the 'lemmatized_tweet' column (English
# stopwords removed, vocabulary capped at 10,000 terms); any other columns
# passed in are forwarded unchanged.
preprocessor = ColumnTransformer(
    [('text', TfidfVectorizer(max_features=10000, stop_words='english'), 'lemmatized_tweet')],
    remainder='passthrough',
)

# Vectorise, then classify with logistic regression (up to 1000 iterations).
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

Split the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# X stays a one-column DataFrame because the preprocessor selects its input
# by column name; y is the 'disaster' label column.
X = train.loc[:, ['lemmatized_tweet']]
y = train.loc[:, 'disaster']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

Train the model

In [None]:
# Fit the vectoriser and the classifier on the training split.
pipeline.fit(X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score

Evaluate the model

In [None]:
# Accuracy on the data the model was fitted on.
X_train_pred = pipeline.predict(X_train)
training_data_accuracy = accuracy_score(y_true=y_train, y_pred=X_train_pred)

# Accuracy on the held-out split.
X_test_pred = pipeline.predict(X_test)
testing_data_accuracy = accuracy_score(y_true=y_test, y_pred=X_test_pred)

print("Training Accuracy:", training_data_accuracy)
print("Testing Accuracy:", testing_data_accuracy)

Training Accuracy: 0.8827586206896552
Testing Accuracy: 0.8069599474720945


Save the model

In [None]:
import pickle

In [None]:
# Serialise the pipeline to disk so it can be reloaded for prediction.
with open('disaster_prediction_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(pipeline))

Load the model

In [None]:
import pickle
import pandas as pd

# Restore the pickled pipeline written by this notebook.
with open('disaster_prediction_model.pkl', 'rb') as model_file:
    best_model = pickle.loads(model_file.read())

Load test data

In [None]:
# Read the test set from the mounted Google Drive folder.
test = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/test.csv')

Applying the same preprocessing steps to the test data

In [None]:
# Fill missing keyword/place values with placeholder labels. Assign the
# result back instead of using fillna(..., inplace=True) on test[col], which
# mutates a temporary Series and is not reliable across pandas versions.
test['keyword'] = test['keyword'].fillna('missing_keyword')
test['place'] = test['place'].fillna('unknown')

# Clean the tweet text. URL stripping runs first because punctuation removal
# deletes the '.', ':' and '/' characters the URL pattern relies on.
test['tweet'] = (
    test['tweet']
    .apply(cleaning_URLs)
    .apply(cleaning_stopwords)
    .apply(cleaning_punctuations)
    .apply(cleaning_repeating_char)
    .apply(cleaning_numbers)
)

# Lemmatize the cleaned text into the column the model reads.
test['lemmatized_tweet'] = test['tweet'].apply(lemmatize_words)

Make predictions

In [None]:
# Predict from a one-column DataFrame holding the lemmatized text.
predictions = best_model.predict(test.loc[:, ['lemmatized_tweet']])

Add predictions to the DataFrame and save to a new CSV

In [None]:
# Attach the predicted label to each test row and write the frame to CSV
# without the index column.
test['disaster'] = predictions
test.to_csv(path_or_buf='predicted_disasters.csv', index=False)

# NOTE(review): this prints the whole frame; a short preview may be enough.
print(test)

         id          keyword    place  \
0         0  missing_keyword  unknown   
1         2  missing_keyword  unknown   
2         3  missing_keyword  unknown   
3         9  missing_keyword  unknown   
4        11  missing_keyword  unknown   
...     ...              ...      ...   
3258  10861  missing_keyword  unknown   
3259  10865  missing_keyword  unknown   
3260  10868  missing_keyword  unknown   
3261  10874  missing_keyword  unknown   
3262  10875  missing_keyword  unknown   

                                                  tweet  \
0                        Just hapened terible car crash   
1     Heard earthquake diferent cities stay safe eve...   
2     forest fire spot pond gese fleing acros stret ...   
3                 Apocalypse lighting Spokane wildfires   
4                    Typhon Soudelor kils  China Taiwan   
...                                                 ...   
3258  EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...   
3259  Storm RI worse last huricane 