In [2]:
import pandas as pd

### Load Dataset

In [3]:
eq_dataset = pd.read_csv("tweets.csv")
eq_dataset.head()

  eq_dataset = pd.read_csv("tweets.csv")


Unnamed: 0,date,content,hashtags,like_count,rt_count,followers_count,isVerified,language,coordinates,place,source
0,2023-02-21 03:30:04+00:00,तुर्की में सोमवार देर रात भूंकप के तेज झटके मह...,"['ATDigital', 'Turkey', 'Earthquake', 'TurkeyE...",0.0,0.0,19727712.0,True,hi,,,Twitter Media Studio
1,2023-02-21 03:29:07+00:00,New search &amp; rescue work is in progress in...,"['Hatay', 'earthquakes', 'Türkiye', 'TurkiyeQu...",1.0,0.0,5697.0,True,en,,,Twitter Web App
2,2023-02-21 03:29:04+00:00,Can't imagine those who still haven't recovere...,"['Turkey', 'earthquake', 'turkeyearthquake2023...",0.0,0.0,1.0,False,en,,,Twitter for Android
3,2023-02-21 03:28:06+00:00,its a highkey sign for all of us to ponder ove...,"['turkeyearthquake2023', 'earthquake', 'Syria']",0.0,0.0,3.0,False,en,,,Twitter for Android
4,2023-02-21 03:27:38+00:00,Turkiye Earthquake: तुर्किए में फिर आया भूकंप ...,"['turkey', 'earthquake', 'turkiye', 'india', '...",0.0,0.0,17.0,False,und,,,Twitter for Android


In [4]:
eq_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478052 entries, 0 to 478051
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             478052 non-null  object 
 1   content          478052 non-null  object 
 2   hashtags         478034 non-null  object 
 3   like_count       478035 non-null  float64
 4   rt_count         478035 non-null  float64
 5   followers_count  478035 non-null  float64
 6   isVerified       478035 non-null  object 
 7   language         478035 non-null  object 
 8   coordinates      20669 non-null   object 
 9   place            20196 non-null   object 
 10  source           478035 non-null  object 
dtypes: float64(3), object(8)
memory usage: 40.1+ MB


### Data Cleaning and Processing

In [6]:
eq_dataset = eq_dataset[eq_dataset['language'] == 'en']

In [7]:
eq_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 189626 entries, 1 to 478051
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             189626 non-null  object 
 1   content          189626 non-null  object 
 2   hashtags         189626 non-null  object 
 3   like_count       189626 non-null  float64
 4   rt_count         189626 non-null  float64
 5   followers_count  189626 non-null  float64
 6   isVerified       189626 non-null  object 
 7   language         189626 non-null  object 
 8   coordinates      9862 non-null    object 
 9   place            9509 non-null    object 
 10  source           189626 non-null  object 
dtypes: float64(3), object(8)
memory usage: 17.4+ MB


After filtering english only tweets there's ~190,000 tweets left. We will only need the tweets info from the columns `date` and `content`.

In [66]:
df = eq_dataset.loc[:, ['date', 'content']]
df.head()

Unnamed: 0,date,content
1,2023-02-21 03:29:07+00:00,New search &amp; rescue work is in progress in...
2,2023-02-21 03:29:04+00:00,Can't imagine those who still haven't recovere...
3,2023-02-21 03:28:06+00:00,its a highkey sign for all of us to ponder ove...
5,2023-02-21 03:27:27+00:00,"See how strong was the #Earthquake of Feb 20, ..."
6,2023-02-21 03:27:11+00:00,More difficult news today on top of struggles ...


In [67]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

###
# Remove URLs
def remove_urls(text):
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.sub(r'', text)

df['content'] = df['content'].apply(remove_urls)

###
# Lowercasing
df['content'] = df['content'].str.lower()

# Remove punctuations
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['content'] = df['content'].apply(remove_punctuation)

###
# Tokenization
df['content'] = df['content'].astype(str)
df['content'] = df['content'].apply(word_tokenize)

###
# Remove stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

df['content'] = df['content'].apply(remove_stopwords)

# Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(word) for word in text]

df['content'] = df['content'].apply(lemmatize_text)

### Text Vectorization

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

vectorizer = joblib.load('vectorizer.pk1')

# Join text back into a single string
# Vectorize text
df['content'] = df['content'].apply(" ".join)
X = vectorizer.transform(df['content'])

# Predict

In [27]:
from joblib import load

# import logistic reg. model
model = load("model.joblib")

In [69]:
predictions = model.predict(X) 

In [70]:
# Add predictions as new column
df['predictions'] = predictions 

In [71]:
# Get prediction probabilities too
probabilities = model.predict_proba(X)
df['probability'] = probabilities[:, 1]
df.head()

Unnamed: 0,date,content,predictions,probability
1,2023-02-21 03:29:07+00:00,new search amp rescue work progress hatay two ...,potential_disruption,0.137695
2,2023-02-21 03:29:04+00:00,cant imagine still havent recovered previous t...,no_disruption,0.502322
3,2023-02-21 03:28:06+00:00,highkey sign u ponder action return merciful t...,potential_disruption,0.421465
5,2023-02-21 03:27:27+00:00,see strong earthquake feb 20 2023 hatay turkey...,no_disruption,0.54846
6,2023-02-21 03:27:11+00:00,difficult news today top struggle already faci...,no_disruption,0.599397


In [83]:
# Convert the timestamp to datatime and sort the dframe
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date')
df.drop(columns='index', inplace=True)

### Get Reaction Time

In [97]:
import pytz

# Official happening time for Turkey-Syria earthquake 2023
official_time = pd.to_datetime('2023-02-06 01:17:00').replace(tzinfo=pytz.UTC)

# Filter df to include predictions after the official report
post_earthquake_df = df[df['date'] > official_time]
post_earthquake_df.reset_index(drop=True, inplace=True)

# Find the first instance where the model confidently predicted 2 disruption in a row
confidence_threshold = 0.95

In [98]:
post_earthquake_df.head()

Unnamed: 0,date,content,predictions,probability
0,2023-02-06 01:17:37+00:00,earthquake séisme m28 strike 24 km se monaco m...,potential_disruption,0.12728
1,2023-02-06 01:19:15+00:00,videos2watchnow best way carve turkey like▪️sh...,no_disruption,0.799932
2,2023-02-06 01:19:42+00:00,⚠preliminary info earthquake deprem 30 km n me...,potential_disruption,0.053286
3,2023-02-06 01:21:33+00:00,⚠preliminary info m72 earthquake deprem 30 km ...,potential_disruption,0.030897
4,2023-02-06 01:21:53+00:00,scary earthquake cyprus,potential_disruption,0.157591


In [139]:
df_disrupt = post_earthquake_df[(post_earthquake_df['predictions'] == 'disruption')]
df_disrupt.reset_index(drop=True, inplace=True)
df_disrupt

Unnamed: 0,date,content,predictions,probability
0,2023-02-06 01:24:33+00:00,6 minago earthquake 77 hit gaziantep turkey 62...,disruption,0.249040
1,2023-02-06 01:30:54+00:00,thing missing syria crisis earthquake,disruption,0.342555
2,2023-02-06 01:35:05+00:00,huge 78 earthquake gaziantep turkeymy sisterin...,disruption,0.110869
3,2023-02-06 01:35:09+00:00,whole building shaking along earthquake lebanon,disruption,0.252122
4,2023-02-06 01:38:39+00:00,earthquake felt beirut lebanon 318am aftershoc...,disruption,0.086312
...,...,...,...,...
23813,2023-02-21 03:10:27+00:00,63 earthquake hit turkeysyria border two week ...,disruption,0.007018
23814,2023-02-21 03:10:33+00:00,3 killed 213 injured two earthquake shatter li...,disruption,0.038980
23815,2023-02-21 03:20:52+00:00,kcautv new quake hit battered turkey syria 3 d...,disruption,0.133248
23816,2023-02-21 03:25:13+00:00,new 63 earthquake hit turkey syria border kill...,disruption,0.245734


In [140]:
df_disrupt.iloc[0]['date'] - official_time

Timedelta('0 days 00:07:33')

In [141]:
df_pot_disrupt = post_earthquake_df[(post_earthquake_df['predictions'] == 'potential_disruption')]
df_pot_disrupt.reset_index(inplace=True, drop=True)
df_pot_disrupt

Unnamed: 0,date,content,predictions,probability
0,2023-02-06 01:17:37+00:00,earthquake séisme m28 strike 24 km se monaco m...,potential_disruption,0.127280
1,2023-02-06 01:19:42+00:00,⚠preliminary info earthquake deprem 30 km n me...,potential_disruption,0.053286
2,2023-02-06 01:21:33+00:00,⚠preliminary info m72 earthquake deprem 30 km ...,potential_disruption,0.030897
3,2023-02-06 01:21:53+00:00,scary earthquake cyprus,potential_disruption,0.157591
4,2023-02-06 01:22:17+00:00,earthquake nicosia right,potential_disruption,0.183503
...,...,...,...,...
53003,2023-02-21 03:25:12+00:00,latest earthquake turkey impact disaster recov...,potential_disruption,0.154897
53004,2023-02-21 03:26:11+00:00,64 magnitude quake shake turkey syria border 2...,potential_disruption,0.120743
53005,2023-02-21 03:26:30+00:00,another earthquake southern turkey,potential_disruption,0.188715
53006,2023-02-21 03:28:06+00:00,highkey sign u ponder action return merciful t...,potential_disruption,0.421465


In [142]:
df_pot_disrupt.iloc[0]['date'] - official_time

Timedelta('0 days 00:00:37')

Insights:
1. Model Performance: The model performance in predicting disruptions seemed reasonable to me, although the prediction probabilities aren't that high ranges (below 0.50), which implies the model is still quite unsure about its predictions. 
2. Detection of Actual Disruptions: The first tweet that got predicted as 'disruption' was identified approximately ~7minutes hours after official earthquake time, which I think was a considerably fast.
3. Detection of Potential Disruption: Interestingly, the model was able to detect a 'potential disruption' just 37 seconds after the earthquake. This suggests the model is able to quickly detect signs of disruption, even if it's not immediately sure if a full disruption has occured. 

Some further improvements could be: 
1. Additional model traning / tuning
2. The custom labels were set by me, perhaps it might be better to be reviewed by experts and I suspect this is also the main reason why the prediction probabilities are pretty low
3. Perhaps it will be interesting to use pre-trained language models like BERT for text vectorization