In [852]:
import pandas as pd
import spacy
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import fasttext

In [853]:
nlp = spacy.load("en_core_web_sm")

## Load Sentiment140 dataset


In [854]:
senti_df = pd.read_csv('../data/raw/sentiment140.csv', names=['target', 'id', 'date', 'flag', 'user', 'text'])
print(senti_df.shape)
senti_df.head(5)

(1600000, 6)


Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [855]:
senti_df.target.value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

There are only 0 and 4 as values of target. `0: negative` , `4: positive`

In [856]:
senti_df.flag.value_counts()

flag
NO_QUERY    1600000
Name: count, dtype: int64

In [857]:
senti_df = senti_df[['target', 'text']]
print(senti_df.shape)
senti_df.head(5)

(1600000, 2)


Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Load eyewitness dataset


In [858]:
df1 = pd.read_csv('../data/raw/eyewitness_tweets_annotations_14k_public/labeled_by_authors/earthquakes_eyewitness_annotations_2000.tsv', sep='\t')
print(df1.shape)
df1.head(5)

(2000, 15)


Unnamed: 0,text,label,source,created_at,lang,listed_count,verified,location,statuses_count,followers_count,favourites_count,time_zone,user_lang,friends_count,screen_name
0,#earthquake Magnitude 2.1 occurred 159km NE of...,non-eyewitness,"<a href=""http://www.didyoufeel.it/"" rel=""nofol...",Tue Aug 01 08:12:42 +0000 2017,en,7.0,f,Italia,92091.0,1248.0,1.0,Central America,en,1756.0,did_you_feel_it
1,Retweeted Earthquakes Tsunamis (@NewEarthquake...,non-eyewitness,"<a href=""http://www.facebook.com/twitter"" rel=...",Tue Aug 01 08:13:08 +0000 2017,en,287.0,f,Malaysia,100720.0,1680.0,13260.0,Kuala Lumpur,en,2486.0,LimAiYim
2,I always know I need to go shopping when I've ...,don't know,"<a href=""https://mobile.twitter.com"" rel=""nofo...",Tue Aug 01 08:13:15 +0000 2017,en,0.0,f,"Hollywood, California, USA..... planet earth",2206.0,89.0,959.0,,en,85.0,3DennisCarlins
3,Update: M2.0 #earthquake (#sismo) strikes 1 km...,non-eyewitness,"<a href=""http://www.emsc-csem.org/"" rel=""nofol...",Tue Aug 01 08:13:48 +0000 2017,en,392.0,f,"BruyÃ¨res-Le-Chatel, France",98956.0,8970.0,1.0,Belgrade,fr,3.0,EMSC
4,ã€,non-eyewitness,"<a href=""http://twdesk.com/"" rel=""nofollow"">ã",Tue Aug 01 08:14:58 +0000 2017,en,54.0,f,With everyone,265090.0,4165.0,29.0,Irkutsk,ja,3800.0,from___japan


In [859]:
df1.isnull().sum()

text                  0
label                 0
source                8
created_at            8
lang                  8
listed_count          8
verified              8
location            571
statuses_count        8
followers_count       8
favourites_count      8
time_zone           671
user_lang             8
friends_count         8
screen_name           8
dtype: int64

In [860]:
df1 = df1[['label', 'text']]  
print(df1.shape)
df1.head(3)

(2000, 2)


Unnamed: 0,label,text
0,non-eyewitness,#earthquake Magnitude 2.1 occurred 159km NE of...
1,non-eyewitness,Retweeted Earthquakes Tsunamis (@NewEarthquake...
2,don't know,I always know I need to go shopping when I've ...


In [861]:
df2 = pd.read_csv('../data/raw/eyewitness_tweets_annotations_14k_public/labeled_by_authors/floods_eyewitness_annotations_2000.tsv', sep='\t')
print(df2.shape)
df2.head(5)

(2000, 15)


Unnamed: 0,text,lable,source,created_at,lang,listed_count,verified,location,statuses_count,followers_count,favourites_count,time_zone,user_lang,friends_count,screen_name
0,There are still like 8473 bugs. This is the wo...,direct-eyewitness,"<a href=""http://twitter.com/download/iphone"" r...",Tue Aug 01 00:07:43 +0000 2017,en,71,f,,190274,2190,15301,Eastern Time (US & Canada),en,1502,PupsAndPucks
1,I am now at the right bank to cross the flood....,direct-eyewitness,"<a href=""http://twitter.com/download/android"" ...",Tue Aug 01 00:36:25 +0000 2017,en,0,f,Ahmedabad,14,21,3,Mumbai,en,5,rahulmtomar
2,859 PM MDT: Very heavy rain near Red Bluff Lak...,direct-eyewitness,"<a href=""http://www.hootsuite.com"" rel=""nofoll...",Tue Aug 01 02:59:21 +0000 2017,en,290,t,"Midland, Texas",14756,6251,695,Central Time (US & Canada),en,135,NWSMidland
3,The thunder and heavy rainfall woke me up seve...,direct-eyewitness,"<a href=""http://twitter.com/download/iphone"" r...",Wed Aug 02 00:08:05 +0000 2017,en,5,f,"Pasig City, Manila Philippines",7268,375,1259,,en,340,iamkringmanalo
4,Got a little bit of rain today #onstorm #orang...,direct-eyewitness,"<a href=""http://twitter.com/download/iphone"" r...",Wed Aug 02 00:10:45 +0000 2017,en,1,f,"Orangeville, Canada",899,195,116,Central Time (US & Canada),en,762,_leafsgal_


In [862]:
df2.rename(columns={'lable': 'label'}, inplace=True)

In [863]:
df2 = df2[['label', 'text']]  
print(df2.shape)
df2.head(3)

(2000, 2)


Unnamed: 0,label,text
0,direct-eyewitness,There are still like 8473 bugs. This is the wo...
1,direct-eyewitness,I am now at the right bank to cross the flood....
2,direct-eyewitness,859 PM MDT: Very heavy rain near Red Bluff Lak...


In [864]:
df3 = pd.read_csv('../data/raw/eyewitness_tweets_annotations_14k_public/labeled_by_authors/hurricanes_eyewitness_annotations_2004.tsv', sep='\t')
print(df3.shape)
df3.head(5)

(2000, 15)


Unnamed: 0,text,Lable,source,created_at,lang,listed_count,verified,location,statuses_count,followers_count,favourites_count,time_zone,user_lang,friends_count,screen_name
0,I'm at the outer edge of the projected hurrica...,direct-eyewitness,"<a href=""http://twitter.com/download/iphone"" r...",Fri Aug 25 02:00:19 +0000 2017,en,1,f,TX,35118,41,804,Central Time (US & Canada),en,82,gothxingmi
1,school was cancelled today due to the heavy ra...,direct-eyewitness,"<a href=""http://twitter.com/download/iphone"" r...",Fri Aug 25 12:38:54 +0000 2017,en,0,f,"houston, texas",122,198,38,,en,254,hurtingjohnson
2,"So much shit is happening to us in Texas,, hur...",direct-eyewitness,"<a href=""http://twitter.com/download/iphone"" r...",Fri Aug 25 13:35:32 +0000 2017,en,2,f,"Houston, TX",41575,254,9155,,en,459,kimhandsoap
3,sooo last year it decides to flood right befor...,direct-eyewitness,"<a href=""http://twitter.com/download/iphone"" r...",Fri Aug 25 14:33:30 +0000 2017,en,1,f,,927,236,247,,en,266,_hguilbeau_
4,So we're having an earthquake AND a hurricane ...,direct-eyewitness,"<a href=""http://twitter.com/download/iphone"" r...",Fri Aug 25 14:58:46 +0000 2017,en,1,f,"Dallas, TX",6096,225,4682,,en,123,jimenez_410


In [865]:
df3.rename(columns={'Lable': 'label'}, inplace=True)

In [866]:
df3.isnull().sum()

text                  1
label                 0
source                0
created_at            0
lang                  0
listed_count          0
verified              0
location            497
statuses_count        0
followers_count       0
favourites_count      0
time_zone           777
user_lang             0
friends_count         0
screen_name           0
dtype: int64

In [867]:
df3 = df3[['label', 'text']]  
print(df3.shape)
df3.head(3)

(2000, 2)


Unnamed: 0,label,text
0,direct-eyewitness,I'm at the outer edge of the projected hurrica...
1,direct-eyewitness,school was cancelled today due to the heavy ra...
2,direct-eyewitness,"So much shit is happening to us in Texas,, hur..."


In [868]:
df4 = pd.read_csv('../data/raw/eyewitness_tweets_annotations_14k_public/labeled_by_crowdflower_paid_workers/earthquakes_eyewitness_crowdflower_2000.tsv', sep='\t')
print(df4.shape)
df4.head(3)

(2000, 13)


Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,please_choose_a_category:confidence,created_at,id_str,rownames,please_choose_a_category,text,Unnamed: 11,Unnamed: 12
0,1846692712,False,finalized,3,08/03/2018 20:23,0.6664,Mon May 14 22:49:14 +0000 2018,",996160536487358464,",79959,direct_eyewitness,"TheReformedCrow Nah, I'm gonna go with earthq...",,
1,1846692769,False,finalized,3,08/03/2018 20:46,1.0,Tue May 15 02:19:00 +0000 2018,",996213327398289408,",99307,direct_eyewitness,I think we just had an earthquake,,
2,1846692882,False,finalized,3,08/03/2018 21:36,1.0,Tue May 15 02:19:12 +0000 2018,",996213376370950144,",99629,direct_eyewitness,Uhh who else felt that earthquake tho,,


In [869]:
df4.please_choose_a_category.value_counts()

please_choose_a_category
direct_eyewitness    1600
dont_know             200
noneyewitness         200
Name: count, dtype: int64

In [870]:
df4 = df4[['please_choose_a_category','text']]
df4.rename(columns={'please_choose_a_category': 'label'}, inplace=True)
print(df4.shape)
df4.head(3)

(2000, 2)


Unnamed: 0,label,text
0,direct_eyewitness,"TheReformedCrow Nah, I'm gonna go with earthq..."
1,direct_eyewitness,I think we just had an earthquake
2,direct_eyewitness,Uhh who else felt that earthquake tho


In [871]:
df5 = pd.read_csv('../data/raw/eyewitness_tweets_annotations_14k_public/labeled_by_crowdflower_paid_workers/floods_eyewitness_crowdflower_2000.tsv', sep='\t')
print(df5.shape)
df5.head(3)

(2000, 11)


Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,please_choose_a_category:confidence,created_at,id_str,rownames,please_choose_a_category,text
0,1847304628,False,finalized,3,08/05/2018 22:21,1.0,Thu Jul 05 20:50:28 +0000 2018,",1014974815764037632,",94671,direct_eyewitness,"My state, everybody. Drought, fire, flood, hai..."
1,1847305169,False,finalized,3,08/05/2018 2:21,0.6466,Thu Jul 19 17:10:31 +0000 2018,",1019992896550985728,",27005,direct_eyewitness,Somebody please shoot me! I have been breaki...
2,1847306041,False,finalized,3,08/06/2018 9:08,0.651,Mon Jul 04 16:41:38 +0000 2016,",750006667492855808,",26899,direct_eyewitness,It's fine. It's all fucking fine. I'm just a d...


In [872]:
df5.please_choose_a_category.value_counts()

please_choose_a_category
dont_know            822
direct_eyewitness    627
noneyewitness        551
Name: count, dtype: int64

In [873]:
df5 = df5[['please_choose_a_category','text']]
df5.rename(columns={'please_choose_a_category': 'label'}, inplace=True)
print(df5.shape)
df5.head(3)

(2000, 2)


Unnamed: 0,label,text
0,direct_eyewitness,"My state, everybody. Drought, fire, flood, hai..."
1,direct_eyewitness,Somebody please shoot me! I have been breaki...
2,direct_eyewitness,It's fine. It's all fucking fine. I'm just a d...


In [874]:
df6 = pd.read_csv('../data/raw/eyewitness_tweets_annotations_14k_public/labeled_by_crowdflower_paid_workers/forestfires_eyewitness_crowdflower_2000.tsv', sep='\t')
print(df6.shape)
df6.head(3)

(2000, 13)


Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,please_choose_a_category_:confidence,created_at,id_str,rownames,please_choose_a_category_,text,Unnamed: 11,Unnamed: 12
0,1830763346,False,finalized,3,7/26/2018 12:30:49,1.0,Sat May 07 07:17:21 +0000 2016,",728846163798020096,",50034,direct_eyewitness,Damn this Canadian Wildfire got the whole city...,,
1,1830764158,False,finalized,3,7/26/2018 11:16:12,0.6493,Tue Oct 10 18:41:33 +0000 2017,",917822427903623168,",3296,direct_eyewitness,Devastating to see such widespread destruction...,,
2,1830764254,False,finalized,3,7/26/2018 11:24:49,0.6678,Tue Oct 10 19:55:29 +0000 2017,",917841031520849920,",12921,direct_eyewitness,Sonoma county is covered in heavy thick black ...,,


In [875]:
df6.please_choose_a_category_.value_counts()

please_choose_a_category_
noneyewitness        1379
dont_know             432
direct_eyewitness     189
Name: count, dtype: int64

In [876]:
df6 = df6[['please_choose_a_category_','text']]
df6.rename(columns={'please_choose_a_category_': 'label'}, inplace=True)
print(df6.shape)
df6.head(3)

(2000, 2)


Unnamed: 0,label,text
0,direct_eyewitness,Damn this Canadian Wildfire got the whole city...
1,direct_eyewitness,Devastating to see such widespread destruction...
2,direct_eyewitness,Sonoma county is covered in heavy thick black ...


In [877]:
df7 = pd.read_csv('../data/raw/eyewitness_tweets_annotations_14k_public/labeled_by_crowdflower_paid_workers/hurricanes_eyewitness_crowdflower_2000.tsv', sep='\t')
print(df7.shape)
df7.head(3)

(2000, 12)


Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_a_category:confidence,created_at,id_str,rownames,choose_a_category,text,Unnamed: 11
0,1834490755,False,finalized,5,7/27/2018 13:14:01,0.8022,Wed Mar 29 22:45:42 +0000 2017,",847218235002847232,",38791,direct_eyewitness,Ex cyclone Debbie is bringing some bad weather...,
1,1834490430,False,finalized,4,7/27/2018 20:04:56,0.4957,Thu Mar 30 11:12:34 +0000 2017,",847406190225006592,",53402,direct_eyewitness,We r receiving the tail end of cyclone debbie ...,
2,1834490462,False,finalized,3,7/27/2018 18:06:19,1.0,Fri Mar 31 01:34:08 +0000 2017,",847623008940204037,",62651,direct_eyewitness,"Little bit late I know, Cyclone Debbie done me...",


In [878]:
df7.choose_a_category.value_counts()

choose_a_category
noneyewitness        1199
direct_eyewitness     465
dont_know             336
Name: count, dtype: int64

In [879]:
df7 = df7[['choose_a_category','text']]
df7.rename(columns={'choose_a_category': 'label'}, inplace=True)
print(df7.shape)
df7.head(3)

(2000, 2)


Unnamed: 0,label,text
0,direct_eyewitness,Ex cyclone Debbie is bringing some bad weather...
1,direct_eyewitness,We r receiving the tail end of cyclone debbie ...
2,direct_eyewitness,"Little bit late I know, Cyclone Debbie done me..."


In [880]:
disaster_df = pd.concat([df1, df2, df3, df4, df5, df6, df7], axis=0, ignore_index=True)
print(disaster_df.shape)
disaster_df.head(5)

(14000, 2)


Unnamed: 0,label,text
0,non-eyewitness,#earthquake Magnitude 2.1 occurred 159km NE of...
1,non-eyewitness,Retweeted Earthquakes Tsunamis (@NewEarthquake...
2,don't know,I always know I need to go shopping when I've ...
3,non-eyewitness,Update: M2.0 #earthquake (#sismo) strikes 1 km...
4,non-eyewitness,ã€


In [881]:
disaster_df.isnull().sum()

label    0
text     1
dtype: int64

In [882]:
# Drop rows with missing 'text' or 'label'
disaster_df = disaster_df.dropna(subset=['text', 'label'])
print(disaster_df.shape)
disaster_df.head(3)

(13999, 2)


Unnamed: 0,label,text
0,non-eyewitness,#earthquake Magnitude 2.1 occurred 159km NE of...
1,non-eyewitness,Retweeted Earthquakes Tsunamis (@NewEarthquake...
2,don't know,I always know I need to go shopping when I've ...


In [883]:
disaster_df.label.value_counts()

label
don't know                      4652
noneyewitness                   3329
direct_eyewitness               2881
dont_know                       1790
non-eyewitness                   534
direct-eyewitness                513
vulnerable-direct witness        185
vulnerable direct-eyewitness      84
indirect-eyewitness               31
Name: count, dtype: int64

In [884]:
disaster_df.sample(15)

Unnamed: 0,label,text
5041,don't know,"Her heart burns with desire like a volcano, he..."
1732,don't know,If an earthquake occurred tomorrow &amp; left ...
4973,don't know,"Happy Earth day to The man, The volcano, The l..."
2024,direct-eyewitness,Since i'm house arrest today due to the flood ...
12800,noneyewitness,Cyclone Debbie in context: how severe was the ...
10885,noneyewitness,Canada Wildfire: Convoy Moves Stranded Evacuee...
11134,noneyewitness,"ABC News Latest details on path, destruction ..."
3118,don't know,They flocked to the execution ground like an u...
7031,direct_eyewitness,YOWZA #earthquake
5622,don't know,"1c. Also, you can bet the insurance companies ..."


In [885]:
for i in disaster_df.label.unique():
    print(i)
    print('------------------')
    for j in disaster_df[disaster_df.label == i]['text'].sample(25).to_list():
        print(j)
    print('\n\n')


non-eyewitness
------------------
Kathmandu University flooded, students trying to rescue books.  This and more photo by Sujaan Shrestha on her blogâ€¦ https://t.co/cdukQiRyZ2
#earthquake (EMSC) https://t.co/ifDmBNb7Vf: mb 3.7 CAUCASUS REGION, RUSSIA https://t.co/jpaHEjD3zM
One killed as earthquake hits Italian island - Window To News https://t.co/TdyouMnf9o
#Harvey threatens heavy rainfall/hurricane by Friday. @GovAbbott declared a state of disaster for 30 TX counties: https://t.co/PK7jkbT06S
2.2 earthquake close to Erzincan, Turkey at 04:19 UTC! #earthquake #Erzincan https://t.co/raMccETU7a
0.6 earthquake occurred 53km ENE of Mammoth Lakes, California at 10:42 UTC! #earthquake #MammothLakes https://t.co/8v1mLtMU93
The latest Calgary Flood News! https://t.co/wCXLQIpVjk #ableg #abflood
Magnitude 1.6 earthquake 41km WNW of Goldfield, Nevada https://t.co/9gnS19hFzM
Prelim M5.7 earthquake Solomon Islands Aug-6 00:15 UTC, updates https://t.co/h6w8vdhWTG, 0 #quake tweets/min
2 women save a 

In [886]:
non_real_time_crisis = ["non-eyewitness", "don't know", "dont_know","noneyewitness"]
real_time_crisis = ["direct-eyewitness","vulnerable direct-eyewitness", "vulnerable-direct witness", "indirect-eyewitness", "direct_eyewitness"]

for i in non_real_time_crisis:
    disaster_df['label'] = disaster_df['label'].replace(i, 'non_real_time_crisis')
for i in real_time_crisis:
    disaster_df['label'] = disaster_df['label'].replace(i, 'real_time_crisis')
disaster_df.label.value_counts()

label
non_real_time_crisis    10305
real_time_crisis         3694
Name: count, dtype: int64

In [887]:
for i in disaster_df.label.unique():
    print(i)
    print('------------------')
    for j in disaster_df[disaster_df.label == i]['text'].sample(5).to_list():
        print(j)
    print('\n\n')

non_real_time_crisis
------------------
All of my Boston friends, flood city hall with emails and phone calls next week https://t.co/2vZoBXIvX1
hoseok : can you feel my heart beat ?   * causes earthquake *
Why does it decide to flood in 29 when I'm gone, I'm not ok with that
Can you answer this? The major earthquake that struck Japan was not far offshore https://t.co/sfA9VQhE02
Los Angeles News Search LA (Health risks of flood waters after hurricanes: What to expect after Harvey) 1 Los ... - https://t.co/wfWhMGSkXw



real_time_crisis
------------------
Dead bull shark washes up in severe floods https://t.co/FturCOwjIW
earthquake twitter: ACTIVATE!
Such a horrible little earthquake at 2:00am. I forget how scary they are
no uni today thanks cyclone debbie
I knew those little earthshakes an hour ago would turn into an earthquake. #Oakland





In [888]:
disaster_df.label.value_counts()

label
non_real_time_crisis    10305
real_time_crisis         3694
Name: count, dtype: int64

In [889]:
disaster_df.text.iloc[4]
disaster_df[disaster_df.text == disaster_df.text.iloc[3128]]

Unnamed: 0,label,text
3128,non_real_time_crisis,âš ï¸


In [890]:
disaster_df.head(5)

Unnamed: 0,label,text
0,non_real_time_crisis,#earthquake Magnitude 2.1 occurred 159km NE of...
1,non_real_time_crisis,Retweeted Earthquakes Tsunamis (@NewEarthquake...
2,non_real_time_crisis,I always know I need to go shopping when I've ...
3,non_real_time_crisis,Update: M2.0 #earthquake (#sismo) strikes 1 km...
4,non_real_time_crisis,ã€


In [891]:
disaster_df.label.value_counts()

label
non_real_time_crisis    10305
real_time_crisis         3694
Name: count, dtype: int64

In [851]:
disaster_df

Unnamed: 0,label,text
0,real_time_crisis,I was wondering why the room was shaking when ...
1,real_time_crisis,Another earthquake woke me up at the night. No...
2,real_time_crisis,F*cking hell...my wife and kids are in Tokyo a...
3,real_time_crisis,Was Facetiming my brother in Tokyo when an ear...
4,real_time_crisis,I just called my grandma asking if she was ok ...
...,...,...
7383,non_real_time_crisis,"Another strong bid to change the spelling of ""..."
7384,non_real_time_crisis,man I m so lucky there s no flood here in my n...
7385,non_real_time_crisis,Very powerful #CycloneDebbie has made landfall...
7386,non_real_time_crisis,"Also, California should expect an calamitous e..."


In [892]:
disaster_df.to_csv('../data/processed/disaster_df.csv', index=False)

## Balance the data

In [836]:
disaster_df_non_real_time_crisis = disaster_df[disaster_df.label == 'non_real_time_crisis']
disaster_df_real_time_crisis = disaster_df[disaster_df.label == 'real_time_crisis']
print(disaster_df_non_real_time_crisis.shape)
print(disaster_df_real_time_crisis.shape)
disaster_df_non_real_time_crisis = disaster_df_non_real_time_crisis.sample(n=disaster_df_real_time_crisis.shape[0])
disaster_df = pd.concat([disaster_df_real_time_crisis, disaster_df_non_real_time_crisis], axis=0, ignore_index=True)
print(disaster_df.shape)
disaster_df.label.value_counts()

(10305, 2)
(3694, 2)
(7388, 2)


label
real_time_crisis        3694
non_real_time_crisis    3694
Name: count, dtype: int64

## Split the data

In [837]:
train, test = train_test_split(disaster_df, test_size=0.1, random_state=42, stratify=disaster_df['label'])
train.shape, test.shape

((6649, 2), (739, 2))

In [838]:
train.head(5)

Unnamed: 0,label,text
2675,real_time_crisis,"RodyDu30: RT teddyboylocsin: For two days now,..."
834,real_time_crisis,PolhomeEditor duncanm Buildings first earthq...
2023,real_time_crisis,The rare May earthquake.
4433,non_real_time_crisis,"@CMYogiAditya It's humble request, building ma..."
3897,non_real_time_crisis,And you probably thought Sharknado was a stupi...


In [839]:
test.head(5)

Unnamed: 0,label,text
4334,non_real_time_crisis,can my house not flood? :^)
5342,non_real_time_crisis,"Dan your ""quick break"" didn't come quick enoug..."
3336,real_time_crisis,Am I gonna die? https://t.co/isr23xgeqP
4302,non_real_time_crisis,earthquake rip
4392,non_real_time_crisis,Tasmanian emergency services assist in recover...


## Preprocess text

In [840]:
## Preprocess the text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    
    # text = re.sub(r'@\w+', '', text)  # Remove mentions
    # text = re.sub(r'#\w+', '', text)  # Remove hashtags
    # text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = text.strip()  # Remove leading/trailing whitespace


    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]  # Lemmatization and remove stop words
    return ' '.join(tokens)
    

In [841]:
train['text'] = train.text.apply(preprocess_text)
test['text'] = test.text.apply(preprocess_text)

In [842]:
train.head(5)

Unnamed: 0,label,text
2675,real_time_crisis,rodydu rt teddyboylocsin day traffic sure mass...
834,real_time_crisis,polhomeeditor duncanm building earthquake
2023,real_time_crisis,rare earthquake
4433,non_real_time_crisis,cmyogiaditya humble request building collapse ...
3897,non_real_time_crisis,probably think sharknado stupid unrealistic mo...


In [843]:
test.head(5)

Unnamed: 0,label,text
4334,non_real_time_crisis,house flood
5342,non_real_time_crisis,dan quick break not come quick not unsee butt ...
3336,real_time_crisis,go to die httpstcoisrxgeqp
4302,non_real_time_crisis,earthquake rip
4392,non_real_time_crisis,tasmanian emergency service assist recovery ef...


In [844]:
train_cleaned = train[train.text != ''].copy()
test_cleaned = test[test.text != ''].copy()

In [845]:
train_cleaned.head(5)

Unnamed: 0,label,text
2675,real_time_crisis,rodydu rt teddyboylocsin day traffic sure mass...
834,real_time_crisis,polhomeeditor duncanm building earthquake
2023,real_time_crisis,rare earthquake
4433,non_real_time_crisis,cmyogiaditya humble request building collapse ...
3897,non_real_time_crisis,probably think sharknado stupid unrealistic mo...


In [846]:
test.head(5)

Unnamed: 0,label,text
4334,non_real_time_crisis,house flood
5342,non_real_time_crisis,dan quick break not come quick not unsee butt ...
3336,real_time_crisis,go to die httpstcoisrxgeqp
4302,non_real_time_crisis,earthquake rip
4392,non_real_time_crisis,tasmanian emergency service assist recovery ef...


In [847]:
def format_for_fasttext(df, text_col, label_col, output_path):
    with open(output_path, 'w', encoding='utf-8') as f:
        for text, label in zip(df[text_col], df[label_col]):
            f.write(f"__label__{label} {text.strip()}\n")

format_for_fasttext(train_cleaned, 'text', 'label', '../data/processed/train.txt')
format_for_fasttext(test_cleaned, 'text', 'label', '../data/processed/test.txt')

In [848]:
# with open('../data/processed/train.txt', 'r', encoding='utf-8') as f:
#     for i, line in enumerate(f):
#         if not line.strip():
#             print(f"Empty line at {i}")
#         if '__label__' not in line:
#             print(f"Missing label at line {i}")

## Train Fasttext

In [849]:
model = fasttext.train_supervised(input='../data/processed/train.txt', loss='hs')
# model = fasttext.train_supervised(input='../data/processed/debug_train.txt', loss='hs')



Read 0M words
Number of words:  11872
Number of labels: 2
Progress: 100.0% words/sec/thread:  454569 lr:  0.000000 avg.loss:  0.481863 ETA:   0h 0m 0s


In [850]:
model.test('../data/processed/test.txt')

(739, 0.803788903924222, 0.803788903924222)

In [795]:
text = "just felt an hurricane"
labels, probabilities = model.predict(text, k=1)
print(labels[0], probabilities[0])

__label__non_real_time_crisis 0.9838029146194458
