In [1]:
import re
import pandas as pd
# Read train_twitter.csv (path relative to the working directory) into a DataFrame.
data = pd.read_csv('train_twitter.csv')
# NOTE(review): read_csv already returns a DataFrame, so this wrap looks
# redundant; whether `df` shares storage with `data` afterwards may depend on
# the pandas version — confirm before treating `data` as an untouched copy.
df = pd.DataFrame(data)
# Show the first rows as the cell output (head() defaults to five rows).
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [2]:
def extract_hashtags(text):
    """Collect every hashtag that appears in a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example a NaN coming from
        a missing cell) is treated as a tweet without hashtags.

    Returns
    -------
    str
        The hashtags, each still prefixed with '#', joined by single spaces
        in order of appearance, or the sentinel "No" when there are none.
    """
    if not isinstance(text, str):
        return "No"
    # Underscores belong to a hashtag (the @mention pattern in this notebook
    # uses the same character set); without '_' in the class a tag such as
    # "#stay_safe" would be cut down to "#stay".
    hashtags = re.findall(r'#[a-zA-Z0-9_]+', text)
    return " ".join(hashtags) if hashtags else "No"

# Derive the hashtags column by running extract_hashtags over every tweet.
df['hashtags'] = df['text'].map(extract_hashtags)

In [3]:
# Preview the first 50 rows to check the freshly added hashtags column.
df.head(50)

Unnamed: 0,id,keyword,location,text,target,hashtags
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,#earthquake
1,4,,,Forest fire near La Ronge Sask. Canada,1,No
2,5,,,All residents asked to 'shelter in place' are ...,1,No
3,6,,,"13,000 people receive #wildfires evacuation or...",1,#wildfires
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,#Alaska #wildfires
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,#RockyFire #CAfire #wildfires
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,#flood #disaster
7,13,,,I'm on top of the hill and I can see a fire in...,1,No
8,14,,,There's an emergency evacuation happening now ...,1,No
9,15,,,I'm afraid that the tornado is coming to our a...,1,No


In [4]:
# Replace each literal '#' with a space so only the bare tag words remain.
df['hashtags'] = df['hashtags'].str.replace(pat='#', repl=' ', regex=False)



In [5]:
# Preview the first 50 rows again, now that '#' has been replaced in hashtags.
df.head(50)

Unnamed: 0,id,keyword,location,text,target,hashtags
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,earthquake
1,4,,,Forest fire near La Ronge Sask. Canada,1,No
2,5,,,All residents asked to 'shelter in place' are ...,1,No
3,6,,,"13,000 people receive #wildfires evacuation or...",1,wildfires
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Alaska wildfires
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,RockyFire CAfire wildfires
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,flood disaster
7,13,,,I'm on top of the hill and I can see a fire in...,1,No
8,14,,,There's an emergency evacuation happening now ...,1,No
9,15,,,I'm afraid that the tornado is coming to our a...,1,No


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Token counts over the hashtag strings: learn the vocabulary first, then
# encode every row and turn the sparse result into a dense array.
vectorizer = CountVectorizer()
vectorizer.fit(df['hashtags'])
hashtags_vectorized = vectorizer.transform(df['hashtags']).toarray()
hashtags_vectorized

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [7]:
# Look at the raw text of the tweet whose index label is 36.
df.loc[36, 'text']

"@PhDSquares #mufc they've built so much hype around new acquisitions but I doubt they will set the EPL ablaze this season."

In [8]:
# Preview the first 90 rows of the frame.
df.head(90)

Unnamed: 0,id,keyword,location,text,target,hashtags
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,earthquake
1,4,,,Forest fire near La Ronge Sask. Canada,1,No
2,5,,,All residents asked to 'shelter in place' are ...,1,No
3,6,,,"13,000 people receive #wildfires evacuation or...",1,wildfires
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Alaska wildfires
...,...,...,...,...,...,...
85,126,accident,,Carolina accident: Motorcyclist Dies in I-540 ...,1,No
86,128,accident,"New Hanover County, NC",FYI CAD:FYI: ;ACCIDENT PROPERTY DAMAGE;NHS;999...,1,No
87,129,accident,Maldives,RT nAAYf: First accident in years. Turning ont...,1,No
88,130,accident,"Manchester, NH",Accident left lane blocked in #Manchester on R...,1,Manchester traffic


In [9]:
# Blank out @mentions (an '@' followed by letters, digits or underscores),
# substituting a single space for each one. Overwrites the text column.
df['text'] = df['text'].str.replace(pat=r'@[A-Za-z0-9_]+', repl=' ', regex=True)

In [10]:
# Display the frame after @mentions were blanked out of the text column.
df

Unnamed: 0,id,keyword,location,text,target,hashtags
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,earthquake
1,4,,,Forest fire near La Ronge Sask. Canada,1,No
2,5,,,All residents asked to 'shelter in place' are ...,1,No
3,6,,,"13,000 people receive #wildfires evacuation or...",1,wildfires
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Alaska wildfires
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,No
7609,10870,,,The out of control wild fires in Californi...,1,No
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,No
7611,10872,,,Police investigating after an e-bike collided ...,1,No


In [11]:
# Earlier version of the pattern, kept for reference (no backslash in the character class):
#pattern=r'http:\/\/\S+|https:\/\/\S+|[^a-zA-Z\s]+'
# Three alternatives: an http:// URL, an https:// URL, or a run of characters
# that are not ASCII letters, whitespace or backslashes (so digits and
# punctuation match as well).
# NOTE(review): the trailing \\ inside the class means backslashes are NOT
# matched and therefore stay in the text — confirm that this is intended.
pattern=r'http:\/\/\S+|https:\/\/\S+|[^a-zA-Z\s\\]+'


In [12]:
# text2 = text with every match of `pattern` replaced by a single space.
df['text2'] = df['text'].str.replace(pat=pattern, repl=' ', regex=True)



In [13]:
# Display the frame with the new text2 column next to the original text.
df

Unnamed: 0,id,keyword,location,text,target,hashtags,text2
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,earthquake,Our Deeds are the Reason of this earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada,1,No,Forest fire near La Ronge Sask Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,No,All residents asked to shelter in place are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,wildfires,people receive wildfires evacuation orders ...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Alaska wildfires,Just got sent this photo from Ruby Alaska as ...
...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,No,Two giant cranes holding a bridge collapse int...
7609,10870,,,The out of control wild fires in Californi...,1,No,The out of control wild fires in Californi...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,No,M UTC km S of Volcano Hawaii
7611,10872,,,Police investigating after an e-bike collided ...,1,No,Police investigating after an e bike collided ...


In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
# Shared Porter stemmer instance used by transform_text().
ps = PorterStemmer()
# transform_text() relies on two NLTK resources: the stop-word corpus (read by
# stopwords.words('english')) and the Punkt tokenizer models (used by
# nltk.word_tokenize). Fetch both so the notebook also runs on a machine where
# the stop-word corpus has never been downloaded.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Abhishek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def transform_text(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, tokenize it with
    ``nltk.word_tokenize``, keep only alphanumeric tokens, drop English stop
    words, and stem what is left with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Tweet text. Any non-string value (for example a NaN from a missing
        cell) yields an empty string instead of raising.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; "" when nothing survives.
    """
    if not isinstance(text, str):
        return ""

    # Look the stop words up once per call in a cached set instead of
    # re-reading the corpus list for every single token.
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(text.lower())
    # A token that passes isalnum() contains no punctuation, so the former
    # separate string.punctuation check could never reject anything.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]
    return " ".join(ps.stem(tok) for tok in kept)

In [16]:
# Build transformed_text by running transform_text over every cleaned tweet in text2.
df['transformed_text'] = df['text2'].map(transform_text)

In [17]:
# Display the frame including the new transformed_text column.
df

Unnamed: 0,id,keyword,location,text,target,hashtags,text2,transformed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,earthquake,Our Deeds are the Reason of this earthquake M...,deed reason earthquak may allah forgiv us
1,4,,,Forest fire near La Ronge Sask. Canada,1,No,Forest fire near La Ronge Sask Canada,forest fire near la rong sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,No,All residents asked to shelter in place are ...,resid ask shelter place notifi offic evacu she...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,wildfires,people receive wildfires evacuation orders ...,peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Alaska wildfires,Just got sent this photo from Ruby Alaska as ...,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,No,Two giant cranes holding a bridge collapse int...,two giant crane hold bridg collaps nearbi home
7609,10870,,,The out of control wild fires in Californi...,1,No,The out of control wild fires in Californi...,control wild fire california even northern par...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,No,M UTC km S of Volcano Hawaii,utc km volcano hawaii
7611,10872,,,Police investigating after an e-bike collided ...,1,No,Police investigating after an e bike collided ...,polic investig e bike collid car littl portug ...


In [18]:
# keyword and location are treated as irrelevant, so remove them from df.
# DataFrame.drop returns a new frame rather than modifying df, hence the
# reassignment; errors='ignore' keeps the cell safe to re-run once the
# columns are already gone.
df = df.drop(columns=['keyword', 'location'], errors='ignore')
df

Unnamed: 0,id,text,target,hashtags,text2,transformed_text
0,1,Our Deeds are the Reason of this #earthquake M...,1,earthquake,Our Deeds are the Reason of this earthquake M...,deed reason earthquak may allah forgiv us
1,4,Forest fire near La Ronge Sask. Canada,1,No,Forest fire near La Ronge Sask Canada,forest fire near la rong sask canada
2,5,All residents asked to 'shelter in place' are ...,1,No,All residents asked to shelter in place are ...,resid ask shelter place notifi offic evacu she...
3,6,"13,000 people receive #wildfires evacuation or...",1,wildfires,people receive wildfires evacuation orders ...,peopl receiv wildfir evacu order california
4,7,Just got sent this photo from Ruby #Alaska as ...,1,Alaska wildfires,Just got sent this photo from Ruby Alaska as ...,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1,No,Two giant cranes holding a bridge collapse int...,two giant crane hold bridg collaps nearbi home
7609,10870,The out of control wild fires in Californi...,1,No,The out of control wild fires in Californi...,control wild fire california even northern par...
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,No,M UTC km S of Volcano Hawaii,utc km volcano hawaii
7611,10872,Police investigating after an e-bike collided ...,1,No,Police investigating after an e bike collided ...,polic investig e bike collid car littl portug ...


### Word2Vec

In [25]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Stemmed tweet text that serves as the training corpus.
document = df['transformed_text']

# Tokenise every tweet into a list of words with gensim's simple_preprocess.
processed_docs = list(map(simple_preprocess, document))

# Fit a Word2Vec model on the tokenised tweets.
model = Word2Vec(
    sentences=processed_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Round-trip the model through disk: write it out, then read it back in.
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")

# Three most similar vocabulary entries to the token 'fire'.
similar_words = model.wv.most_similar('fire', topn=3)
print(similar_words)


[('amp', 0.999611496925354), ('evacu', 0.9995889663696289), ('us', 0.9995428919792175)]


In [24]:
# Print the model's word -> integer index mapping, i.e. every word in the
# vocabulary. The whole dict is written to the output, so this can be long.
print(model.wv.key_to_index)  # Shows all words in the vocabulary


{'like': 0, 'fire': 1, 'amp': 2, 'get': 3, 'bomb': 4, 'new': 5, 'via': 6, 'one': 7, 'news': 8, 'go': 9, 'peopl': 10, 'kill': 11, 'time': 12, 'burn': 13, 'year': 14, 'video': 15, 'flood': 16, 'crash': 17, 'emerg': 18, 'disast': 19, 'bodi': 20, 'attack': 21, 'build': 22, 'day': 23, 'look': 24, 'home': 25, 'say': 26, 'fatal': 27, 'love': 28, 'polic': 29, 'would': 30, 'make': 31, 'famili': 32, 'evacu': 33, 'still': 34, 'storm': 35, 'train': 36, 'see': 37, 'us': 38, 'got': 39, 'come': 40, 'back': 41, 'know': 42, 'california': 43, 'suicid': 44, 'live': 45, 'bag': 46, 'watch': 47, 'want': 48, 'man': 49, 'world': 50, 'death': 51, 'car': 52, 'collaps': 53, 'scream': 54, 'derail': 55, 'rt': 56, 'first': 57, 'take': 58, 'caus': 59, 'let': 60, 'think': 61, 'nuclear': 62, 'two': 63, 'pm': 64, 'drown': 65, 'today': 66, 'need': 67, 'war': 68, 'work': 69, 'wreck': 70, 'accid': 71, 'dead': 72, 'deton': 73, 'destroy': 74, 'hiroshima': 75, 'hijack': 76, 'gt': 77, 'full': 78, 'plan': 79, 'feel': 80, 'old'