In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("nlp-getting-started/train.csv")

## Exploratory Analysis

In [3]:
data.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
data.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [6]:
data.drop(columns=["id"],inplace=True)

In [7]:
data["target"].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [8]:
data["target"].value_counts(normalize=True)

target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

In [9]:
df = data["location"].value_counts()
print(df.to_string())

location
USA                                                  104
New York                                              71
United States                                         50
London                                                45
Canada                                                29
Nigeria                                               28
UK                                                    27
Los Angeles, CA                                       26
India                                                 24
Mumbai                                                22
Washington, DC                                        21
Kenya                                                 20
Worldwide                                             19
Chicago, IL                                           18
Australia                                             18
California                                            17
New York, NY                                          15
California, USA       

In [10]:
# vc = train["location"].value_counts()

# # Compute differences between consecutive counts
# diffs = vc.diff().fillna(0).abs()

# print(diffs.head(10))

# # Example threshold: drop is more than 50% from previous
# sudden_drops = (vc.shift(1) - vc) / vc.shift(1) > 0.5
# print(vc[sudden_drops])

In [11]:
data.columns

Index(['keyword', 'location', 'text', 'target'], dtype='object')

## Data Splitting

In [12]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(
    data, test_size=0.3, random_state=42, stratify=data["target"]
)
print("Train size:", len(train))
print("Validation size:", len(val))

Train size: 5329
Validation size: 2284


## Preprocessing

In [13]:
train["location"].fillna("unknown",inplace=True)
location_counts = train["location"].value_counts()
train["location"] = train["location"].apply(lambda x:x if location_counts[x]>=10 else "Others")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["location"].fillna("unknown",inplace=True)


In [14]:
train.columns

Index(['keyword', 'location', 'text', 'target'], dtype='object')

In [21]:
train["keyword"].fillna("unknown",inplace=True)
keyword_counts = train["keyword"].value_counts()
keyword_counts

keyword
unknown                  44
famine                   33
harm                     32
windstorm                31
drowned                  31
                         ..
war%20zone               12
radiation%20emergency     8
epicentre                 7
threat                    6
inundation                4
Name: count, Length: 222, dtype: int64

In [15]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Encode location
location_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_location = location_encoder.fit_transform(train[['location']])

# Encode keyword
keyword_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_keyword = keyword_encoder.fit_transform(train[['keyword']])

# Convert back to DataFrames with correct names
encoded_df_location = pd.DataFrame(encoded_location, 
                                   columns=location_encoder.get_feature_names_out(['location']))

encoded_df_keyword = pd.DataFrame(encoded_keyword, 
                                  columns=keyword_encoder.get_feature_names_out(['keyword']))

# Concatenate with original data
encoded_train = pd.concat(
    [train.reset_index(drop=True), encoded_df_location, encoded_df_keyword], 
    axis=1
)

encoded_train.head()

Unnamed: 0,keyword,location,text,target,location_Australia,location_California,"location_California, USA",location_Canada,"location_Chicago, IL",location_Everywhere,...,keyword_whirlwind,keyword_wild%20fires,keyword_wildfire,keyword_windstorm,keyword_wounded,keyword_wounds,keyword_wreck,keyword_wreckage,keyword_wrecked,keyword_nan
0,fatalities,USA,Las Vegas in top 5 cities for red-light runnin...,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,sinking,Others,Do you feel like you are sinking in unhappines...,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,volcano,Others,The Architect Behind Kanye WestÛªs Volcano ht...,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,inundation,unknown,@ZachLowe_NBA there are a few reasons for that...,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,obliterated,Others,I can't wait to be beyond obliterated this wee...,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Example: suppose your dataframe has a column 'text'
# train['text']

# Initialize vectorizer
tfidf_text = TfidfVectorizer(
    stop_words='english',   # remove common stopwords
    ngram_range=(1,2)       # unigrams + bigrams (optional)
)

# Fit and transform
tfidf_text_train = tfidf_text.fit_transform(train['text'].astype(str))

# Convert to DataFrame
tfidf_text_train = pd.DataFrame(tfidf_text_train.toarray(), columns=tfidf_text.get_feature_names_out())

In [24]:
# Concatenate with original data if needed
train = pd.concat([train.reset_index(drop=True), tfidf_df_text], axis=1)

train.head()

Unnamed: 0,target,00,00 11,00 18,00 25,00 52,00 ep,00 epicenter,00 hiroshima,00 http,...,ûó rt,ûó stories,ûóher,ûóher upper,ûónegligence,ûónegligence fireworks,ûótech,ûótech business,ûówe,ûówe work
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
train.drop(columns=["text","location","keyword"], inplace = True)