In [39]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # regexp
import spacy # text lemmatizing
import unicodedata # text deaccentation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


### Data examination

Let's load the data tables and examine some portions of the training DataFrame. We can observe that some location and keyword fields are missing values, and the text fields often contain links, punctuation, stop words, and other elements that require preprocessing.

In [40]:
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

pd.concat([train_df.head(), train_df[65:75], train_df.tail()])

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
65,93,ablaze,Birmingham,@nxwestmidlands huge fire at Wholesale markets...,1
66,95,ablaze,San Francisco,@ablaze what time does your talk go until? I d...,0
67,96,accident,CLVLND,'I can't have kids cuz I got in a bicycle acci...,0
68,97,accident,"Nashville, TN",Accident on I-24 W #NashvilleTraffic. Traffic ...,1
69,98,accident,"Santa Clara, CA",Accident center lane blocked in #SantaClara on...,1


### Preprocessing

Apply various text preprocessing techniques in this step. For instance, remove all URLs containing http, https, or www, eliminate accents, delete @-mentions, remove digits and special symbols, and lemmatize the text (transform words into their grammatical base forms).

In [41]:
def preprocess_text(text):
    
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = re.sub(r'@\S*', '', text)
    text = re.sub(r'\W|\d', ' ', text)
    
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_ for token in doc]
    text = ' '.join(lemmatized_tokens)
    
    return text

In [42]:
nlp = spacy.load('en_core_web_sm')
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

pd.concat([train_df.head(), train_df[65:75], train_df.tail()])

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deed be the Reason of this earthquake ma...,1
1,4,,,forest fire near La Ronge Sask Canada,1
2,5,,,all resident ask to shelter in place be be...,1
3,6,,,people receive wildfire evacuation o...,1
4,7,,,just got send this photo from Ruby Alaska as...,1
65,93,ablaze,Birmingham,huge fire at wholesale market ablaze,1
66,95,ablaze,San Francisco,what time do your talk go until I don t kn...,0
67,96,accident,CLVLND,I can t have kid cuz I get in a bicycle acci...,0
68,97,accident,"Nashville, TN",accident on I W NashvilleTraffic Traff...,1
69,98,accident,"Santa Clara, CA",accident center lane block in SantaClara on ...,1


### Vectorization

In this example, Term Frequency-Inverse Document Frequency (TF-IDF) vectorization is employed for the tweet texts. Two supplementary features, keyword and location, are incorporated into the model using the One-Hot Encoding technique. The ColumnTransformer is utilized to apply both transformers to multiple columns simultaneously.

In [43]:
tfidf = TfidfVectorizer()
keyword_encoder = OneHotEncoder(handle_unknown='ignore')
location_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('text_tfidf', tfidf, 'text'),
        ('location_ohe', location_encoder, ['location'])
    ])


### Training and testing

The C-Support Vector Classification model was selected to predict the 'target' values. Along with the preprocessor ColumnTransformer, the classifier object was integrated into a pipeline. The model's F1 score was evaluated using the cross-validation technique, yielding an estimated score of 0.68, which indicates a reasonably good performance.

In [44]:
clf = SVC(kernel='linear', probability=True)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf)
])


scores = cross_val_score(pipeline, train_df[['text', 'keyword', 'location']], train_df['target'], cv=5, scoring='f1', error_score='raise')

scores

array([0.6344464 , 0.58656126, 0.63650075, 0.59863946, 0.73181484])

Submit the task.

In [49]:
pipeline.fit(train_df[['text', 'keyword', 'location']], train_df['target'])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('text_tfidf',
                                                  TfidfVectorizer(), 'text'),
                                                 ('location_ohe',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['location'])])),
                ('classifier', SVC(kernel='linear', probability=True))])

In [50]:
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission["target"] = pipeline.predict(test_df[['text', 'keyword', 'location']])
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [51]:
sample_submission.to_csv("submission.csv", index=False)