In [32]:
# Use GPU
# The override is written to the process environment before the ML libraries
# below are imported.
# NOTE(review): presumably a ROCm setting that makes an AMD GPU report itself as
# gfx 10.3.0 — confirm it matches the target machine. None of the estimators
# configured in this part of the notebook explicitly requests a GPU device.
import os
os.environ['HSA_OVERRIDE_GFX_VERSION'] = '10.3.0'

import pandas as pd
import numpy as np
from pandasql import sqldf

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

from nltk.tokenize import TweetTokenizer

In [14]:
# Read the training data; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [15]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# bare: wrapping it in display() only adds a stray "None" to the cell output.
df.info()

# Peek at the first few rows to see what the raw columns look like.
display(df.head(3))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


None

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [16]:
# Recode the 0/1 `target` flag as booleans (more readable than raw integers)
# and discard the `id` column, in one chained expression.
df = (
    df
    .assign(target=df['target'].map({1: True, 0: False}))
    .drop(columns=['id'])
)

df.head(3)

Unnamed: 0,keyword,location,text,target
0,,,Our Deeds are the Reason of this #earthquake M...,True
1,,,Forest fire near La Ronge Sask. Canada,True
2,,,All residents asked to 'shelter in place' are ...,True


In [18]:
# Per-column count of missing values, shown under a short heading.
display("Missing values", df.isna().sum())

# Rows that have no location recorded (rendered as this cell's output).
df.loc[df['location'].isna()]

'Missing values'

keyword       61
location    2533
text           0
target         0
dtype: int64

Unnamed: 0,keyword,location,text,target
0,,,Our Deeds are the Reason of this #earthquake M...,True
1,,,Forest fire near La Ronge Sask. Canada,True
2,,,All residents asked to 'shelter in place' are ...,True
3,,,"13,000 people receive #wildfires evacuation or...",True
4,,,Just got sent this photo from Ruby #Alaska as ...,True
...,...,...,...,...
7608,,,Two giant cranes holding a bridge collapse int...,True
7609,,,@aria_ahrary @TheTawniest The out of control w...,True
7610,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,True
7611,,,Police investigating after an e-bike collided ...,True


**Observation:**
- We can see that `location` and `keyword` are missing for many tweets where they are obvious from the text itself (e.g. "Forest fire near La Ronge Sask. Canada"). Lots of cleaning could fix this.

### Baseline-run
- Before we try in-depth cleaning and processing, let's see what kind of performance we can get using only the existing text column.

In [25]:
# Built once and reused: the tokenizer is always created with default settings,
# so constructing a fresh instance for every tweet is wasted work.
_TWEET_TOKENIZER = TweetTokenizer()


def tokenizer(text):
    """Split a tweet into tokens with NLTK's ``TweetTokenizer``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        The tokens produced by ``TweetTokenizer.tokenize`` for ``text``.
    """
    return _TWEET_TOKENIZER.tokenize(text)

In [26]:
# Baseline features: unigram TF-IDF over the raw tweet text.
# token_pattern=None because a custom `tokenizer` is supplied: the default regex
# would be ignored anyway, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" warning when the vectorizer is fitted.
# NOTE(review): the vectorizer is fitted on every row, so vocabulary/IDF
# statistics also come from rows that cross-validation on `X` later holds out.
# Consider fitting it inside a Pipeline per fold if an unbiased score matters.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
    ngram_range=(1, 1),
    stop_words='english',
)
X = vectorizer.fit_transform(df['text'])
y = df['target']

In [35]:
# Candidate classifiers for the baseline run, each with a fixed seed.
models = [
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42),
    XGBClassifier(random_state=42),
]

# One hyper-parameter grid per entry of `models`, in the same order. All grids
# are empty for now, so each search fits only the estimator's current settings.
params = [{}, {}, {}]

# 5-fold cross-validated fit per model; report the refit estimator and its
# mean CV score (GridSearchCV's default scoring for the estimator).
# NOTE(review): `X` was vectorized on the full dataset before this search, so
# the held-out folds already influenced the TF-IDF statistics.
for model, param in zip(models, params):
    gridcv = GridSearchCV(estimator=model, param_grid=param, cv=5, n_jobs=-1)
    gridcv.fit(X, y)
    print(gridcv.best_estimator_, gridcv.best_score_, sep='\n')

LogisticRegression(random_state=42)
0.7059055929967395
RandomForestClassifier(random_state=42)
0.6683381319979327
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
0.6457426771112758


### Data processing
- Let's begin by trying to handle locations. We will do this by simply looking at the most observed locations.

In [44]:
# Count how often each distinct tweet text occurs; repeated texts show up first.
# NOTE(review): the section intro talks about locations, but this inspects the
# `text` column — confirm which column is intended.
text_data = df.loc[:, 'text'].value_counts()
text_data

text
11-Year-Old Boy Charged With Manslaughter of Toddler: Report: An 11-year-old boy has been charged with manslaughter over the fatal sh...        10
He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam                      6
The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'                               6
#Bestnaijamade: 16yr old PKK suicide bomber who detonated bomb in ... http://t.co/KSAwlYuX02 bestnaijamade bestnaijamade bestnaijamade beÛ_     6
Madhya Pradesh Train Derailment: Village Youth Saved Many Lives                                                                                  5
                                                                                                                                                ..
Fotoset: elanorofrohan: 10th December 2013 Green Carpet in Zurich for the Swiss Premiere of The Desolation... htt