In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import regex as re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
import string
from sklearn.naive_bayes import MultinomialNB

In [3]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
#Make vocabulary
def get_vocab(train,test):
    vocab = set(' '.join(test['text'].array.to_numpy()).split()).union(set(' '.join(train['text'].array.to_numpy()).split()))
    #Take out anything that doesn't have at least one letter or dash, or any links
    removed = {x for x in vocab if (not bool(re.match(r'^[a-zA-Z-]+[0-9]*$',x))) or x.startswith('http')}
    filtered = vocab - removed
    filtered = {f.strip(string.punctuation) for f in filtered}
    print(f'vocab size before filtering: {len(vocab)} \n vocab size after: {len(filtered)}')
    return filtered

In [6]:
v = get_vocab(df_train,df_test)

vocab size before filtering: 41747 
 vocab size after: 18557


In [7]:
X = df_train['text'].array.to_numpy()
X.shape

(7613,)

In [8]:
Y = df_train['target'].array.to_numpy()
Y.shape

(7613,)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=47)
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((5709,), (1904,), (5709,), (1904,))

In [10]:
# Pipeline with linear model
stop = set(stopwords.words('english')).union(set(string.punctuation))
classifier_linear = Pipeline([
    ('vect', CountVectorizer(vocabulary=v, stop_words=stop)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log',random_state=47)),
])

In [11]:
classifier_linear.fit(X_train,y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words={'t', 'havi...dom_state=47, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [12]:
#Test
predicted = classifier_linear.predict(X_test)

In [13]:
np.mean(predicted == y_test)

0.8067226890756303

# Cleaning had almost no effect on accuracy (still ~80%). Now to use a different model:

In [14]:
stop = set(stopwords.words('english')).union(set(string.punctuation))
classifier_two = Pipeline([
    ('vect', CountVectorizer(vocabulary=v, stop_words=stop)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
classifier_two.fit(X_train,y_train)
predicted = classifier_two.predict(X_test)
np.mean(predicted == y_test)

0.7993697478991597

Still not that good, now to use clustering(SVM)

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)