In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn import feature_extraction, linear_model, preprocessing
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
import spacy
import sys
sys.path.insert(1, '../src')

from nlp_helpers import generate_ngrams

SEED=42

%load_ext autoreload
%autoreload 2

In [2]:
df_train = pd.read_csv('../data/train_clean.csv')
df_test = pd.read_csv('../data/test_clean.csv')

## Simple count vectorizer

In [15]:
count_vectorizer = feature_extraction.text.CountVectorizer()
train_vector = count_vectorizer.fit_transform(df_train['text'])
test_vector = count_vectorizer.transform(df_test['text'])

In [17]:
train_vector.shape

(7552, 17034)

In [19]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

clf = linear_model.LogisticRegression()
scores = cross_val_score(clf, train_vector, df_train['target'], cv=cv, scoring='f1')
scores

array([0.75562219, 0.75310482, 0.75036928])

In [20]:
sample_submission = pd.read_csv('../reports/sample_submission.csv')

In [29]:
train_vector, val_vector, train_target, val_target = train_test_split(train_vector, df_train['target'], test_size=0.1, shuffle=True, stratify=df_train['target'])

In [32]:
mdl = clf.fit(train_vector, train_target)

In [33]:
mdl.score(train_vector, train_target)

0.9624779281930548

In [34]:
mdl.score(val_vector, val_target)

0.791005291005291

In [24]:
sample_submission['target'] = clf.predict(test_vector)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
sample_submission.to_csv('../reports/cnt_vectorized_submission.csv', index=False)

## Count vectorizer with ngrams

In [2]:
df_train = pd.read_csv('../data/train_clean.csv')
df_test = pd.read_csv('../data/test_clean.csv')

In [3]:
STOPWORDS = stopwords.words('english')

In [7]:
def clean_text(text):
    tokens = word_tokenize(text.lower())
    clean_tokens = [token for token in tokens if token not in STOPWORDS and token.isalnum()]
    return ' '.join(clean_tokens)

df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

df_train.to_csv('../data/train_preprocessed.csv', index=False)
df_test.to_csv('../data/test_preprocessed.csv', index=False)

In [8]:
df_train[df_train.text.isna()]

Unnamed: 0,id,keyword,location,text,target,word_count,unique_word_count,mean_word_length,char_count,punctuation_count


In [6]:
txt = '@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.'

generate_ngrams(txt, 2, remove_stopwords=False)


['thetawniest_the',
 'the_out',
 'out_of',
 'of_control',
 'control_wild',
 'wild_fires',
 'fires_in',
 'in_california',
 'california_even',
 'even_in',
 'in_the',
 'the_northern',
 'northern_part',
 'part_of',
 'of_the',
 'the_state',
 'state_very',
 'very_troubling']

In [7]:
#df_train['text'] = df_train['text'].map(lambda x: x + ' '.join(generate_ngrams(x, n=3)))
#df_test['text'] = df_test['text'].map(lambda x: x + ' '.join(generate_ngrams(x, n=3)))

In [27]:
lemma = WordNetLemmatizer()

def LemmaTokenizer(text):
    return [lemma.lemmatize(token) for token in word_tokenize(text)]

count_vectorizer = feature_extraction.text.CountVectorizer(stop_words=STOPWORDS, ngram_range=(1,3))
train_vector = count_vectorizer.fit_transform(df_train['text'])
test_vector = count_vectorizer.transform(df_test['text'])
train_vector.shape

(7552, 110284)

In [28]:
train_vector, val_vector, train_target, val_target = train_test_split(train_vector, df_train['target'], test_size=0.1, shuffle=True, stratify=df_train['target'])

In [29]:
clf = linear_model.LogisticRegression()
mdl = clf.fit(train_vector, train_target)

In [30]:
mdl.score(train_vector, train_target)

0.9883755150088287

In [31]:
mdl.score(val_vector, val_target)

0.8148148148148148

In [20]:
sample_submission = pd.read_csv('../reports/sample_submission.csv')
sample_submission['target'] = clf.predict(test_vector)
sample_submission.to_csv('../reports/ngram_cnt_vectorized_submission.csv', index=False)

## TFIDF

In [60]:
df_train = pd.read_csv('../data/train_clean.csv')
df_test = pd.read_csv('../data/test_clean.csv')

vectorizer = feature_extraction.text.TfidfVectorizer(stop_words=STOPWORDS, ngram_range=(1,1))
train_vector = vectorizer.fit_transform(df_train['text'])
test_vector = vectorizer.transform(df_test['text'])
train_vector.shape

(7552, 16895)

In [61]:
train_vector, val_vector, train_target, val_target = train_test_split(train_vector, df_train['target'], test_size=0.1, shuffle=True, stratify=df_train['target'])

In [62]:
clf = linear_model.LogisticRegression()
mdl = clf.fit(train_vector, train_target)

print('Train score {}'.format(mdl.score(train_vector, train_target)))
print('Test score {}'.format(mdl.score(val_vector, val_target)))

Train score 0.8897881106533255
Test score 0.8108465608465608


In [63]:
sample_submission = pd.read_csv('../reports/sample_submission.csv')
sample_submission['target'] = clf.predict(test_vector)
sample_submission.to_csv('../reports/tfidf_submission.csv', index=False)

## Spacy

In [35]:
nlp = spacy.load("en_core_web_sm")

In [36]:
df_train = pd.read_csv('../data/train_clean.csv')
df_test = pd.read_csv('../data/test_clean.csv')

In [42]:
df_train['text'] = list(nlp.pipe(df_train['text'], n_process=-1))

In [48]:
idx = 42
print([n.pos_ for n in df_train.at[idx,'text']])
print([n.lemma_ for n in df_train.at[idx,'text']])
print([n.text for n in df_train.at[idx,'text']])

['VERB', 'DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'PROPN', 'NOUN', 'NOUN', 'DET', 'NOUN', 'NOUN', 'CCONJ', 'ADJ', 'PUNCT', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'ADP', 'VERB', 'NOUN', 'ADP', 'PRON', 'PUNCT', 'PUNCT']
['have', 'an', 'awesome', 'time', 'visit', 'the', 'CFC', 'head', 'office', 'the', 'ancop', 'site', 'and', 'ablaze', '.', 'thank', 'to', 'Tita', 'Vida', 'for', 'take', 'care', 'of', 'we', '?', '?']
['Had', 'an', 'awesome', 'time', 'visiting', 'the', 'CFC', 'head', 'office', 'the', 'ancop', 'site', 'and', 'ablaze', '.', 'Thanks', 'to', 'Tita', 'Vida', 'for', 'taking', 'care', 'of', 'us', '?', '?']
