In [1]:
from collections import Counter

import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB

import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer() 
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv("/content/drive/My Drive/train.csv")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def preprocess_text(tokenizer, lemmatizer, stop_words, punctuation, text): 
    tokens = tokenizer(text.lower())
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return [token for token in lemmas if token not in stop_words and token not in punctuation and len(token) > 4 and len(token) < 20]

df['cleaned'] = df.text.apply(lambda x: preprocess_text(word_tokenize, lemmatizer, stop_words, punctuation, x))

In [4]:
def flat_nested(nested):
    flatten = []
    for item in nested:
        if isinstance(item, list):
            flatten.extend(item)
        else:
            flatten.append(item)
    return flatten

cnt_vocab = Counter(flat_nested(df.cleaned.tolist()))

print("Vocab size before filtering: {}".format(len(cnt_vocab)))

threshold_count_l = 1
threshold_count_h = 10000
threshold_len = 1

cleaned_vocab = [token for token, count in cnt_vocab.items() if 
                     threshold_count_h > count > threshold_count_l and len(token) > threshold_len
                ]
print("Vocab size after filtering: {}".format(len(cleaned_vocab)))
cleaned_vocab.append(" ")
cleaned_vocab = set(cleaned_vocab)

def filter_noise_tokens(df, cleaned_vocab): 
    df['filtered_tokens'] = df.cleaned.apply(lambda x: [tok for tok in x if tok in cleaned_vocab])
    return df
df = filter_noise_tokens(df, cleaned_vocab)
df_filtered = df[df.astype(str)['filtered_tokens'] != '[]']

Vocab size before filtering: 3143
Vocab size after filtering: 1021


In [5]:
df_filtered.head()

Unnamed: 0,example_id,text,label,cleaned,filtered_tokens
0,140d03eabb7cb5c2558605eb8336689c,brandpost best of both worlds hybrid onsite an...,0,"[brandpost, world, hybrid, onsite, cloudbased,...","[world, protection]"
1,f7f1f906c9e2b76e63020f8794516185,$mention$ they shall in all cases except treas...,0,"[mention, shall, except, treason, felony, brea...","[mention, treason, breach, peace, arrest]"
2,39f0b2ebc12e008a7a43ec318d0c3874,lifelock offers to protect you from the equifa...,0,"[lifelock, offer, protect, equifax, breach, se...","[lifelock, offer, protect, equifax, breach, se..."
3,ad4e57c69f00548253cb6d47b15c3ce4,skimmer adware spent two months in google play...,1,"[skimmer, adware, spent, month, google, checkp...","[skimmer, adware, month, google, researcher, m..."
5,f9f0419dd6ec37b9f72a8a8292a37d0b,cyber attack on barts nhs trust eloited zeroda...,1,"[cyber, attack, trust, eloited, zeroday, vulne...","[cyber, attack, trust, eloited, zeroday, vulne..."


In [0]:
df_train, df_test = train_test_split(df_filtered, test_size=0.3, random_state = 10)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(vocabulary=cleaned_vocab,ngram_range=(1,3))
x_train = vectorizer.fit_transform(df_train.filtered_tokens.str.join(' '))
x_test = vectorizer.fit_transform(df_test.filtered_tokens.str.join(' '))
y_train = df_train['label']
y_test = df_test['label']

In [0]:
classifier = MultinomialNB()
classifier.fit(x_train.toarray(),y_train)

predictions = classifier.predict(x_test.toarray())

In [9]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       224
           1       0.73      0.64      0.69       132

    accuracy                           0.78       356
   macro avg       0.77      0.75      0.76       356
weighted avg       0.78      0.78      0.78       356



In [0]:
test = pd.read_csv("/content/drive/My Drive/test.csv")
test['cleaned'] = test.text.apply(lambda x: preprocess_text(word_tokenize, lemmatizer, stop_words, punctuation, x))
test = filter_noise_tokens(test, cleaned_vocab)
x_test = vectorizer.fit_transform(test.filtered_tokens.str.join(' '))
predictions = classifier.predict(x_test.toarray())

In [0]:
test['label'] = list(map(bool,predictions))
subm = test.drop(['text','cleaned', 'filtered_tokens'],axis=1)
subm.to_csv("/content/drive/My Drive/subm.csv",index=False)