<a href="https://colab.research.google.com/github/mbeck33/dsportfolio/blob/master/SOTUS2HardTryNewsGroups_ClassExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample NLP Code for IST 718



In [None]:
# A quick sample for using Universal Language Model Fine-Tuning for Text Classification
# aka ULMFiT
# Check out nlp.fast.ai for more

# import libraries
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

In [None]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

In [None]:
# OBTAIN

from sklearn.datasets import fetch_20newsgroups

# Download the 20 Newsgroups corpus, shuffled with a fixed seed and with
# headers, footers and quoted replies stripped from every post.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

# Keep a direct handle on the raw post bodies.
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# OBTAIN

# Two-column frame: the newsgroup id goes in 'label', the post body in 'text'.
df = pd.DataFrame(
    {
        'label': dataset.target,
        'text': dataset.data,
    }
)

In [None]:
# Sanity check: (rows, columns) of the freshly built frame.
df.shape

(11314, 2)

Let's look at just the Politics (`talk.politics.misc`, label 18) and Windows (`comp.windows.x`, label 5) articles. Dataset source: https://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.data.html


In [None]:
# SCRUB

# Keep only the two newsgroups of interest and renumber the rows from 0.
keep_labels = [5, 18]
df = df[df['label'].isin(keep_labels)].reset_index(drop=True)

In [None]:
# Class balance check: number of documents per remaining label.
df['label'].value_counts()


5     593
18    465
Name: label, dtype: int64

In [None]:
# SCRUB - TEXT

# Replace every character that is not an ASCII letter with a space.
# `regex=True` is passed explicitly: pandas >= 2.0 defaults
# `Series.str.replace` to literal matching, which would look for the literal
# text "[^a-zA-Z]" and silently leave the documents unchanged.
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ", regex=True)


In [None]:
# SCRUB - STOPWORDS
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then read its English word list.
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# SCRUB - TOKEN

# tokenization: split each document on whitespace
tokenized_doc = df['text'].str.split()

# remove stop-words
# The NLTK English stop-word list is lowercase, so each token is compared in
# lowercase; otherwise capitalised stop-words ("The", "We", "In") survive the
# filter. Kept tokens retain their original casing.
# A set gives constant-time membership checks instead of scanning the list.
stop_set = set(stop_words)
tokenized_doc = tokenized_doc.apply(
    lambda tokens: [tok for tok in tokens if tok.lower() not in stop_set]
)

# de-tokenization: re-join each document's tokens with single spaces.
# Joining row by row avoids the label-based `tokenized_doc[i]` lookup, which
# is only correct when the index is exactly 0..n-1.
detokenized_doc = tokenized_doc.apply(' '.join).tolist()

df['text'] = detokenized_doc

In [None]:
# SCRUB - TEST & TRAIN

from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for validation. The split is stratified on the
# label column and uses a fixed seed so it is repeatable.
df_trn, df_val = train_test_split(
    df,
    test_size=0.4,
    stratify=df['label'],
    random_state=12,
)

In [None]:
# SCRUB - CHECK SPLIT

# Show the (rows, columns) shape of the training and validation frames.
df_trn.shape, df_val.shape

((634, 2), (424, 2))

In [None]:
# MODEL

# Data bunch for fine-tuning the language model.
data_lm = TextLMDataBunch.from_df(path="", train_df=df_trn, valid_df=df_val)

# Data bunch for the classifier; it reuses the language model's training
# vocabulary and uses a batch size of 32.
data_clas = TextClasDataBunch.from_df(
    path="",
    train_df=df_trn,
    valid_df=df_val,
    vocab=data_lm.train_ds.vocab,
    bs=32,
)

In [None]:
# MODEL - TUNE PRE-TRAINED MODEL

# Language-model learner: AWD-LSTM architecture, starting from pretrained
# weights, with the dropout multiplier set to 0.7.
learn = language_model_learner(
    data_lm,
    AWD_LSTM,
    pretrained=True,
    drop_mult=0.7,
)

Downloading https://s3.amazonaws.com/fast-ai-modelzoo/wt103-fwd.tgz


In [None]:
# MODEL - LEARNING RATE

# Train the language-model learner for one cycle, passing 1e-2 as the learning rate.
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,6.853075,5.499538,0.200233,00:05


In [None]:
# MODEL - SAVE

# Save the fine-tuned language model's encoder under the name 'ft_enc' so the
# classifier can load it later.
learn.save_encoder('ft_enc')

In [None]:
# MODEL - BUILD CLASSIFIER

# Classifier learner on the same architecture; `learn` is rebound from the
# language-model learner to the classifier from here on.
learn = text_classifier_learner(
    data_clas,
    AWD_LSTM,
    drop_mult=0.7,
)

# Load the encoder that was saved under the name 'ft_enc'.
learn.load_encoder('ft_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (634 items)
x: TextList
y: CategoryList
5,18,5,5,18
Path: .;

Valid: LabelList (424 items)
x: TextList
xxbos xxmaj there need include xxup vat previous levels xxup vat difference tax xxup vat x amount tax government receives levels x xxunk price end consumer xxmaj at xxunk levels difference xxup vat paid xxup vat received xxunk government a xxup vat xxunk preferable xxunk xxunk tax,xxbos xxmaj we might better former xxunk done nothing,xxbos xxmaj you xxunk communists i think xxmaj in practice communism solved problem killing anybody productive therefore raises xxunk questions rest group bunch xxunk xxmaj the mass xxunk xxunk xxup xxunk good instance a poor second best neighboring capitalist country people politically incorrect skill xxunk xxunk i often wonder xxmaj xxunk would done xxmaj xxunk presently xxmaj xxunk would forced remain xxmaj xxunk xxmaj would xxunk killed killed xxmaj best build wall locking citizens country load heav

In [None]:
# MODEL - FIT

# Train the classifier for one cycle, passing 1e-2 as the learning rate.
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.335721,0.25358,0.889151,00:08


In [None]:
# MODEL - EVALUATE

# Per-class scores and true class indices. With no arguments this presumably
# evaluates the validation set (the counts below sum to its size) — verify.
preds, targets = learn.get_preds()

# Predicted class = index of the highest score in each row. The tensor's own
# argmax is used rather than routing a torch tensor through np.argmax.
predictions = preds.argmax(dim=1)

# Confusion matrix with named axes so rows and columns are unambiguous.
# NOTE(review): the class indices 0/1 presumably map to the original labels
# 5/18 in sorted order — confirm against data_clas.classes.
pd.crosstab(
    predictions.numpy(),
    targets.numpy(),
    rownames=['predicted'],
    colnames=['actual'],
)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,234,43
1,4,143


tensor([1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
        1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
        1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
        0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
        0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
        0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,