# Text analysis examples
In this practicum, we learn to manipulate textual data and perform simple natural language processing tasks including text preprocessing, topic modeling and classification.

## Load data

In [2]:
import pandas as pds
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
!pip install transformers
from transformers import BertTokenizer
from transformers import DistilBertTokenizer, DistilBertModel
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from sklearn.linear_model import LogisticRegression
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
import sklearn
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as text
import torch
from nltk.tree import Tree

import spacy
from tqdm import tqdm

import sklearn.naive_bayes as nb
# Fetch the WordNet corpus; the WordNetLemmatizer used in the NLTK section relies on it.
nltk.download('wordnet')



# Notebook shell escape: download the tweet CSV into the current working directory.
!wget https://raw.githubusercontent.com/msoley/DSCI549/master/In-class%20exercises/Practicum4/tweet_global_warming.csv
# Local path of the downloaded file; read with pandas in the preprocessing section.
dataset= './tweet_global_warming.csv'
#source https://www.figure-eight.com/data-for-everyone/




ImportError: ignored

### NLTK
In this section we tokenize and parse a sentence.

In [None]:
# Regex tokenizer: every maximal run of word characters (\w+) becomes one token.
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Sample sentence; it is reused by the spaCy cell further down.
example_text = "Penguins are flightless birds."
# Last expression of the cell: displays the token list.
tokenizer.tokenize(example_text)

In [None]:

lemmatizer = WordNetLemmatizer()

# pos='v' asks for the lemma of 'brought' treated as a verb.
lemmatizer.lemmatize('brought',pos='v')

In [None]:
# draw parse tree
# The parse is not computed here: it is read from a hand-written bracketed string.
tree_str = "(ROOT (S (NP (NNS Penguins)) (VP (VBP are) (NP (JJ flightless) (NNS birds))) (. .)))"
this_tree = Tree.fromstring(tree_str)
# Render the tree as text in the cell output.
this_tree.pretty_print()

### Spacy
In this section we use spaCy, a popular natural language processing toolbox, to tokenize and tag the same sentence. We then look at the stop words.

In [None]:
# Load the small English pipeline and analyse the example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp(example_text)
# Echo the sentence followed by one blank line.
print(example_text, end="\n\n")
# One row per token: text, lemma, part-of-speech tag, stop-word flag.
for token in doc:
    row = (token.text, token.lemma_, token.pos_, token.is_stop)
    print(*row)

In [None]:
# Load the English pipeline again under its own name to inspect its defaults.
en_model = spacy.load('en_core_web_sm')
#  list of default stop words in spaCy english model
stopwords = en_model.Defaults.stop_words
# Last expression of the cell: displays the whole stop-word collection.
stopwords

## Embeddings
In this section we look at word embeddings and what we can do with them to represent language.


In [None]:
# Display the 'models' entry of the gensim downloader catalogue.
api.info()['models']

In [None]:
# download the model
# "glove-twitter-25" is the catalogue name passed to the gensim downloader;
# presumably 25-dimensional GloVe vectors trained on tweets — confirm in api.info().
model_glove_twitter = api.load("glove-twitter-25")

In [None]:
# Look up the embedding vector stored for the word 'cat'.
model_glove_twitter['cat']

In [None]:
# The ten vocabulary entries closest to 'cat' in the embedding space.
model_glove_twitter.most_similar("cat",topn=10)


In [None]:
# Ask the model which of the three words fits least with the others.
model_glove_twitter.doesnt_match(["dog","cat","apple"])



```
# This is formatted as code
```

## Data loading and preprocessing steps

In [None]:
# Read the labelled tweets as ISO-8859-1; malformed rows produce a warning
# instead of aborting the load.
data = pds.read_csv(filepath_or_buffer=dataset, encoding="ISO-8859-1", on_bad_lines='warn')
tweets = data['tweet'].tolist()

# replace links
# Match both http and https, and stop at the first whitespace so that any text
# following the URL is kept (the previous pattern erased the rest of the tweet).
tweet_normed = [re.sub(r"https?://\S+", "[link]", x) for x in tweets]
print(tweets[1091])
print(tweet_normed[1091])

# replace twitter handles
# Require at least one handle character and no trailing space, so handles at the
# end of a tweet or followed by punctuation are replaced, and a bare "@" is not.
tweet_normed = [re.sub(r'@[A-Za-z0-9_-]+', '[twitter_handle]', x) for x in tweet_normed]
print(tweets[31])
print(tweet_normed[31])

# remove extra spaces
# Collapse every run of whitespace (including newlines and tabs) into one space.
tweet_normed = [re.sub(r"\s+", ' ', x) for x in tweet_normed]
print(tweets[31])
print(tweet_normed[31])

In [None]:
# Analyse one normalised tweet with the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")
doc = nlp(tweet_normed[31])
# Echo the tweet followed by one blank line.
print(tweet_normed[31], end="\n\n")
# One row per token: text, lemma, part-of-speech tag, stop-word flag.
for token in doc:
    row = (token.text, token.lemma_, token.pos_, token.is_stop)
    print(*row)

In [None]:
# Same tweet again, this time showing each token's dependency label.
doc = nlp(tweet_normed[31])
print(tweet_normed[31], end="\n\n")
for token in doc:
    pair = (token.text, token.dep_)
    print(*pair)
# Trailing blank line after the listing.
print()

In [None]:
# Named entities found in a second tweet, with their character offsets and labels.
doc = nlp(tweet_normed[1091])
print(tweet_normed[1091], end="\n\n")
for ent in doc.ents:
    span_info = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*span_info)

In [None]:
print(tweet_normed[31])
# For comparison with spaCy: the \w+ regex tokenizer created in the NLTK section.
tokenizer.tokenize(tweet_normed[31])

In [None]:
# Repeat the entity extraction on the lower-cased tweet to compare the results.
lowered_tweet = tweet_normed[1091].lower()
doc = nlp(lowered_tweet)
print(lowered_tweet, end="\n\n")
for ent in doc.ents:
    span_info = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*span_info)

*Not perfect, consider using TweetNLP!*
http://www.cs.cmu.edu/~ark/TweetNLP/

In [None]:
# Load the pretrained tokenizer that belongs to the 'bert-base-uncased' checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
# Alternative input kept for experimentation: a real tweet instead of the toy sentence.
#this_tweet = tweets[1061]
this_tweet = "penguins are flightless birds."
print(this_tweet)
# Split the sentence into the tokenizer's vocabulary pieces and print one per line.
tokenized = bert_tokenizer.tokenize(this_tweet)
for token in tokenized:
    print(token)

In [None]:
# Run the same sentence through spaCy to compare with the BERT tokenizer output.
doc = nlp(this_tweet)
print(this_tweet, end="\n\n")
# One row per token: text, lemma, part-of-speech tag, stop-word flag.
for token in doc:
    row = (token.text, token.lemma_, token.pos_, token.is_stop)
    print(*row)

## Topic Modeling

In [None]:
# Parse every raw tweet with spaCy. nlp.pipe processes the texts in batches,
# which yields the same Doc objects as calling nlp(tweet) one at a time but
# avoids the per-call overhead on a corpus of this size.
corpus = list(nlp.pipe(tweets))
# Keep only the surface form of each token: one list of strings per tweet.
tokens = [[token.text for token in doc] for doc in corpus]

In [None]:
# Map each distinct token to an integer id, then encode every tweet as a
# bag of words: a list of (token id, count) pairs.
dct = Dictionary(tokens)
docs = [dct.doc2bow(line) for line in tokens]
# LDA training is stochastic; fixing random_state makes the learned topics
# reproducible when the notebook is re-run.
lda = LdaModel(docs, num_topics=15, id2word=dct, passes=10, random_state=0)

In [None]:
# Print the ten highest-weighted words of every topic. The topic count is read
# from the fitted model so it cannot drift from the value used for training.
for topic in range(lda.num_topics):
    # get_topic_terms yields (word id, weight) pairs; only the id is needed here.
    for word_id, _weight in lda.get_topic_terms(topic, 10):
        print(topic, dct[word_id])
    print()

In [None]:
# Inspect one tweet: its raw text, its bag-of-words encoding, and the
# result of indexing the fitted LDA model with that encoding.
idx = 30
print(tweets[idx])
doc_bow = docs[idx]
print(doc_bow)
doc_lda = lda[doc_bow]
print(doc_lda)

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
from transformers import BertModel
# Load the pretrained encoder matching the 'bert-base-uncased' tokenizer used above.
bert = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Encode the sentence as PyTorch tensors (return_tensors="pt").
inputs = bert_tokenizer("penguins are flightless birds.", return_tensors="pt")
# Inference only: disable autograd so no computation graph is kept alive for
# the activations.
with torch.no_grad():
    outputs = bert(**inputs)

In [None]:
# Display the hidden states produced by the model's final layer.
outputs.last_hidden_state

In [None]:
# Dimensions of that tensor — presumably (batch, tokens, hidden size); confirm against the output.
outputs.last_hidden_state.shape

In [None]:
# Display what the tokenizer returned; its 'input_ids' entry is decoded in the next cell.
inputs

In [None]:
# Convert each row of token ids back into its token strings.
for x in inputs['input_ids']:
    print(bert_tokenizer.convert_ids_to_tokens(x))

In [None]:
#help(bert_tokenizer)

## Small ML example


In [None]:
# Attach the normalised text to the frame so it can be filtered by label.
data['tweet_normed'] = tweet_normed

# The 'existence' column spells each answer two ways; accept both spellings.
is_yes = data['existence'].isin(["Yes", "Y"])
is_no = data['existence'].isin(["No", "N"])
yes_tweets = data[is_yes]['tweet_normed'].tolist()
no_tweets = data[is_no]['tweet_normed'].tolist()
# Last expression of the cell: class sizes.
len(yes_tweets),len(no_tweets)

In [None]:
# we encode climate deniers as 1
import numpy as np
def shuffle_lists(a,b):
    """Shuffle two equal-length sequences with one shared random permutation.

    Element ``i`` of both returned lists comes from the same original
    position, so paired items (for example a sample and its label) stay
    aligned after shuffling.

    Parameters
    ----------
    a, b : sequences supporting ``len`` and integer indexing.

    Returns
    -------
    tuple
        ``(a_shuffled, b_shuffled)`` as two new lists; the inputs are not
        modified.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    if len(a)!=len(b):
        # ValueError subclasses Exception, so existing `except Exception`
        # handlers keep working.
        raise ValueError("Sorry, list should be of the same length")
    # The permutation is drawn from NumPy's global RNG; call np.random.seed
    # beforehand for a repeatable order.
    order = list(range(len(a)))
    np.random.shuffle(order)
    return [a[i] for i in order], [b[i] for i in order]
# Number of leading tweets of each class used for training; whatever remains
# in each class is held out for testing.
n_train_yes = 2500
n_train_no = 950

yes_train, yes_test = yes_tweets[:n_train_yes], yes_tweets[n_train_yes:]
no_train, no_test = no_tweets[:n_train_no], no_tweets[n_train_no:]

# Labels: 0 for tweets from yes_tweets, 1 for tweets from no_tweets. The label
# counts are derived from the slices themselves, so they always match the
# number of texts even if the dataset size changes.
train_set = yes_train + no_train
y_train = [0] * len(yes_train) + [1] * len(no_train)

train_set, y_train = shuffle_lists(train_set, y_train)

test_set = yes_test + no_test
y_test = [0] * len(yes_test) + [1] * len(no_test)

test_set, y_test = shuffle_lists(test_set, y_test)


In [None]:


# Fit the TF-IDF vocabulary and weights on the training tweets only, then
# project the test tweets into that same feature space.
tf = text.TfidfVectorizer()
x_train_tfidf = tf.fit_transform(train_set)
x_test_tfidf = tf.transform(test_set)
# Report the matrix dimensions of both splits.
for tfidf_matrix in (x_train_tfidf, x_test_tfidf):
    print(tfidf_matrix.shape)



In [None]:
#estimate sparsity
# Share of stored (non-zero) entries among all cells of the training matrix, in percent.
n_rows, n_cols = x_train_tfidf.shape
p = 100 * x_train_tfidf.nnz / float(n_rows * n_cols)
print(f"Each sample has ~{p:.2f}% non-zero features.")

In [None]:
#bernouli naive bayes
# Candidate smoothing strengths: 50 values log-spaced between 1e-2 and 1e2.
alpha_grid = {'alpha': np.logspace(-2., 2., 50)}
bnb = ms.GridSearchCV(nb.BernoulliNB(), param_grid=alpha_grid)
# Cross-validated search on the training split, then score the selected model
# on the held-out split.
bnb.fit(x_train_tfidf, y_train)
bnb.score(x_test_tfidf, y_test)

In [None]:
#Logistic regression
# Search over the regularisation strength C and the penalty type
# (l1 = lasso, l2 = ridge).
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
lrg = ms.GridSearchCV(
    # liblinear supports both penalties in the grid; the default lbfgs solver
    # rejects "l1", so every l1 candidate would fail to fit.
    LogisticRegression(solver="liblinear"),
    param_grid=grid,scoring='f1_micro')
# NOTE(review): x_train / x_test are not defined in the visible part of this
# notebook. The last cell feeds BERT features to lrg, so they are presumably
# BERT feature matrices built in a cell that is missing here — confirm.
lrg.fit(x_train, y_train)
lrg.score(x_test, y_test)

In [None]:
#let's test on a made up sample

# Alternative inputs kept for experimentation; uncomment one to try it.
#test_text = 'The whole climate crisis is not only Fake News, it’s Fake Science. There is no climate crisis, there’s weather and climate all around the world, and in fact carbon dioxide is the main building block of all life. tweeter_hanldle'
#test_text = "Global warming is endangering our livelihoods."
#test_text = "The whole country is freezing; this must be due to global warming."
test_text = "I don't believe in climate change"
print(test_text)
# predict() returns one label per input row; take element 0 explicitly because
# calling int() on an array with one or more dimensions is deprecated in NumPy.
tfidf_label = int(bnb.predict(tf.transform([test_text]))[0])
print("The output lablel with tf-idf is (0: beleiver 1: denier) {:d}".format(tfidf_label))
# NOTE(review): extract_bert is not defined in the visible part of this
# notebook; it presumably returns a torch tensor of BERT features — confirm
# the expected shape before relying on the expand_dims call.
bert_feat = np.expand_dims(extract_bert([test_text]).cpu().detach().numpy(),axis=0)
bert_label = int(lrg.predict(bert_feat)[0])
print("The output lablel with distill bert is (0: beleiver 1: denier) {:d}".format(bert_label))

0: believer, 1: denier