## Visualizing scikit-learn text classification weights
---

pip install lightning-python

conda install -c conda-forge sklearn-contrib-lightning

In [2]:
import scattertext as st
import spacy
from pprint import pprint
import en_core_web_sm

#CSV
import csv
from collections import Counter

#pandas
import pandas as pd

#Matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

#numpy
import numpy as np

# nltk
import nltk
# stopwords, FreqDist, word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

#regular expression
import re

#seaborn
import seaborn as sns

#import packages for scatter text
import scattertext as st
import spacy
from pprint import pprint
import en_core_web_sm

#SKlearn packages
import sklearn
from lightning.classification import CDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# feature engineering (words to vectors)
from sklearn.feature_extraction.text import TfidfVectorizer
# classification algorithms (or classifiers)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
# build a pipeline
from sklearn.pipeline import Pipeline
# model evaluation, validation
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV 
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
#pip install scikit-plot 
import scikitplot as skplt

## Text Classification Weights for Debate Data

#### First we need to build a text classification model. We will use techniques learned in class. Note we are going to use all of the speakers in this model not just Trump and Hillary. 

In [4]:
#read and format data
df5 = pd.read_csv("data/debate.csv", encoding = 'iso-8859-1')
del df5['Line']
del df5['Date']
df7 = df5[df5.Speaker=="Clinton"].copy()
df8 = df5[df5.Speaker=="Trump"].copy()
df5= df7.append(df8)

Labels are set so Trump = 1 and Clinton = 0

In [5]:
df5['Speaker'] = df5['Speaker'].str.replace('Trump', '0')
df5['Speaker'] = df5['Speaker'].str.replace('Clinton', '1')

In [6]:
#target_names is a list of the two categories which is a class-label list.
target_names = ['Trump', 'Clinton']

In [7]:
df5 = df5.rename(columns={'Speaker': 'Label', 'Text': 'Data'})

In [8]:
#create seperate dataframes for data and labels. 
sms = df5.copy()
sms_data = df5['Data'].copy()
sms_labels = df5['Label'].copy()

#use .values.tolist() to transform dataframe into list
sms = sms.values.tolist()
sms_data = sms_data.values.tolist()
sms_labels = sms_labels.values.tolist()

In [9]:
#Create training and testing sets
x_train, x_test, y_train, y_test = train_test_split(sms_data, sms_labels, test_size=0.2, random_state=0)
len(x_train), len(y_train), len(x_test), len(y_test)

(478, 478, 120, 120)

In [10]:
#y_train must be an array for this
import numpy as np
y_train1 = np.asarray(y_train)

In [11]:
#create vectors
vectorizer = TfidfVectorizer(decode_error ='ignore', stop_words='english')
tfidf_X = vectorizer.fit_transform(x_train)
count_vectorizer = CountVectorizer(vocabulary=vectorizer.vocabulary_, decode_error ='ignore')

In [12]:
corpus = st.CorpusFromScikit(X=count_vectorizer.fit_transform(x_train), 
                             y=y_train1, 
                             feature_vocabulary=vectorizer.vocabulary_,
                             category_names=target_names,
                             raw_texts=x_train).build()

In [13]:
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0 / tfidf_X.shape[0],
                   tol=1e-3)
clf.fit(tfidf_X, y_train1)
term_scores = clf.coef_[0]

In [14]:
html = st.produce_frequency_explorer(corpus, 'Clinton', scores=term_scores,
                                     use_term_significance=True,
                                     terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, term_scores, 4000),
                                     metadata = y_train1)
open("class.html", 'wb').write(html.encode('utf-8'))

  freq_mat[:, cat_i] = self._X[self._y == cat_i, :].sum(axis=0)


938913

In [16]:
from IPython.display import IFrame
from IPython.core.display import display, HTML
file_name = 'class.html'
file = open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1500, height=700)

## Example from Scattertext Git Hub

In [17]:
from lightning.classification import CDClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import scattertext as st

newsgroups_train = fetch_20newsgroups(
	subset='train',
	remove=('headers', 'footers', 'quotes')
)

In [22]:
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)
count_vectorizer = CountVectorizer(vocabulary=vectorizer.vocabulary_)

In [23]:
corpus = st.CorpusFromScikit(
	X=count_vectorizer.fit_transform(newsgroups_train.data),
	y=newsgroups_train.target,
	feature_vocabulary=vectorizer.vocabulary_,
	category_names=newsgroups_train.target_names,
	raw_texts=newsgroups_train.data
).build()

In [24]:
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0 / tfidf_X.shape[0],
                   tol=1e-3)
clf.fit(tfidf_X, newsgroups_train.target)
term_scores = clf.coef_[0]

In [25]:
html = st.produce_frequency_explorer(
	corpus,
	'alt.atheism',
	scores=term_scores,
	use_term_significance=False,
	terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, term_scores, 4000),
	metadata = ['/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames]
)

open("class.html", 'wb').write(html.encode('utf-8'))

16631073

In [26]:
from IPython.display import IFrame
from IPython.core.display import display, HTML
file_name = 'class.html'
file = open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1500, height=700)