In [1]:
import pandas as pd
import numpy as np
from nltk.stem import *
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
import sklearn.feature_extraction.text as text
from sklearn import decomposition, tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit

import re

stemmer=PorterStemmer()
stop_words = stopwords.words('english')

news = pd.read_csv('C:/Programming/Python Scripts/Text Analytics/Email News/Data/news.csv')
news.head()

Unnamed: 0,TEXT,graphics,hockey,medical,newsgroup,key
0,I have a few reprints left of chapters from my...,1,0,0,graphics,1
1,"gnuplot, etc. make it easy to plot real valued...",1,0,0,graphics,2
2,Article-I.D.: snoopy.1pqlhnINN8k1 References: ...,1,0,0,graphics,3
3,"Hello, I am looking to add voice input capabil...",1,0,0,graphics,4
4,I recently got a file describing a library of ...,1,0,0,graphics,5


### See first 5 reviews complete

In [2]:
for i in range(5):
    print(news['TEXT'][i], "\n")

I have a few reprints left of chapters from my book " Visions of the Future" . These include reprints of 3 chapters probably of interest to readers of this forum, including: 1. Current Techniques and Development of Computer Art, by Franz Szabo 2. Forging a Career as a Sculptor from a Career as Computer Programmer, by Stewart Dickson 3. Fractals and Genetics in the Future by H. Joel Jeffrey I'd be happy to send out free reprints to researchers for scholarly purposes, until the reprints run out. Just send me your name and address. Thanks, Cliff cliff@watson.ibm.com 

gnuplot, etc. make it easy to plot real valued functions of 2 variables but I want to plot functions whose values are 2-vectors. I have been doing this by plotting arrays of arrows (complete with arrowheads) but before going further, I thought I would ask whether someone has already done the work. Any pointers?? thanx in advance Tom Weston | USENET: weston@ucssun1.sdsu.edu Department of Philosophy | (619) 594-6218 (office) S

### Prepare Data For Analysis
Strip Punctuation, Capitalization, and Stemming

In [3]:
news['TEXT']=[stemmer.stem(re.sub(r'[^\w\s\@]','',text).lower()) for text in news['TEXT']]
for i in range(3):
    print(news['TEXT'][i], "\n")

i have a few reprints left of chapters from my book  visions of the future  these include reprints of 3 chapters probably of interest to readers of this forum including 1 current techniques and development of computer art by franz szabo 2 forging a career as a sculptor from a career as computer programmer by stewart dickson 3 fractals and genetics in the future by h joel jeffrey id be happy to send out free reprints to researchers for scholarly purposes until the reprints run out just send me your name and address thanks cliff cliff@watsonibmcom 

gnuplot etc make it easy to plot real valued functions of 2 variables but i want to plot functions whose values are 2vectors i have been doing this by plotting arrays of arrows complete with arrowheads but before going further i thought i would ask whether someone has already done the work any pointers thanx in advance tom weston  usenet weston@ucssun1sdsuedu department of philosophy  619 5946218 office san diego state univ  619 5757477 home 

### Generate Document-Term Matrix

In [4]:
#ignore words with frequency of less than 15
vectorizer = text.CountVectorizer(input='files', stop_words='english', min_df=15)
dtm = vectorizer.fit_transform(news['TEXT']).toarray()
vocab = np.array(vectorizer.get_feature_names())

#1113 documents, 362 words with counts per document
dtm.shape

(598, 396)

### Perform topic modeling

In [5]:
n_topics=10
n_top_words=5
clf = decomposition.NMF(n_components=n_topics, random_state=12345)
doctopic = clf.fit_transform(dtm) #New Variables

### See top words for each topic

In [6]:
topic_words=[]
for topic in clf.components_:
    word_idx = np.argsort(topic)[::-1][0:n_top_words]
    topic_words.append([vocab[i] for i in word_idx])
topic_words

[['period', 'play', 'power', 'second', 'pittsburgh'],
 ['dont', 'think', 'just', 'like', 'im'],
 ['flyers', 'got', 'play', 'leafs', 'power'],
 ['season', 'game', 'team', 'points', 'mark'],
 ['program', 'available', 'information', 'software', 'use'],
 ['day', 'want', 'people', 'medical', 'help'],
 ['33', '21', '23', '31', '10'],
 ['graphics', 'systems', 'ca', 'computer', 'new'],
 ['article', 'writes', 'gordon', 'banks', 'geb'],
 ['new', 'win', 'games', 'team', 'nhl']]

### See Structured Representation of the text (Components)

In [7]:
#clf.components_ weightings
components=pd.DataFrame(doctopic)
components.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.012505,0.004332,0.0,0.085409,0.019131,0.0,0.097338,0.034787,0.0
1,0.010896,0.032356,0.004901,0.0,0.004177,0.039762,0.0,0.072043,0.022277,0.013173
2,0.0,0.324707,0.009982,0.0,0.115315,0.024129,0.0,0.0,0.172363,0.0
3,0.0,0.0,0.0,0.0,0.043645,0.0,0.0,0.287995,0.004636,0.0
4,0.0,0.068362,0.054789,0.0,0.129886,0.0,0.0,0.0,0.0,0.0


### Prep Decision Tree Model

In [8]:
#Create One Outcome Variable
outcome_l=[]
for response in news['newsgroup']:
    if response=='graphics':
        outcome=1
    elif response=='hockey':
        outcome=2
    else:
        outcome=3
    outcome_l.append(outcome)
news['cat_resp']=outcome_l

components['cat_resp']=news['cat_resp']

#Stratified Random Sample
splits = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
stratification = list(splits.split(doctopic, news['cat_resp']))

components['split']=(components.index.isin(stratification[0][0])).astype(int)

x_train=components[list(range(10))][components['split']==1]
x_val=components[list(range(10))][components['split']==0]
y_train=components['cat_resp'][components['split']==1]
y_val=components['cat_resp'][components['split']==0]

### Decision Tree Model

In [9]:
dt = tree.DecisionTreeClassifier(max_depth=5)
dt.fit(x_train, y_train)
valid_dt_pred=dt.predict(x_val)

#Score Validation and get accuracy
accuracy=100*round(accuracy_score(y_val, valid_dt_pred),3)
print(accuracy, '- Decision Tree Validation')

73.3 - Decision Tree Validation


### Score whole model

In [10]:
components['dt_pred'] = dt.predict(components[list(range(10))])
components['correct_class']=(components['cat_resp']==components['dt_pred']).astype(int)

#Overall Accuracy
components['correct_class'].mean()

0.7792642140468228

### Accuracy by class

In [11]:
components['newsgroup']=news['newsgroup']
components.groupby(['newsgroup'])['correct_class'].mean()

newsgroup
graphics    0.843434
hockey      0.825000
medical     0.670000
Name: correct_class, dtype: float64

In [12]:
components.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,cat_resp,split,dt_pred,correct_class,newsgroup
0,0.0,0.012505,0.004332,0.0,0.085409,0.019131,0.0,0.097338,0.034787,0.0,1,1,1,1,graphics
1,0.010896,0.032356,0.004901,0.0,0.004177,0.039762,0.0,0.072043,0.022277,0.013173,1,0,2,0,graphics
2,0.0,0.324707,0.009982,0.0,0.115315,0.024129,0.0,0.0,0.172363,0.0,1,1,1,1,graphics
3,0.0,0.0,0.0,0.0,0.043645,0.0,0.0,0.287995,0.004636,0.0,1,1,1,1,graphics
4,0.0,0.068362,0.054789,0.0,0.129886,0.0,0.0,0.0,0.0,0.0,1,1,1,1,graphics
