In [86]:
from app import db
from app.models import *
from app.build_models import get_decision_text, get_vote_history, vote_map

import pandas as pd
import numpy as np
import re
import json
from scipy.sparse import csr_matrix
import pickle

from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from datetime import datetime
import nltk
from nltk.stem.snowball import SnowballStemmer
from textblob import TextBlob
stemmer = SnowballStemmer('english')
stopwords = nltk.corpus.stopwords

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import mpld3
sns.set_palette('viridis')
%matplotlib notebook

In [75]:
members = db.session.query(Member).all()

In [78]:
mem = members[123]
memid = mem.id
mem.display_name

'Martinez (R-FL)'

In [80]:
vectorizer = pickle.load(open(mem.vectorizer_path,'rb'))
nn_model = pickle.load(open(mem.nn_model_path,'rb'))

In [88]:
text = get_decision_text(memid)
df = pd.DataFrame(text,columns=['title','question','subject'])
df['vote'] = get_vote_history(memid)
df['body'] = df['title'].str.cat(df['question'],sep=' ').str.cat(df['subject'],sep=' ')
df['body'] = df['body'].apply(lambda x: re.sub("\d+", "", x))
df['result'] = df['vote'].apply(vote_map)

In [103]:
yay = df[df['result']==1]
yay_matrix = vectorizer.fit_transform(yay['body'])
kmeans = KMeans(8)
kmeans.fit_transform(yay_matrix)
yay['cluster'] = kmeans.labels_.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [104]:
yay['cluster'].value_counts()

0    355
3    231
2    123
7     68
6     61
4     43
1     32
5     20
Name: cluster, dtype: int64

In [108]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
terms = vectorizer.get_feature_names()
num_clusters = len(yay['cluster'].unique())
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % terms[ind].split(' '))
    print() 
    print() 
    
print()
print()

Top terms per cluster:

Cluster 0 words: ['motion']
 ['cloture']
 ['cloture', 'motion']
 ['united']
 ['states']
 ['united', 'states']


Cluster 1 words: ['comprehensive', 'immigration']
 ['immigration']
 ['comprehensive', 'immigration', 'reform']
 ['immigration', 'reform']
 ['comprehensive']
 ['immigration', 'reform', 'act']


Cluster 2 words: ['title', 'file']
 ['short', 'title', 'file']
 ['short']
 ['short', 'title']
 ['file']
 ['title']


Cluster 3 words: ['amdt']
 ['act']
 ['appropriations', 'act']
 ['appropriations']
 ['act', 'amendment', 'amdt']
 ['act', 'amendment']


Cluster 4 words: ['defense', 'authorization', 'act']
 ['national', 'defense']
 ['national', 'defense', 'authorization']
 ['defense', 'authorization']
 ['act', 'fiscal', 'year']
 ['act', 'fiscal']


Cluster 5 words: ['small', 'business', 'tax']
 ['business', 'tax', 'relief']
 ['business', 'tax']
 ['small', 'business']
 ['small']
 ['tax', 'relief', 'act']


Cluster 6 words: ['motion', 'waive']
 ['waive']
 ['motion']


In [106]:
order_centroids

array([[4965, 1629, 1669, ..., 5114, 5113, 7715],
       [1881, 3855, 1882, ..., 5530, 5531,    0],
       [7660, 7093, 7091, ..., 5500, 5501,    0],
       ..., 
       [7106, 1332, 1331, ..., 5546, 5547,    0],
       [5111, 8034, 4965, ..., 5432, 5433,    0],
       [2789, 3252, 7019, ..., 5492, 5493,    0]])