In [4]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import re, math
from collections import Counter
import time

from __future__ import print_function
import annoy
from implicit import alternating_least_squares

In [21]:
# Load the three Congress CSV exports into DataFrames.
# NOTE(review): these are absolute paths on one user's machine; the cell will
# not run elsewhere until they point at a local copy of the data.
df = pd.read_csv("/Users/nickwalker/Desktop/Coding/Python/Congress Data/bills_with_sponsor.csv")
df1 = pd.read_csv("/Users/nickwalker/Desktop/Coding/Python/Congress Data/bill_cosponsorship_1.csv")
df2 = pd.read_csv("/Users/nickwalker/Desktop/Coding/Python/Congress Data/bill_details.csv")
# Build DataFrame objects from the frames loaded above.
# NOTE(review): df3/df4 are not referenced by any executable code in the
# visible cells — confirm they are still needed.
df3 = pd.DataFrame(df1)
df4 = pd.DataFrame(df2)

In [22]:
# Preview the first rows of the co-sponsorship table loaded into df1.
df1.head()

Unnamed: 0,member_id,bill_number,date,withdrawn_date
0,K000188,H.CON.RES.10,1/13/2017,1
1,N000015,H.CON.RES.10,1/23/2017,1
2,T000462,H.CON.RES.10,1/23/2017,1
3,P000593,H.CON.RES.10,1/31/2017,1
4,J000290,H.CON.RES.10,2/7/2017,1


In [7]:
def cosine_sim(a, b):
    """Return the cosine similarity of two vectors.

    Computed from the definition of the dot product:
    ``dot(a, b) / (||a|| * ||b||)``.

    Parameters
    ----------
    a, b : array-like
        Vectors accepted by ``np.dot`` and ``np.linalg.norm``.

    Returns
    -------
    float
        The cosine of the angle between ``a`` and ``b``.  When either vector
        has zero length the angle is undefined; 0.0 is returned instead of
        dividing by zero (which would produce ``nan`` and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # At least one vector is all zeros: no direction to compare.
        return 0.0
    return np.dot(a, b) / norm_product

In [8]:
def makeVector(self, wordString):
    """Build a term-count vector for ``wordString``.

    The text is tokenised and stripped of stop words by ``self.parser``; each
    remaining word increments the slot given by ``self.vectorKeywordIndex``.
    A word that is missing from the index raises ``KeyError``.

    @pre: unique(vectorIndex)
    """
    keyword_index = self.vectorKeywordIndex

    # One zero-initialised slot per indexed keyword.
    counts = [0] * len(keyword_index)

    tokens = self.parser.removeStopWords(self.parser.tokenise(wordString))
    for token in tokens:
        # Simple term-count model: one increment per occurrence.
        slot = keyword_index[token]
        counts[slot] += 1
    return counts

In [29]:
def read_data(filename):
    """Read a bill co-sponsorship CSV and build the member/bill matrix.

    The previous implementation was the last.fm loader: tab-separated input,
    no header row, columns picked by position and column 3 cast to float.
    The co-sponsorship file used in this notebook is comma-separated, has a
    header row, and holds a date in column 3, so that loader could not read
    it.  This version selects the two columns it needs by name.

    Parameters
    ----------
    filename : str
        Path to a CSV file with a header row containing at least the columns
        ``member_id`` and ``bill_number``.

    Returns
    -------
    (pandas.DataFrame, scipy.sparse.coo_matrix)
        The frame has categorical ``member_id`` and ``bill_number`` columns
        plus a ``plays`` column fixed at 1.0 (each row is one co-sponsorship).
        The matrix has one row per member category and one column per bill
        category, indexed by the category codes, with ``plays`` as values.

    Raises
    ------
    ValueError
        Raised by pandas when one of the required columns is missing.
    """
    data = (pd.read_csv(filename, usecols=['member_id', 'bill_number'])
            # a missing id would get category code -1, an invalid matrix index
            .dropna(subset=['member_id', 'bill_number'])
            .reset_index(drop=True))

    # map each member and bill to a unique numeric code
    data['member_id'] = data['member_id'].astype("category")
    data['bill_number'] = data['bill_number'].astype("category")

    # implicit feedback: every co-sponsorship row carries the same weight
    data['plays'] = 1.0

    # sparse matrix: rows are members, columns are bills
    plays = coo_matrix((data['plays'].astype(float).values,
                       (data['member_id'].cat.codes.values.copy(),
                        data['bill_number'].cat.codes.values.copy())))

    return data, plays

In [1]:
"""#df3.iloc[:,[0,2]]
def read_data(filename):
    df1 = pd.read_csv(filename)
    df = pd.DataFrame(df1)
    df = df.iloc[:,[0,1]]
    
    df['member_id'] = df['member_id'].astype("category")
    df['bill_number'] = df['bill_number'].astype("category")
    
    out = coo_matrix((df['member_id'].cat.codes.copy(),
                     df['bill_number'].cat.codes.copy()))
    return out"""

'#df3.iloc[:,[0,2]]\ndef read_data(filename):\n    df1 = pd.read_csv(filename)\n    df = pd.DataFrame(df1)\n    df = df.iloc[:,[0,1]]\n    \n    df[\'member_id\'] = df[\'member_id\'].astype("category")\n    df[\'bill_number\'] = df[\'bill_number\'].astype("category")\n    \n    out = coo_matrix((df[\'member_id\'].cat.codes.copy(),\n                     df[\'bill_number\'].cat.codes.copy()))\n    return out'

<h2> BM25 Weighting </h2>
- BM - Best Matching
- It is one of the best probabilistic weighting schemes
- It is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document.

In [12]:
def bm25_weight(X, K1=3000, B=0.8):
    """Apply BM25 weighting to the stored entries of a sparse matrix.

    Every stored value is rescaled by a saturation term controlled by ``K1``,
    a per-row length normalisation controlled by ``B``, and a per-column
    inverse-document-frequency factor.  The result is returned as a COO
    matrix.
    """
    matrix = coo_matrix(X)

    # idf per column: log(number of rows / (1 + stored entries in the column))
    n_rows = matrix.shape[0]
    entries_per_col = np.bincount(matrix.col)
    idf = np.log(float(n_rows) / (1 + entries_per_col))

    # length normalisation per row, relative to the mean row sum
    row_totals = np.ravel(matrix.sum(axis=1))
    mean_total = row_totals.mean()
    length_norm = (1.0 - B) + B * row_totals / mean_total

    # BM25 weight for each stored entry
    values = matrix.data
    numerator = values * (K1 + 1.0)
    denominator = K1 * length_norm[matrix.row] + values
    matrix.data = numerator / denominator * idf[matrix.col]
    return matrix

In [15]:
class TopRelated(object):
    """Exact nearest-neighbour lookup over factor rows by cosine similarity."""

    def __init__(self, congress_factors):
        """Store a row-normalised copy of ``congress_factors``.

        With every row scaled to unit length, a plain dot product between two
        rows equals their cosine similarity.
        """
        norms = np.linalg.norm(congress_factors, axis=-1)
        # An all-zero row has norm 0; divide it by 1 so it stays all zeros
        # instead of turning into NaN.
        norms[norms == 0] = 1.0
        self.factors = congress_factors / norms[:, np.newaxis]

    def get_related(self, congressID, N=10):
        """Return up to ``N`` rows most similar to row ``congressID``.

        Parameters
        ----------
        congressID : int
            Row index into the factor matrix.
        N : int
            Maximum number of results; capped at the number of rows.

        Returns
        -------
        list of (index, score)
            Sorted by descending score.  The queried row itself is included.
        """
        scores = self.factors.dot(self.factors[congressID])
        # argpartition rejects a kth beyond the array length, so cap N.
        count = min(N, len(scores))
        if count <= 0:
            return []
        best = np.argpartition(scores, -count)[-count:]
        return sorted(zip(best, scores[best]), key=lambda pair: -pair[1])

In [16]:
class ApproximateTopRelated(object):
    """Approximate nearest-neighbour lookup over factor rows using Annoy."""

    def __init__(self, congress_factors, treecount=20):
        """Build an angular-metric Annoy index with one item per factor row.

        Parameters
        ----------
        congress_factors : 2-D array
            One row per item; ``shape[1]`` is used as the index dimension.
        treecount : int
            Number of trees passed to ``index.build``.
        """
        index = annoy.AnnoyIndex(congress_factors.shape[1], 'angular')
        for i, row in enumerate(congress_factors):
            index.add_item(i, row)
        index.build(treecount)
        self.index = index

    def get_related(self, congressID, N=5):
        """Return up to ``N`` approximate neighbours of item ``congressID``.

        Returns
        -------
        list of (item, score)
            Sorted by descending score, where the score is the cosine
            similarity recovered from Annoy's angular distance.
        """
        neighbors = self.index.get_nns_by_item(congressID, N)
        scored = []
        for other in neighbors:
            distance = self.index.get_distance(congressID, other)
            # Annoy's angular distance is sqrt(2 * (1 - cos)), so
            # cos = 1 - d^2 / 2; "1 - d" is not the cosine similarity.
            scored.append((other, 1 - distance * distance / 2.0))
        return sorted(scored, key=lambda pair: -pair[1])

In [17]:
def calculate_similar_members(input_filename, output_filename,
                              factors=30, regularization=0.01,
                              iterations=40,
                              exact=False, trees=20,
                              use_native=True,
                              dtype=np.float64,
                              cg=False):
    """Factorise the member/bill matrix and write each member's nearest members.

    Steps: load the data with ``read_data``, weight the matrix with
    ``bm25_weight``, factorise it with ``alternating_least_squares``, then for
    every member (most frequent first) write the related members returned by
    ``TopRelated`` or ``ApproximateTopRelated``.

    Parameters
    ----------
    input_filename : str
        Path handed to ``read_data``.
    output_filename : str
        File to write; one ``member<TAB>other<TAB>score`` line per pair.
    factors, regularization, iterations, use_native, dtype :
        Forwarded unchanged to ``alternating_least_squares``.
    exact : bool
        If true use ``TopRelated``; otherwise ``ApproximateTopRelated``.
    trees : int
        Tree count for ``ApproximateTopRelated`` (unused when ``exact``).
    cg : bool
        Forwarded to ``alternating_least_squares`` as ``use_cg``.
    """
    print("Calculating similar congress members. This might take a while")
    # Interpolate with %: print("...%s", x) would emit the raw format string
    # followed by the value.
    print("reading data from %s" % input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    print("read data file in %s" % (time.time() - start))

    print("weighting matrix by bm25")
    weighted = bm25_weight(plays)

    print("calculating factors")
    start = time.time()
    congress_factors, user_factors = alternating_least_squares(weighted,
                                                             factors=factors,
                                                             regularization=regularization,
                                                             iterations=iterations,
                                                             use_native=use_native,
                                                             dtype=dtype,
                                                             use_cg=cg)
    print("calculated factors in %s" % (time.time() - start))

    # order members by how many rows they have in the data, most first
    print("calculating top congress members")
    members = dict(enumerate(df['member_id'].cat.categories))
    # Count rows per category code.  Indexing groupby(...).size() with an
    # integer code depends on positional fallback on a label index, which is
    # not reliable across pandas versions.
    member_codes = df['member_id'].cat.codes.values
    member_counts = np.bincount(member_codes[member_codes >= 0],
                                minlength=len(members))
    to_generate = sorted(members, key=lambda code: -member_counts[code])

    # Report the size of the factor matrix rather than dumping its contents.
    print("factor matrix shape: %s" % (np.shape(congress_factors),))
    if exact:
        calc = TopRelated(congress_factors)
    else:
        calc = ApproximateTopRelated(congress_factors, trees)

    print("writing top related to %s" % output_filename)
    with open(output_filename, "w") as o:
        for congressID in to_generate:
            member = members[congressID]
            for other, score in calc.get_related(congressID):
                o.write("%s\t%s\t%s\n" % (member, members[other], score))

In [None]:
# Run the pipeline on the co-sponsorship CSV; results are written as
# tab-separated lines to 'Congress Data/billPerson'.
# NOTE(review): the input is an absolute, machine-specific path and the output
# path is relative — confirm the working directory contains 'Congress Data'.
calculate_similar_members("/Users/nickwalker/Desktop/Coding/Python/Congress Data/bill_cosponsorship_1.csv", 'Congress Data/billPerson')