In [1]:
import pandas as pd
import time
import redis
from flask import current_app
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# import article data from mysql db 
def import_data(data_source):
    ds = pd.read_csv(data_source)
    return ds

In [3]:
def train(data_source):
    start = time.time()
    ds = import_data(data_source)
    info("Training data ingested in %s seconds." % (time.time() - start))

    start = time.time()
    self._train(ds)
    info("Engine trained in %s seconds." % (time.time() - start))    

In [4]:
# refactor the training logic into this function
def _train(**args):
    # place holder
    return 1;

In [5]:
data_file = "./data/testdata.csv"

In [6]:
testdata = import_data(data_file)
testdata

Unnamed: 0,PMID,abstract
0,27270041,Conserved DNA-damage responses (DDRs) sense ge...
1,19465921,Loss-of-function of caretaker genes characteri...
2,28258153,Pulmonary�tuberculosis�(TB) caused by�Mycobact...
3,25398087,Tuberculosis�(TB) is an infectious disease cau...


In [7]:
newdata = pd.read_json("./data/quotes.json")
newdata['abstract'] = newdata['abstract'].map(lambda x: x.lstrip('<abstracttext>')) 
newdata['PMID'] = newdata.index + 10000000
newdata.head()

Unnamed: 0,abstract,PMID
0,Genome-metabolism interactions enable cell gro...,10000000
1,Parasites of wildlife inhabiting urbanised and...,10000001
2,The aim of this study was to evaluate the in v...,10000002
3,"label=""BACKGROUND"" nlmcategory=""BACKGROUND"">G...",10000003
4,Human epidermal growth factor receptor 2 (HER2...,10000004


In [8]:
ds = pd.concat([testdata, newdata])
ds.reset_index(drop=True, inplace=True)
ds.head()

Unnamed: 0,PMID,abstract
0,27270041,Conserved DNA-damage responses (DDRs) sense ge...
1,19465921,Loss-of-function of caretaker genes characteri...
2,28258153,Pulmonary�tuberculosis�(TB) caused by�Mycobact...
3,25398087,Tuberculosis�(TB) is an infectious disease cau...
4,10000000,Genome-metabolism interactions enable cell gro...


In [9]:
#clean up not ASCII chars
ds['abstract'] = ds["abstract"].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))
ds.head()

Unnamed: 0,PMID,abstract
0,27270041,Conserved DNA-damage responses (DDRs) sense ge...
1,19465921,Loss-of-function of caretaker genes characteri...
2,28258153,Pulmonary tuberculosis (TB) caused by Mycobact...
3,25398087,Tuberculosis (TB) is an infectious disease cau...
4,10000000,Genome-metabolism interactions enable cell gro...


In [10]:
tf = TfidfVectorizer(analyzer='word',
                             ngram_range=(1, 3),
                             min_df=0,
                             stop_words='english')

In [11]:
tfidf_matrix = tf.fit_transform(ds['abstract'])

In [12]:
tfidf_matrix

<24x6755 sparse matrix of type '<type 'numpy.float64'>'
	with 7392 stored elements in Compressed Sparse Row format>

In [13]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [14]:
cosine_similarities

array([[  1.00000000e+00,   7.08350336e-02,   9.48907221e-03,
          1.17145474e-02,   1.23570782e-02,   5.25414060e-04,
          6.12527566e-04,   0.00000000e+00,   5.42053679e-03,
          1.22335415e-02,   1.11636086e-02,   1.49695646e-02,
          0.00000000e+00,   3.78101135e-03,   2.63357758e-03,
          6.06393055e-03,   1.56072594e-03,   6.43046790e-03,
          1.98705271e-03,   9.69491361e-03,   1.74700854e-02,
          1.85566651e-02,   1.56548212e-03,   1.06514489e-03],
       [  7.08350336e-02,   1.00000000e+00,   3.51376857e-03,
          1.54377881e-02,   1.79369914e-02,   1.73006985e-03,
          1.27422649e-03,   2.72249628e-03,   6.06749107e-03,
          6.96446633e-03,   0.00000000e+00,   1.12738038e-02,
          1.36851527e-02,   1.08023914e-02,   5.74326695e-03,
          1.07539928e-02,   1.93565286e-03,   2.73799875e-03,
          2.25005316e-02,   4.09865221e-02,   5.99599613e-03,
          1.28947233e-02,   5.01327230e-03,   2.87566319e-03],
      

In [15]:
similarities = pd.DataFrame(cosine_similarities)

In [16]:
similarities.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1.0,0.070835,0.009489,0.011715,0.012357,0.000525,0.000613,0.0,0.005421,0.012234,...,0.002634,0.006064,0.001561,0.00643,0.001987,0.009695,0.01747,0.018557,0.001565,0.001065
1,0.070835,1.0,0.003514,0.015438,0.017937,0.00173,0.001274,0.002722,0.006067,0.006964,...,0.005743,0.010754,0.001936,0.002738,0.022501,0.040987,0.005996,0.012895,0.005013,0.002876
2,0.009489,0.003514,1.0,0.169734,0.002138,0.002726,0.012518,0.004569,0.004388,0.009729,...,0.014649,0.0,0.003189,0.016094,0.003397,0.005603,0.004091,0.014141,0.008609,0.010961
3,0.011715,0.015438,0.169734,1.0,0.006952,0.008506,0.019906,0.002501,0.011166,0.018443,...,0.017354,0.004968,0.004102,0.011346,0.004633,0.008034,0.010453,0.026777,0.002858,0.029
4,0.012357,0.017937,0.002138,0.006952,1.0,0.000256,0.004975,0.00075,0.015078,0.006841,...,0.001628,0.009791,0.003044,0.02979,0.031989,0.012183,0.011492,0.007636,0.011709,0.008905


In [17]:
prediction_matrix = pd.DataFrame(data=ds.PMID, columns=['PMID'], index=ds.index)
similarity_scores = pd.DataFrame(data=ds.PMID, columns=['PMID'], index=ds.index)
prediction_matrix

Unnamed: 0,PMID
0,27270041
1,19465921
2,28258153
3,25398087
4,10000000
5,10000001
6,10000002
7,10000003
8,10000004
9,10000005


In [18]:
top_n = 11
if (ds.shape[0] < top_n):
    top_n = ds.shape[0] + 1

for i in range(1,top_n-1):
    col_name = i
    prediction_matrix[col_name] = None
    similarity_scores[col_name] = -1
prediction_matrix.head()

Unnamed: 0,PMID,1,2,3,4,5,6,7,8,9
0,27270041,,,,,,,,,
1,19465921,,,,,,,,,
2,28258153,,,,,,,,,
3,25398087,,,,,,,,,
4,10000000,,,,,,,,,


In [19]:
for idx, row in ds.iterrows():
    print "idx: %s" % idx
    similar_indices = cosine_similarities[idx].argsort()[:-top_n:-1]
    print "similar indices: %s" % similar_indices
    similar_items = [(cosine_similarities[idx][i], ds['PMID'][i]) for i in similar_indices]
    print "similar items: \n %s" % similar_items
    for i in range(1,top_n-1):
        similarity_scores.ix[idx, i] = similar_items[i][0] # insert into a db table with date
        prediction_matrix.ix[idx, i] = similar_items[i][1] # insert into db directly   

idx: 0
similar indices: [ 0  1 21 20 11  4  9  3 10 19]
similar items: 
 [(1.0000000000000009, 27270041), (0.070835033578088083, 19465921), (0.018556665051764107, 10000017), (0.01747008539953784, 10000016), (0.014969564647242915, 10000007), (0.012357078229259046, 10000000), (0.012233541546573032, 10000005), (0.011714547409231934, 25398087), (0.011163608584949785, 10000006), (0.0096949136075106246, 10000015)]
idx: 1
similar indices: [ 1  0 19 18  4  3 12 21 11 13]
similar items: 
 [(1.0000000000000091, 19465921), (0.070835033578088083, 27270041), (0.040986522139908603, 10000015), (0.02250053159955331, 10000014), (0.017936991439514927, 10000000), (0.015437788093738278, 25398087), (0.013685152653964082, 10000008), (0.012894723272641261, 10000017), (0.011273803849870076, 10000007), (0.010802391448687646, 10000009)]
idx: 2
similar indices: [ 2  3 10 17 11 14 21  6 12 23]
similar items: 
 [(1.0000000000000069, 28258153), (0.16973364749257461, 25398087), (0.018982143097153299, 10000006), (0.0

In [19]:
prediction_matrix

Unnamed: 0,PMID,1,2,3,4,5,6,7,8,9
0,27270041,19465921,10000017,10000016,10000007,10000000,10000005,25398087,10000006,10000015
1,19465921,27270041,10000015,10000014,10000000,25398087,10000008,10000017,10000007,10000009
2,28258153,25398087,10000006,10000013,10000007,10000010,10000017,10000002,10000008,10000019
3,25398087,28258153,10000019,10000017,10000006,10000002,10000005,10000007,10000010,19465921
4,10000000,10000014,10000013,19465921,10000004,27270041,10000015,10000018,10000016,10000011
5,10000001,10000009,10000013,10000011,10000016,10000004,10000014,10000017,25398087,10000012
6,10000002,10000017,10000008,25398087,10000013,10000010,28258153,10000019,10000004,10000009
7,10000003,10000018,10000005,10000002,28258153,10000004,10000008,10000017,10000001,19465921
8,10000004,10000017,10000013,10000014,10000010,10000000,10000015,10000016,10000019,25398087
9,10000005,10000011,10000007,10000012,25398087,10000013,27270041,10000018,10000017,10000016


In [20]:
similarity_scores

Unnamed: 0,PMID,1,2,3,4,5,6,7,8,9
0,27270041,0.070835,0.018557,0.01747,0.01497,0.012357,0.012234,0.011715,0.011164,0.009695
1,19465921,0.070835,0.040987,0.022501,0.017937,0.015438,0.013685,0.012895,0.011274,0.010802
2,28258153,0.169734,0.018982,0.016094,0.014806,0.014649,0.014141,0.012518,0.011844,0.010961
3,25398087,0.169734,0.029,0.026777,0.02482,0.019906,0.018443,0.017622,0.017354,0.015438
4,10000000,0.031989,0.02979,0.017937,0.015078,0.012357,0.012183,0.011709,0.011492,0.009791
5,10000001,0.034856,0.018945,0.017412,0.010343,0.009791,0.009108,0.009053,0.008506,0.007245
6,10000002,0.021117,0.021007,0.019906,0.013572,0.013514,0.012518,0.011669,0.009862,0.008785
7,10000003,0.008254,0.007132,0.005991,0.004569,0.004129,0.003767,0.00342,0.003197,0.002722
8,10000004,0.03563,0.026261,0.018249,0.015703,0.015078,0.014429,0.011867,0.01129,0.011166
9,10000005,0.026731,0.022347,0.020895,0.018443,0.014084,0.012234,0.011325,0.010647,0.009755
