In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import gensim
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np
#nltk.download()


In [None]:
# The two information_*.csv files are read as tab-separated; the rest use the default comma.
data_train = pd.read_csv("information_train.csv", sep = '\t')
data_test = pd.read_csv("information_test.csv", sep = '\t')

# Remaining files: train.csv, test.csv and the sample submission.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_sub = pd.read_csv("sample_submission.csv")

In [None]:
# Rename the `set` column to `article_set`, the name later cells access as `df.article_set`.
data_test = data_test.rename(columns= {'set': 'article_set'})

In [None]:
# Print a summary of the training metadata frame.
data_train.info()

In [None]:
# Show the first rows of the test metadata frame.
data_test.head()

In [None]:
# NOTE(review): `datetime` is not referenced by any cell visible in this notebook - confirm it is needed.
import datetime
# Parse pub_date into pandas datetimes; get_pred_dfs carries these values into date_top,
# where get_prediction compares them with `>`.
data_test['pub_date'] = pd.to_datetime(data_test['pub_date'])
data_test.head()

In [None]:
# Print a summary of the test metadata frame after the pub_date conversion.
data_test.info()

In [None]:
# Count how many rows belong to each article_set value.
data_test.article_set.value_counts()

In [None]:
# Work on a copy: standardize_text rewrites columns in place, and data_test is
# still passed to get_pred_dfs later on.
data_test_copy = data_test.copy()

In [None]:
def standardize_text(df, text_field):
    """Clean one text column of ``df`` in place and return ``df``.

    The column is rewritten through these steps, in order:

    1. remove URLs (``http`` followed by any run of non-whitespace),
    2. remove any leftover literal ``http``,
    3. remove mentions (``@`` followed by any run of non-whitespace),
    4. replace every character that is not a letter, a digit, one of
       ``( ) , ! ? @ ' ` " _`` or a newline with a space,
    5. spell out any remaining ``@`` as ``at``,
    6. lower-case the result.

    Every pattern is passed with ``regex=True``. Without it, pandas versions
    whose ``Series.str.replace`` defaults to literal matching would look for
    the pattern text itself and leave URLs, mentions and punctuation in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    text_field : str
        Name of a column of ``df`` that supports the ``.str`` accessor.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = [
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    ]
    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    df[text_field] = cleaned.str.lower()
    return df



In [None]:
# Clean the title and abstract columns of the working copy in place.
# standardize_text returns the frame it was given, so the last call is also the
# value of this cell's final expression.
standardize_text(data_test_copy, "article_title")
standardize_text(data_test_copy, "abstract")

In [None]:
def preprocessing(df, text_field):
    """Add a ``tokens`` column of stemmed, stop-word-free tokens to ``df``.

    Each value of ``df[text_field]`` is split on whitespace, English stop
    words (NLTK list) are dropped, and the remaining words are reduced with
    the Porter stemmer. Rows whose text is missing get an empty token list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; the ``tokens`` column is created or overwritten in
        place.
    text_field : str
        Name of the text column to tokenize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # A set gives constant-time membership tests; the NLTK list would be
    # scanned once for every token.
    stop_words = set(stopwords.words('english'))
    # Local stemmer: the function no longer relies on a global being defined
    # by some other cell before it is called.
    stemmer = nltk.PorterStemmer()

    def _clean(words):
        # str.split() leaves a non-list (NaN) for missing text; treat that as
        # an empty document instead of failing while iterating over it.
        if not isinstance(words, list):
            return []
        return [stemmer.stem(word) for word in words if word not in stop_words]

    df['tokens'] = df[text_field].str.split().apply(_clean)
    return df


In [None]:
porter = nltk.PorterStemmer()
def get_similarities(df, output_prefix=None):  # pass text_field_byset
    """Compute TF-IDF similarities between the documents of each article set.

    For every distinct ``article_set`` value, a gensim dictionary, bag-of-words
    corpus and TF-IDF model are built from that set's token lists only. The
    set's TF-IDF corpus is indexed with ``gensim.similarities.Similarity`` and
    then used as the query against its own index.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``article_set`` column and a ``tokens`` column holding one
        list of tokens per row.
    output_prefix : str, optional
        Prefix for the index shard files gensim writes to disk. The default
        ``None`` is forwarded to gensim, which per its documentation then
        uses a random file name in the temp directory; this avoids depending
        on a machine-specific absolute path.

    Returns
    -------
    dict
        Maps each ``article_set`` value to the object returned by querying
        that set's index with the set's own TF-IDF corpus (expected to be a
        documents-by-documents score matrix - confirm against gensim docs).
    """
    similarities_by_set = {}
    for set_id in df.article_set.unique():
        documents = df.loc[df.article_set == set_id].tokens.tolist()
        dictionary = gensim.corpora.Dictionary(documents)
        corpus = [dictionary.doc2bow(document) for document in documents]

        tf_idf = gensim.models.TfidfModel(corpus)
        index = gensim.similarities.Similarity(output_prefix, tf_idf[corpus],
                                               num_features=len(dictionary))
        similarities_by_set[set_id] = index[tf_idf[corpus]]
    return similarities_by_set


In [None]:
def top5(sim_dic, art_set, doc_index=1, n=5):
    """Print and return the highest-scoring documents for one document of a set.

    Parameters
    ----------
    sim_dic : dict
        Maps an article-set key to a 2-D score matrix (indexable by row).
    art_set : hashable
        Key of the set to inspect.
    doc_index : int, optional
        Row of the matrix to rank. Defaults to 1.
    n : int, optional
        Number of entries to report. Defaults to 5.

    Returns
    -------
    list of tuple
        ``(column_index, score rounded to 3 decimals)`` pairs, best first.
        The same list is printed.
    """
    scores = np.asarray(sim_dic[art_set][doc_index])
    # argsort is ascending, so reverse it before taking the first n entries.
    top_docs = scores.argsort()[::-1][:n]
    top_docs_with_score = [(index, round(scores[index], 3)) for index in top_docs]
    print(top_docs_with_score)
    return top_docs_with_score

In [None]:
## Abstract model
# Clean the abstracts in place, tokenize a two-column working copy, then score
# every article set. preprocessing returns the frame it extends, so its result
# is bound directly.
standardize_text(data_test_copy, 'abstract')
abstract_byset = preprocessing(data_test_copy[['abstract', 'article_set']].copy(), 'abstract')
abstract_dic = get_similarities(abstract_byset)

In [None]:
## Title model
# Clean the titles in place, tokenize a two-column working copy, then score
# every article set. preprocessing returns the frame it extends, so its result
# is bound directly.
standardize_text(data_test_copy, 'article_title')
title_byset = preprocessing(data_test_copy[['article_title', 'article_set']].copy(), 'article_title')
title_dic = get_similarities(title_byset)

In [None]:
def get_pred_dfs(df, sim_dic1, sim_dic2, top_n=10):
    """Rank the articles of every set by the sum of two similarity matrices.

    For each ``article_set`` value the two matrices stored under that key are
    added element-wise. Row ``k`` of the sum is taken as the scores of the
    set's ``k``-th article against every article of the set; the ``top_n``
    highest-scoring articles are recorded twice, once by ``pmid`` and once by
    ``pub_date``. Equal scores keep their original column order.

    Rows are collected in plain lists and turned into frames once at the end,
    rather than concatenating a one-row frame per article. Sets with fewer
    than ``top_n`` articles are padded with missing values instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``article_set``, ``pmid`` and ``pub_date`` columns.
    sim_dic1, sim_dic2 : dict
        Map each ``article_set`` value to a 2-D score matrix with one column
        per article of that set, in the row order of ``df``.
    top_n : int, optional
        Number of ranked candidates per article. Defaults to 10.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``pmid_top`` with columns ``top1`` .. ``top<top_n>`` and ``date_top``
        with columns ``dtop1`` .. ``dtop<top_n>``. Rows follow the order of
        ``df.article_set.unique()`` and, within a set, the row order of ``df``.
        ``pmid_top`` is cast to ``int``, or to nullable ``Int64`` when padding
        was needed; padded dates are ``NaT``.

    Raises
    ------
    ValueError
        If a summed matrix is not 2-D or its column count differs from the
        number of articles in the set.
    """
    pmid_cols = ['top%d' % rank for rank in range(1, top_n + 1)]
    date_cols = ['dtop%d' % rank for rank in range(1, top_n + 1)]

    pmid_rows = []
    date_rows = []
    padded = False

    for set_id in df.article_set.unique():
        members = df.loc[df.article_set == set_id]
        pmids = members.pmid.tolist()
        dates = members.pub_date.tolist()

        combined = np.asarray(sim_dic1[set_id]) + np.asarray(sim_dic2[set_id])
        if combined.ndim != 2 or combined.shape[1] != len(pmids):
            raise ValueError(
                "similarity matrix for set %r has shape %r but the set holds %d articles"
                % (set_id, combined.shape, len(pmids))
            )

        # Small sets cannot fill every rank; pad so all rows have top_n entries.
        missing = max(top_n - len(pmids), 0)
        padded = padded or missing > 0

        for scores in combined:
            # Stable sort on the negated scores: descending order, ties keep
            # their original position.
            best = np.argsort(-scores, kind='stable')[:top_n]
            pmid_rows.append([pmids[k] for k in best] + [None] * missing)
            date_rows.append([dates[k] for k in best] + [pd.NaT] * missing)

    pmid_top = pd.DataFrame(pmid_rows, columns=pmid_cols)
    date_top = pd.DataFrame(date_rows, columns=date_cols)
    # Keep pmids integral; the nullable dtype is only needed to hold padding.
    pmid_top = pmid_top.astype('Int64' if padded else int)
    return pmid_top, date_top


In [None]:
## Combined model: rank candidates by the sum of abstract and title similarities,
## keeping both the candidate pmids and their publication dates.
pmid_top , date_top = get_pred_dfs(data_test, abstract_dic, title_dic)

In [None]:
def get_prediction(pmid_top, date_top, max_refs=3, articles=None, queries=None):
    """Pick, per article, the top-ranked candidates that carry an earlier date.

    Row ``i`` of ``pmid_top`` / ``date_top`` holds ranked candidate pmids and
    their dates. The date in the first column is used as the reference date of
    the row; candidates are scanned left to right and kept while their date is
    strictly earlier than that reference, up to ``max_refs`` per row.
    Comparisons against a missing date are false, so such candidates are
    skipped.

    Rows are read once by position and converted to lists, so no integer key
    is ever used on a label-indexed Series.

    NOTE(review): row ``i`` is paired with the ``i``-th value of
    ``articles.pmid``. That matches ``get_pred_dfs`` only if ``articles`` is
    already grouped by ``article_set`` - confirm for the input data.

    Parameters
    ----------
    pmid_top : pandas.DataFrame
        Ranked candidate pmids, one row per article.
    date_top : pandas.DataFrame
        Dates of those candidates, same shape and order as ``pmid_top``.
    max_refs : int, optional
        Maximum number of references kept per article. Defaults to 3.
    articles : pandas.DataFrame, optional
        Frame whose ``pmid`` column labels the rows. Defaults to the global
        ``data_test``.
    queries : pandas.DataFrame, optional
        Frame with a ``pmid`` column to left-join the predictions onto.
        Defaults to the global ``test``.

    Returns
    -------
    pandas.DataFrame
        ``queries`` with an added ``ref_list`` column of pmid lists (missing
        where a query pmid has no prediction row).
    """
    if articles is None:
        articles = data_test
    if queries is None:
        queries = test

    pred = []
    for i in range(len(pmid_top)):
        # One positional row lookup per frame instead of one per element.
        candidate_pmids = pmid_top.iloc[i].tolist()
        candidate_dates = date_top.iloc[i].tolist()
        own_date = candidate_dates[0] if candidate_dates else None

        refs = []
        for pmid, date in zip(candidate_pmids, candidate_dates):
            if len(refs) >= max_refs:
                break
            if own_date > date:
                refs.append(pmid)
        pred.append(refs)

    preddf = pd.DataFrame({'pmid': articles.pmid})
    preddf['ref_list'] = pred
    prediction = pd.merge(queries, preddf, on='pmid', how='left')
    return prediction

In [None]:
# Build the prediction frame: `test` left-joined with a `ref_list` column.
solution = get_prediction(pmid_top,date_top)

In [None]:
'''
from sklearn.feature_extraction.text import TfidfVectorizer

def get_similarities_tfidf(df,text_field):     # pass data_test
    dic = {}
    #dic_tokens = {}
    for x in df.article_set.unique():
        vectorizer = TfidfVectorizer(max_df=0.3, lowercase = True, ngram_range= (1,3),
                             min_df=1, stop_words='english')
                             
        
        corpus = [y for y in df.loc[df.article_set == x][text_field]]
        dtm = vectorizer.fit_transform(corpus)
        tfidf_sims = gensim.similarities.Similarity('C:/Users/Manohar Battula/Desktop/Innoplexus Hack',
                                                    dtm,num_features=dtm.shape[1])
        similarities = tfidf_sims[dtm]
        dic[x]= similarities
    return dic

title_dic_tfidf = get_similarities_tfidf(data_test,'article_title')
abstract_dic_tfidf = get_similarities_tfidf(data_test,'abstract')
'''

In [None]:
'''
### Lsi Model

from gensim import corpora, models, similarities
def get_similarities_lsi(df):  # pass article_title_byset
    dic = {}
    dic_tokens = {}
    for x in df.article_set.unique():
        final_tokens = [token for token in df.loc[df.article_set == x].tokens] 
        dictionary = gensim.corpora.Dictionary(final_tokens)
        corpus = [dictionary.doc2bow(final_tokens) for final_tokens in final_tokens]
        dic_tokens[x] = final_tokens
    
        tf_idf = gensim.models.TfidfModel(corpus)
        #corpus_tfidf = tf_idf[corpus]
        lsi = models.LsiModel(corpus)
        sims = gensim.similarities.MatrixSimilarity(lsi[corpus])
                                                   
                                      
        similarities = sims[lsi[corpus]]
        dic[x]= similarities
    return dic
    
title_dic_lsi = get_similarities_lsi(article_title_byset)
abstract_dic_lsi = get_similarities_lsi(abstract_byset)
'''

In [None]:
'''

### word2vec

from gensim.models import Word2Vec
preprocessing(article_title_byset,'article_title')
sentences = [token for token in title_byset.tokens]

word2vec_model = Word2Vec(sentences, min_count = 1)

def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.vocab]
    return np.mean(word2vec_model[doc], axis=0)

from sklearn.metrics.pairwise import cosine_similarity
sims = cosine_similarity(np.array([document_vector(word2vec_model, doc)
                                                       for doc in sentences]))

def get_similarities_word2vec(df):  # EX. pass title_byset, abstract_byset
    dic = {}
    dic_tokens = {}
    sentences = [token for token in df.tokens]
    for x in df.article_set.unique():
        corpus = [token for token in df.loc[df.article_set == x].tokens] 
        word2vec_model = Word2Vec(sentences, min_count = 1)
        sims = cosine_similarity(np.array([document_vector(word2vec_model, doc)
                                                       for doc in corpus]))
        dic[x]= sims
    return dic

title_dic_w2v = get_similarities_word2vec(title_byset)
abstract_dic_w2v = get_similarities_word2vec(abstract_byset)
'''