In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install xgboost

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install seaborn

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install protobuf==4.23.4

In [1]:
from __future__ import print_function
import requests
import sys
import time
import urllib
from urllib.request import urlopen
import urllib.request
#import simplejson
import json
import shutil
import glob
#from PIL import Image
from pathlib import Path
#import cv2
#import numpy as np
import io
import xml.etree.ElementTree as ET
import base64
import pandas as pd
#import tensorflow as tf
from datetime import datetime
import torch
from statistics import mean
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import numpy as np
import xgboost as xgb
import math
from sklearn.metrics import r2_score
import seaborn as sns
from sklearn.metrics import mean_squared_log_error
from sklearn import preprocessing
from ast import literal_eval

In [None]:
%pip install sortedcontainers

In [None]:
# Today's local date encoded as a YYYYMMDD integer.
current_date = int(f"{datetime.today():%Y%m%d}")
print(current_date)

In [3]:
# Column order of every row appended to final_dataset by the scraping cell.
final_dataset_columns = [
    'id',
    'document_vector',
    'ipc_tech_field',
    'ind_claims_count',
    'bwd_citations_count',
    'drawings_count',
    'avg_claim_sim_bwd_citations',
    'ipc_values_count',
    'org_prior_citations_count',
    'fwd_citation_count',
]

In [4]:
def get_doc_vector(title = "", abstract = "", independent_claims = None):
    '''
    Embed a patent document through the internal /embed_document service.

    title: str
    abstract: str
    independent_claims: [str1, str2]; None (the default) is sent as an empty list

    returns the decoded JSON response (callers read its 'vector' entry, described
    in the original notes as a 384-dimensional vector), or an empty dict when the
    request fails, the server answers with an error status, or the body is not JSON.
    '''
    url_and_query = "http://10.1.0.230:5000/embed_document"
    # A fresh list per call avoids sharing one mutable default between calls.
    jsonData = {'title': title,
               'abstract': abstract,
               'independent_claims': [] if independent_claims is None else independent_claims}

    try:
        r = requests.post(url_and_query, json = jsonData)
        r.raise_for_status()
        return r.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a non-JSON body. Returning {} keeps callers that use
        # .get('vector', []) working instead of failing on an unbound response.
        print("doc_vector request failed:", err)
        return {}

In [5]:
def strip_tags(xml_string: str) -> str:
    """Extract plain text from a claims/abstract/title XML fragment.

    Despite the annotation, two strings are returned as a tuple:
    (text, first_claim_text).

    * text: element text and tail text, skipping everything seen while the most
      recent <x-claim> element did not carry independent="yes".
    * first_claim_text: the same text, collected only until the first <x-claim>
      without a "first" attribute is opened.
    """
    block_level_tags = frozenset((
        "br", "address", "article", "aside", "blockquote", "canvas", "dd", "div",
        "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1",
        "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav",
        "noscript", "ol", "p", "pre", "section", "table", "tfoot", "ul", "video",
    ))
    pull_parser = ET.XMLPullParser(['start', 'end'])
    pull_parser.feed(xml_string)

    all_text = []
    first_claim_text = []
    keep_text = True        # False while inside a claim that is not independent
    in_first_claim = True   # latched to False by the first x-claim lacking "first"

    def emit(fragment):
        # Every kept fragment goes to the full text; the first-claim buffer only
        # receives it while the latch is still set.
        all_text.append(fragment)
        if in_first_claim:
            first_claim_text.append(fragment)

    for kind, node in pull_parser.read_events():
        if kind == 'start':
            if node.tag == 'x-claim':
                keep_text = node.attrib.get('independent', " ") == "yes"
                if "first" not in node.attrib:
                    in_first_claim = False
            if node.text is not None and keep_text:
                emit(node.text)
        elif kind == 'end':
            if node.tag in block_level_tags:
                # An empty separator is recorded after block-level elements; it does
                # not change the joined result.
                emit('')
            if node.tail is not None and keep_text:
                # e.g. <a>some<b>stuff</b>blah</a>: "blah" is the tail text of <b>
                emit(node.tail)

    return ''.join(all_text), ''.join(first_claim_text)

In [6]:
def get_claim_vector(claim: str):
    '''
    Embed a single piece of text through the internal /embed_query service
    (sent with auto_translate enabled).

    claim: str

    returns the decoded JSON response (callers read its 'vector' entry, described
    in the original notes as a 384-dimensional vector), or an empty dict when the
    request fails, the server answers with an error status, or the body is not JSON.
    '''
    url_and_query = "http://10.1.0.230:5000/embed_query"
    jsonData = {'query': claim,
               "auto_translate": True}

    try:
        r = requests.post(url_and_query, json = jsonData)
        r.raise_for_status()
        return r.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a non-JSON body. The previous bare except printed `r`,
        # which is unbound when the POST itself raises.
        print("claim_vector request failed:", err)
        return {}

In [7]:
def get_top_5_docs(input_doc_vector, earliest_priority_date):
    """Return ids of the nearest-neighbour US documents that predate a given date.

    input_doc_vector: mapping whose 'vector' entry is used as the kNN query vector
    earliest_priority_date: str in YYYYMMDD form

    Queries the Solr vector index for the 5 nearest documents, keeps those whose
    country is "US" and whose earliest priority date is strictly before
    earliest_priority_date, and returns their ids. Because the filter runs after
    the topK=5 search, fewer than five ids may come back.
    Raises requests.HTTPError on an error status.
    """
    #new model
    solr_url = "http://10.1.0.49:8984/solr/ifidev_newvector/select"
    knn_query = "{!knn f=tiabindclvector topK=5}" + str(input_doc_vector['vector'])

    reply = requests.post(solr_url, data={"q": knn_query})
    reply.raise_for_status()

    date_format = "%Y%m%d"
    older_us_ids = []
    for hit in reply.json()["response"]["docs"]:
        hit_id = hit["id"]
        hit_country = hit["country"]
        hit_date = datetime.strptime(str(hit["earliestprioritydate"]), date_format).date()
        cutoff = datetime.strptime(earliest_priority_date, date_format).date()
        if hit_date < cutoff and hit_country == "US":
            older_us_ids.append(hit_id)
    return older_us_ids[:5]

In [8]:
def get_bwd_citation_data(patents):
    """Fetch title, abstract and claim text for a list of publication numbers.

    patents: iterable of publication numbers; they are OR-ed into a single
        `pn=(...)` query against the pbsearch endpoint.

    returns a DataFrame with columns id, title, abstract, independent_claims
    (one row per returned document; independent_claims is a one-element list
    holding the concatenated claim text). An empty `patents` yields an empty
    frame without contacting the server.
    Raises requests.HTTPError on an error status.
    """
    columns = ['id', 'title', 'abstract', 'independent_claims']
    patents = list(patents)
    if not patents:
        # Nothing to query; the old suffix-trimming would have cut into the base URL here.
        return pd.DataFrame(columns=columns)

    DATA_ENDPOINT = r"http://10.0.1.106:8983/solr/ificlaims/pbsearch"
    fls = "id,titles,claims,abstracts" #"id,abstracts,abstracts_mt,claims,claims_mt"
    pn_clause = "%20or%20".join(str(patent) for patent in patents)
    url_text = f"{DATA_ENDPOINT}?q=pn=({pn_clause})&fl={fls}"

    r_text = requests.get(url_text, timeout=36000)
    r_text.raise_for_status()
    docs = r_text.json()["response"]["docs"]

    # Collect plain rows and build the frame once instead of concatenating per row.
    records = []
    for doc in docs:
        claims_text = [strip_tags(claim)[0] for claim in doc.get("claims", " ")]
        doc_abstract, _ = strip_tags(doc.get("abstracts", " ")[0])
        doc_title, _ = strip_tags(doc.get("titles", " ")[0])
        records.append([doc.get("id", " "), doc_title, doc_abstract, [''.join(claims_text)]])
    return pd.DataFrame(records, columns=columns)

In [9]:
def get_similarity(vector1, vector2):
    """Cosine similarity of two tensors along their last dimension (eps=1e-6).

    Returns a tensor; for two 1-D inputs it is a 0-dimensional tensor.
    """
    return torch.nn.functional.cosine_similarity(vector1, vector2, dim=-1, eps=1e-6)

In [10]:
def get_document_similarity(input_document_vector, patents):
    """Mean cosine similarity between one document vector and its cited patents.

    input_document_vector: tensor (or any sized object) for the citing document;
        an empty one short-circuits to 0.
    patents: publication numbers of the cited documents.

    Each cited document is fetched, embedded with get_doc_vector and compared via
    get_similarity; documents without a 'vector' in the embedding response are
    skipped. Returns the mean of the scores, or 0 when there are none.
    """
    if len(input_document_vector) == 0:
        return 0

    #get title, abstract, and claims
    cited_docs = get_bwd_citation_data(patents)

    scores = []
    for _, cited in cited_docs.iterrows():
        embedding = get_doc_vector(cited['title'], cited['abstract'], cited['independent_claims']).get('vector', [])
        if len(embedding) == 0:
            continue
        similarity = get_similarity(input_document_vector, torch.tensor(embedding))
        scores.append(similarity.numpy().item())

    return mean(scores) if len(scores) > 0 else 0

In [None]:
import re
from typing import Collection, Set
from sortedcontainers import SortedSet  # Using SortedSet for ordered sets

class TechnologyClassifier:
    """Maps IPC codes to technology sectors and fields.

    The lookup tables live on the class and are filled from 'ipc_technology.tsv'
    (three tab-separated columns: sector, field, IPC code pattern) every time the
    class is instantiated.
    """
    # Four-character IPC prefix (e.g. "A01B") -> [sector, field]
    first_four_chars_to_sector = {}
    # Four-character prefix followed by the group number (e.g. "A01B12") -> [sector, field]
    before_slash_to_sector = {}

    @classmethod
    def __init__(cls):
        # These definitions come from https://www.wipo.int/export/sites/www/ipstats/en/statistics/patents/xls/ipc_technology.xls
        prefix_pattern = re.compile(r"([A-Z]\d{2}[A-Z])%")
        group_pattern = re.compile(r"([A-Z]\d{2}[A-Z])\s*([1-9]\d*)/?%")

        with open('ipc_technology.tsv', 'r', encoding='utf-8') as definitions:
            for line in definitions:
                columns = line.strip().split("\t")
                if len(columns) != 3:
                    raise ValueError("Invalid line in technologies definition file: " + line)
                sector, field, code = (column.strip() for column in columns)

                # Try the shorter "XNNX%" form first, then "XNNX <group>/%".
                prefix_match = prefix_pattern.match(code)
                if prefix_match:
                    cls.first_four_chars_to_sector[prefix_match.group(1)] = [sector, field]
                    continue

                group_match = group_pattern.match(code)
                if not group_match:
                    raise ValueError("Unexpected IPC code in technologies definition file: " + line)
                cls.before_slash_to_sector[group_match.group(1) + group_match.group(2)] = [sector, field]

    @classmethod
    def classify(cls, ipc_codes_first_four_chars: Collection[str], ipc_before_slash: Collection[str]):
        """Collect the sectors and fields matching the given IPC code fragments.

        ipc_codes_first_four_chars: codes looked up in first_four_chars_to_sector
        ipc_before_slash: codes looked up in before_slash_to_sector

        Unknown codes are ignored. Returns a TechnologyClassifierResult holding two
        SortedSet instances (sectors, fields).
        """
        sectors = SortedSet()
        fields = SortedSet()

        lookups = (
            (ipc_codes_first_four_chars, cls.first_four_chars_to_sector),
            (ipc_before_slash, cls.before_slash_to_sector),
        )
        for codes, table in lookups:
            for code in codes:
                entry = table.get(code)
                if entry:
                    sectors.add(entry[0])
                    fields.add(entry[1])

        return TechnologyClassifierResult(sectors, fields)


class TechnologyClassifierResult:
    """Holds the sector names and field names produced by one classify() call."""

    def __init__(self, sectors: Set[str], fields: Set[str]):
        # Both collections are stored as passed in; no copy is made.
        self.sectors, self.fields = sectors, fields

# Instantiating the classifier loads the lookup tables from ipc_technology.tsv;
# the classify() call on two sample codes then serves as a quick sanity check.
# `tech_classifier` is reused by the scraping cell further down.
tech_classifier = TechnologyClassifier()
result = tech_classifier.classify(["A01B", "C07D"], ["A01B12", "C07D100"])

print("Sectors:", result.sectors)
print("Fields:", result.fields)

In [12]:
# Accumulator for scraped feature rows; stays None until the scraping cell
# concatenates its first rows onto it. Re-running this cell discards collected rows.
final_dataset = None

In [None]:
# Fetch one page (rows=100) of collapsed patent families from the Solr backup
# index, derive one feature row per usable document and append the rows to
# `final_dataset`. `nextCursorMark` is replaced by the cursor returned for the
# following page.
nextCursorMark = "AoEuQ04tMTAyMjg3OTcwLUE="
url_and_query = "http://solrifi.patdocs.com:8981/solrbackup/ificlaims/pbsearch?q=fam=(pdus=2019 and grant=us and design=false and plant=false) and nctb > 0 and nctf >0&fl=id, grantucid, titles, claims, abstracts, mainipcvalues, assignees_name, independentclaimscount, expectedexpirydate, ipctechnologyfield, usapplicationstatus, ipcrvalues, backwardcitations:[json], backwardcitationscount, drawingscount, forwardcitationscount,forwardcitations:[json]&fq={!collapse field=simplefamily}&expand=true&sort=id asc&rows=100&cursorMark=" + nextCursorMark
# Example forward-citation query; not used by this cell.
query_fwd = "http://solrifi.patdocs.com:8981/solrbackup/ificlaims/pbsearch?q=ctf=(pn=CN-108424449-A)&fq={!collapse field=applicationnumber}&sort=id asc&cursorMark=" + nextCursorMark
try:
    r = requests.get(url_and_query, timeout=36000)
    r.raise_for_status()
except requests.RequestException as err:
    # Without a page there is nothing to process, so report and stop here.
    print("overall query", err)
    raise
response = r.json()

documents = response["response"]["docs"]
nextCursorMark = response["nextCursorMark"]
print("start")

# Features per document:
#   independent claim count, number of ipcr values, drawing count,
#   number of backward citations, forward citations received by the assignees,
#   mean document similarity with the backward citations, technology field.
new_rows = []
try:
    for doc in documents:
        doc_ipc = doc.get("mainipcvalues", " ")
        doc_grantucid = doc.get("grantucid", " ")
        slash_index = doc_ipc.find("/")

        # Decide whether the document is usable BEFORE any network call is spent on it.
        if slash_index == -1 or doc_grantucid == " " or doc_ipc == " ":
            continue
        result = tech_classifier.classify([doc_ipc[:4]], [doc_ipc[:slash_index]])
        if len(result.fields) == 0:
            continue
        doc_ipctechnologyfield = result.fields[0]

        claims_text = [strip_tags(claim)[0] for claim in doc.get("claims", " ")]
        doc_all_claims = [''.join(claims_text)]

        doc_id = doc.get("id", " ")
        doc_abstract, _ = strip_tags(doc.get("abstracts", " ")[0])
        doc_title, _ = strip_tags(doc.get("titles", " ")[0])

        doc_indclaimscount = doc.get("independentclaimscount", " ")
        doc_drawingscount = doc.get("drawingscount", 0)
        doc_bwdcitationscount = doc.get("backwardcitationscount", 0)
        doc_ipcrvaluescount = len(doc.get("ipcrvalues", []))
        doc_forwardcitationscount = doc.get("forwardcitationscount", 0)

        doc_org_citations_count = 0
        #http://solrifi.patdocs.com:8981/solrbackup/ificlaims/pbsearch?q=CTF=(pa=FAIRTECH%20INVESTMENT%20LTD)%20AND%20PD=NOW-5y:NOW&fq={!collapse%20field=applicationnumber}&rows=0
        for org in doc.get("assignees_name", []):
            # Percent-encode the name so characters such as '&' or '#' cannot cut the
            # query string short (urllib.parse is loaded by `import urllib.request`).
            query = ("http://solrifi.patdocs.com:8981/solrbackup/ificlaims/pbsearch?q=CTF=(pa="
                     + urllib.parse.quote(org, safe="")
                     + ") AND PD=NOW-5y:NOW&fq={!collapse field=applicationnumber}&rows=0")
            try:
                # Send the per-organisation query (previously the page query was re-sent).
                r1 = requests.get(query, timeout=36000)
                r1.raise_for_status()
                doc_org_citations_count += int(r1.json()["response"].get("numFound", 0))
            except (requests.RequestException, ValueError, KeyError) as err:
                # Best effort: an organisation whose lookup fails contributes 0.
                print("org cit count", org, err)
            time.sleep(1)

        bwdcitation_ucids = [d.get("ucid", "") for d in doc.get("backwardcitations", []) if d.get("ucid", "")]

        doc_vector = get_doc_vector(title= doc_title, abstract= doc_abstract, independent_claims= doc_all_claims)

        doc_avgclaimsimilaritybwdcitations = 0
        if len(bwdcitation_ucids) > 0:
            doc_avgclaimsimilaritybwdcitations = get_document_similarity(torch.tensor(doc_vector.get('vector', [])), bwdcitation_ucids)

        # Same order as final_dataset_columns.
        new_rows.append([doc_id, [doc_vector], doc_ipctechnologyfield, doc_indclaimscount,
                         doc_bwdcitationscount, doc_drawingscount, doc_avgclaimsimilaritybwdcitations,
                         doc_ipcrvaluescount, doc_org_citations_count, doc_forwardcitationscount])
finally:
    # One concat per page (also when the loop is interrupted) keeps the rows gathered so far.
    if new_rows:
        final_dataset = pd.concat([final_dataset, pd.DataFrame(new_rows, columns=final_dataset_columns)], ignore_index=True)
print("\nfinal dataset:", final_dataset)


In [None]:
print(nextCursorMark)

In [None]:
print(nextCursorMark)
final_dataset.to_csv('/home/skalsi@minesoft.local/predict-fwd-citation/fwd_citations_1.csv', index=True)

In [None]:
print(final_dataset)

In [None]:
a = final_dataset['ipc_tech_field'].unique()
print(a)

In [None]:
# Embed every distinct technology-field name once and keep the vectors keyed by name.
ipc_tech_field = {tech_field: get_claim_vector(tech_field)['vector'] for tech_field in a}
print(ipc_tech_field)

In [None]:
def get_tech_field_vector(tech_field):
    """Return the cached embedding of one technology-field name.

    Reads the module-level ``ipc_tech_field`` dict; a name that is not in it
    raises KeyError.
    """
    vector = ipc_tech_field[tech_field]
    return vector

In [None]:
final_dataset['ipc_tech_field_vector'] = final_dataset['ipc_tech_field'].apply(get_tech_field_vector)

In [None]:
print(final_dataset.columns)

In [None]:
# Displays a copy without 'document_vector'. The result is not assigned, so
# final_dataset itself keeps the column.
final_dataset.drop(['document_vector'], axis=1)

In [None]:
final_dataset.to_csv('/home/skalsi@minesoft.local/predict-fwd-citation/fwd_citations_1.csv', index=True)

In [None]:
zero1 = pd.read_csv('/home/skalsi@minesoft.local/predict-fwd-citation/fwd_citations_0_1.csv')
zero2 = pd.read_csv('/home/skalsi@minesoft.local/predict-fwd-citation/fwd_citations_0_2.csv')

In [None]:
zero = [zero1, zero2]

In [None]:
zerodf = pd.concat(zero, ignore_index=True)
print(zerodf)

In [None]:
zerodf = zerodf.drop(['Unnamed: 0'], axis=1)
print(zerodf)

In [None]:
final_dataset=zero1
final_dataset = final_dataset.drop(['Unnamed: 0'], axis=1)

In [None]:
final_dataset = pd.read_csv("/home/skalsi@minesoft.local/predict-fwd-citation/fwd_citations_no_NA.csv")

In [None]:
def get_vector(entry):
    """Parse one serialised 'document_vector' cell and return its embedding.

    entry: str holding a Python literal whose first element is a dict

    returns that dict's 'vector' value, or 0 when the key is missing
    """
    first_item = literal_eval(entry)[0]
    return first_item.get("vector", 0)

In [None]:
print(final_dataset['document_vector'])

In [None]:
final_dataset['doc_vector'] = final_dataset['document_vector'].apply(get_vector)
print(final_dataset)

In [None]:
print(final_dataset['doc_vector'][0])

In [None]:
final_dataset['ipc_tech_field_vector'] = final_dataset['ipc_tech_field'].apply(get_tech_field_vector)
print(final_dataset['ipc_tech_field_vector'])

In [None]:
zero_df = final_dataset

In [None]:
# Reload the cleaned dataset, turning the two stringified list columns back into
# Python lists. literal_eval only accepts literals and has no limit on list
# length; pd.eval is an expression evaluator and is reported to fail on list
# literals longer than 100 items.
full_dataset = pd.read_csv(
    '/home/skalsi@minesoft.local/predict-fwd-citation/fwd_citations_no_NA.csv',
    converters={'doc_vector': literal_eval, 'ipc_tech_field_vector': literal_eval},
)
# Feature frame: everything except the identifier, the raw field name and the target.
X = full_dataset.drop(['id', 'ipc_tech_field', 'fwd_citation_count'], axis=1)
print(X.info())

In [None]:
print(X)

In [None]:
# Target: raw forward-citation counts, plus a log(1 + count) transformed copy.
y = full_dataset['fwd_citation_count'].values
y_transform = [math.log1p(cit) for cit in y]  # alternative tried: cit**(1/3)

In [None]:
# Summary statistics and column list of each frame (the second line previously
# printed zero_df's columns next to final_dataset's statistics).
print(zero_df.describe(), zero_df.columns)
print(final_dataset.describe(), final_dataset.columns)

In [None]:
zero_df = zero_df.drop(['document_vector', 'patent_age'], axis=1)
final_dataset = final_dataset.drop(['Unnamed: 0', 'document_vector', 'patent_age', 'ipc_tech_field_vector'], axis=1)

In [None]:
X = [final_dataset, zero_df]
X = pd.concat(X, ignore_index=True)
print(X)

In [None]:
print(X.describe())

In [None]:
from sklearn.preprocessing import OneHotEncoder
print(X['ipc_tech_field'].unique())
encoder = OneHotEncoder(sparse_output=False)
enc_df = pd.DataFrame(encoder.fit_transform(X[['ipc_tech_field']]))

In [None]:
print(enc_df)
print(len(X['ipc_tech_field'].unique()))
enc_df["combined"] = enc_df.apply(pd.Series.tolist,axis=1)
print(enc_df)

In [None]:
enc_df['combined'] = enc_df['combined'].apply(lambda x: np.array(x))
print(enc_df['combined'])

In [None]:
X['ipc_tech_field_one_hot'] = enc_df['combined']

In [None]:
#concatenate doc vector and ipc tech field vector
'''
X['doc_techfield'] = X.apply(lambda row: np.divide(np.add(np.array(row['doc_vector']).flatten(), np.array(row['ipc_tech_field_vector']).flatten()), 2), axis=1)
#print(X['avg_vector'])
arr_X_avg_vector = []#numpy.zeros((103806, 769))
i = 0
for item in X['avg_vector']:
    arr_X_avg_vector.append(list(item))
    #print(arr_X_avg_vector[i])
    i+=1
X_avg_vector = np.array(arr_X_avg_vector)
print(X_avg_vector.shape)
#l2 normalize X_avg_vector
X_avg_vector_norm = preprocessing.normalize(X_avg_vector, norm='l2')
print(X_avg_vector_norm.shape)
'''

# Per row: flattened document vector followed by the flattened one-hot field encoding.
X['doc_techfield'] = X.apply(
    lambda row: np.concatenate((
        np.array(row['doc_vector']).flatten(),
        np.array(row['ipc_tech_field_one_hot']).flatten(),
    )),
    axis=1,
)
# Collect the per-row vectors into one array for normalisation.
arr_X_concat = [list(item) for item in X['doc_techfield']]
X_concat = np.array(arr_X_concat)
#l2 normalize X_concat (row-wise)
X_concat_norm = preprocessing.normalize(X_concat, norm='l2')
print(X_concat.shape)
X['doc_techfield'] = pd.Series(X_concat_norm.tolist()).to_frame()


In [None]:
print(len(X['doc_techfield'][0]))

In [None]:
# Per row: patent age followed by the flattened document vector.
X['concat1'] = X.apply(lambda row: np.concatenate((np.array(row['patent_age']).flatten(), np.array(row['doc_vector']).flatten())), axis=1)
arr_X_concat = [list(item) for item in X['concat1']]
X_concat1 = np.array(arr_X_concat)
# l2-normalise the matrix built in THIS cell (previously X_concat from the
# preceding cell was normalised and used for the split by mistake).
X_concat_norm1 = preprocessing.normalize(X_concat1, norm='l2')
print(X_concat1.shape)

X_train, X_test, y_train, y_test = train_test_split(X_concat_norm1, y, test_size=0.1, random_state=42)

In [None]:
print(X_concat_norm)

In [None]:
# NOTE(review): X_avg_vector_norm is only assigned inside the triple-quoted
# (disabled) block of an earlier cell, so on a fresh kernel this line appears to
# need that code re-enabled — confirm.
new_df = pd.Series(X_avg_vector_norm.tolist()).to_frame()

In [None]:
print(new_df)

In [None]:
#concat,doc_vector,fwd_citation_count,fwd_citation_class

In [None]:
new_df['doc_vector'] = final_dataset['doc_vector']
new_df['fwd_citation_count'] = final_dataset['fwd_citation_count']

In [None]:
print(new_df)

In [None]:
new_df.columns = ['doc_ipc_vector_avg', 'doc_vector', 'fwd_citation_count']

In [None]:
print(X.columns)

In [None]:
print(X['ipc_tech_field_one_hot'])

In [None]:
# Bucket the forward-citation counts into ordinal classes:
#   0 -> 1, (0, 3] -> 2, (3, 10] -> 3, (10, 100] -> 4, > 100 -> 5,
#   anything else (e.g. negative or NaN) -> 0.
# Finer bins above 100 ((100, 500], (500, 1000], > 1000) were considered but are disabled.
y_count = X['fwd_citation_count'].values
y_class = [
    1 if v == 0 else
    2 if 0 < v <= 3 else
    3 if 3 < v <= 10 else
    4 if 10 < v <= 100 else
    5 if 100 < v else
    0
    for v in y_count
]
print(y_class)
X['fwd_citation_class']=pd.Series(y_class)
print(X['fwd_citation_class'])

In [None]:
print(new_df)

In [None]:
X.to_csv('/home/skalsi@minesoft.local/predict-fwd-citation/fwd_citations_class_.csv', index=False)

In [None]:
full_zero_df = new_df

In [None]:
# NOTE(review): the preceding cell binds full_zero_df to new_df, so unless new_df
# is rebuilt in between, both list entries are the same frame and the later
# concat duplicates every row — confirm this is intended.
full_df = [full_zero_df, new_df]
print(full_df)

In [None]:
full_processed_dataset = pd.concat(full_df, ignore_index=True)

In [None]:
print(full_processed_dataset)

In [None]:
full_processed_dataset.to_csv('/home/skalsi@minesoft.local/predict-fwd-citation/fwd_citations_class_full_processed.csv', index=False)

In [None]:
from ast import literal_eval 
processed_df = pd.read_csv('/home/skalsi@minesoft.local/predict-fwd-citation/fwd_citations_processed_1.csv')

In [None]:
processed_df["concat"] = processed_df["concat"].apply(lambda x: literal_eval(x))
processed_df["concat"] = processed_df["concat"].apply(lambda x: np.array(x))


In [None]:
X_concat = processed_df["concat"].to_numpy()
X_concat = np.vstack(X_concat)
print(X_concat.shape)

In [None]:
y = processed_df["fwd_citation_count"].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_concat, y, test_size=0.1, random_state=42)

In [None]:
#without patent age
X_new = X.drop(['patent_age'], axis=1)
# Per row: flattened document vector followed by the flattened tech-field vector.
X_new['flat_feature'] = X_new.apply(lambda row: np.concatenate((np.array(row['doc_vector']).flatten(), np.array(row['ipc_tech_field_vector']).flatten())), axis=1)
print(X_new['flat_feature'])

# Collect the per-row vectors into one array. The per-row print was removed:
# it wrote one full vector per dataset row to the cell output.
arr_X_flat_feature = [list(item) for item in X_new['flat_feature']]
X_new_flat_feature = np.array(arr_X_flat_feature)
print(X_new_flat_feature.shape)

In [None]:
print(np.mean(y))
print(np.std(y))
print(np.max(y))

In [None]:
y_df = pd.DataFrame(y, columns =['fwd_citation_count'])
sns.displot(y_df, x = "fwd_citation_count")#.set_title("Distribution of forward citation counts")

## TSNE

In [None]:
#t-SNE dimensionality reduction to 3 components

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# random_state pins the otherwise stochastic embedding, matching the seed used
# for the train/test splits in this notebook.
tsne = TSNE(n_components=3, random_state=42)
# NOTE(review): X_only_vector is not defined in any visible cell — confirm where it comes from.
embedding = tsne.fit_transform(X_only_vector)
X_train, X_test, y_train, y_test = train_test_split(embedding, y, test_size=0.3, random_state=42)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Plot the first embedding component of the TRAINING rows against their targets.
# `embedding` holds every row (and three columns) while y_train only holds the
# training part, so scatter() cannot pair them.
plt.figure(figsize=(8, 6))
plt.scatter(X_train[:, 0], y_train, s=10, color='blue')
plt.title('2D Plot of y vs. X', fontsize=16)
plt.xlabel('X', fontsize=14)
plt.ylabel('y', fontsize=14)
plt.grid(True)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Create a 3D plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Plot the training rows: first two embedding components against the target.
# X_train is used instead of `embedding` so that x, y and z have the same length.
ax.scatter(X_train[:, 0], X_train[:, 1], y_train, c=y_train, cmap='viridis')

# Set labels and title
ax.set_xlabel('Vector Dimension 1', fontsize=12)
ax.set_ylabel('Vector Dimension 2', fontsize=12)
ax.set_zlabel('Dependent Variable', fontsize=12)
ax.set_title('3D Plot with Vector Data and Dependent Variable', fontsize=14)

plt.show()

In [None]:
print(X_train.shape)
print(X_train[0].shape)
# Inserts a length-1 axis at position 1 (a 2-D (n, d) array becomes (n, 1, d)).
# The cell is not idempotent: every re-run adds another axis to X_train.
X_train = np.expand_dims(X_train, axis=1)
print(X_train.shape)

In [None]:
from sklearn.metrics import r2_score
rf_model = RandomForestRegressor(n_estimators=50, random_state=42)
# The model needs one row per sample. Fitting on X_train[0] passed a single
# sample against all targets. Flattening every sample to 1-D also undoes the
# extra axis that the expand_dims cell adds to X_train.
X_train_2d = np.asarray(X_train).reshape(len(X_train), -1)
X_test_2d = np.asarray(X_test).reshape(len(X_test), -1)
rf_model.fit(X_train_2d, y_train)
predictions = rf_model.predict(X_test_2d)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(mse**0.5)  # RMSE
print(r2)

In [None]:
# NOTE(review): reset_index is a pandas method. The splits earlier in this
# notebook are made from NumPy arrays, which have no such method, so this cell
# presumably belongs to a variant where X_train / y_train are pandas objects — confirm.
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
#X_train=X_train.drop(['index'],axis=1)
#y_train=y_train.drop(['index'],axis=1)
print(X_train)
print(y_train)

In [None]:
#xgboost
# Baseline XGBoost regressor with default hyper-parameters.
model = xgb.XGBRegressor(eval_metric = "rmsle")
model.fit(X_train, y_train)
predictions = model.predict(X_test)
mse_xg = mean_squared_error(y_test, predictions)
# Root mean squared error on the test split.
print(mse_xg**0.5)
# R^2 is computed here but not printed in this cell.
r2 = r2_score(y_test, predictions)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

#hyperparameter distributions
param_dist = {
    'max_depth': stats.randint(3, 10),
    'eta': stats.uniform(0.01, 0.1),
    'subsample': stats.uniform(0.5, 0.5),
    'n_estimators':stats.randint(50, 200)
}

# XGBoost model object
xgb_model = xgb.XGBRegressor(eval_metric = "rmsle")
#grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, cv=5)


# Fit the GridSearchCV object to the training data
random_search.fit(X_train, y_train)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

In [None]:
import matplotlib.pyplot as plt

model = xgb.XGBRegressor(learning_rate = random_search.best_params_["eta"],
                        max_depth  = random_search.best_params_["max_depth"],
                        subsample = random_search.best_params_["subsample"],
                        n_estimators = random_search.best_params_["n_estimators"])
model.fit(X_train, y_train)
predictions = model.predict(X_test)

pr = []
for p in predictions:
    if p<0:
        p = 0
    pr.append(p)

predictions = np.array(pr)
mse_xg = mean_squared_error(y_test, predictions)

print(mse_xg**0.5)
r2 = r2_score(y_test, predictions)
print(r2)

rmsle = (mean_squared_log_error(y_test, predictions))**0.5
print(rmsle)

In [None]:
# Plot the results
# The x axis is just the position in the test split; its length is taken from
# y_test instead of a hard-coded sample count so it cannot go out of sync.
idx = list(range(len(y_test)))
plt.plot(idx, y_test, color='blue', label='Actual')
plt.plot(idx, predictions, color='red', label='Predicted')
plt.title('XG Boost Regression')
plt.xlabel('Input feature')
plt.ylabel('Target variable')
plt.legend()
plt.show()

In [None]:
print(full_dataset['fwd_citation_count'].describe())

In [None]:
print(np.mean(y_test))
print(np.std(y_test))
print(np.var(y_test))

print("\n\n")

print(np.mean(predictions))
print(np.std(predictions))
print(np.var(predictions))

In [None]:
# NOTE(review): `data` and its 'price_log1p' column are not defined in any visible
# cell; this looks like a leftover from another analysis — confirm or remove.
sns.distplot(data['price_log1p']).set_title("Distribution of log(1 + price)")

### SVM

In [None]:
from sklearn.svm import SVR
import matplotlib.pyplot as plt

svm_regressor = SVR(kernel='poly', C=10, gamma='scale')  #polynomial kernel for non-linear regression
svm_regressor.fit(X_train, y_train)

predictions = svm_regressor.predict(X_test)

#Calculate Mean Squared Error (MSE) as a performance metric
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

In [None]:
# Calculate R-squared value
r2 = r2_score(y_test, predictions)
print("R-squared svm regressor", r2)

print("Root Mean Squared Error:", mse**0.5)

print(np.mean(y_test))
print(np.std(y_test))
print(np.var(y_test))

print("\n\n")

print(np.mean(predictions))
print(np.std(predictions))
print(np.var(predictions))

In [None]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, predictions)
print(mae)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
import matplotlib.pyplot as plt

param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(SVR(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)

print(grid.best_estimator_)

predictions = grid.predict(X_test)
#Calculate Mean Squared Error (MSE) as a performance metric
mse = mean_squared_error(y_test, predictions)
print("Root Mean Squared Error:", mse**0.5)

In [None]:
from sklearn.metrics import r2_score

# Calculate R-squared value
r2 = r2_score(y_test, predictions)
print("R-squared svm regressor", r2)

# Plot the results
# The x axis is the position in the test split; its length follows y_test
# instead of a hard-coded sample count.
idx = list(range(len(y_test)))
plt.scatter(idx, y_test, color='blue', label='Actual')
plt.scatter(idx, predictions, color='red', label='Predicted')
plt.title('Support Vector Machine (SVM) Regression')
plt.xlabel('Input feature')
plt.ylabel('Target variable')
plt.legend()
plt.show()

In [None]:
#Artificial Neural Network
# The model is built from the `keras` package this cell already imports from.
# `tf` is never imported in this notebook (the tensorflow import is commented
# out), so the earlier tf.keras.* references could not resolve on a fresh kernel.
from keras.optimizers import Adam
from matplotlib import pyplot
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Dropout

train = X_train
target = np.array(y_train)
test = X_test

# Architecture: dropout on the inputs, then 256-128-(dropout)-32-8-(dropout)-1.
NN_model = Sequential()
NN_model.add(Dropout(0.5, input_dim = train.shape[1]))
NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))
NN_model.add(Dense(128, kernel_initializer='normal', activation='relu'))
NN_model.add(Dropout(0.5))
NN_model.add(Dense(32, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(8, kernel_initializer='normal',activation='relu'))
NN_model.add(Dropout(0.5))
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))
# `learning_rate` is the supported argument name; `lr` is rejected by current Keras.
NN_model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=1e-3), metrics=['mean_squared_error'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
NN_model.summary()

# Keep the best weights (lowest validation loss) and stop after 10 epochs without improvement.
checkpoint_name = 'Weights-0.5-256-128-0.5-32-8-0.5-1-mse-{epoch:03d}--{val_loss:.5f}.hdf5'
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint, es]
history = NN_model.fit(train, target, epochs=1000, batch_size=32, validation_split = 0.2, callbacks=callbacks_list)
print(history)
print(NN_model)
predictions = NN_model.predict(test)

In [None]:
print(history.history.keys())

In [None]:
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

In [None]:
print(predictions)

In [None]:
r2 = r2_score(y_test, predictions)
print(r2)
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n test samples and
# p predictors. The earlier expression divided (n - 1) by (n - 1) and therefore
# just reproduced r2.
n_samples = len(y_test)
n_features = X_test.shape[1]  # assumes X_test is 2-D (samples, features) — TODO confirm
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
print(adj_r2)
mse_nn = mean_squared_error(y_test, predictions)
print(mse_nn**0.5)  # RMSE
print(mse_nn)
mae_nn = mean_absolute_error(y_test, predictions)
print(mae_nn)

In [None]:
print(np.mean(y_test))
print(np.std(y_test))
print(np.var(y_test))

print("\n\n")

print(np.mean(predictions))
print(np.std(predictions))
print(np.var(predictions))

In [None]:
# Plot the results
# The x axis is the position in the test split; its length follows y_test
# instead of a hard-coded sample count.
idx = list(range(len(y_test)))
plt.scatter(idx, y_test, color='blue', label='Actual')
plt.scatter(idx, predictions, color='red', label='Predicted')
plt.title('NN')
plt.xlabel('Input feature')
plt.ylabel('Target variable')
plt.legend()
plt.show()

In [None]:
plt.plot(y_test[:100], color='blue', linestyle = 'dotted')
plt.plot(predictions[:100], color='red', linestyle = 'dotted')
plt.show()