# Etude des composés retenus

Objectif : Récupérer les composés sélectionnés et pour ceux qui ont un brevet, quel type de brevet.

## Prérequis



In [1]:
from google.colab import drive
drive.mount('/content/drive')

from pickle import load, dump
import requests
from bs4 import BeautifulSoup as bs

import ast

import pandas as pd

from tqdm import tqdm
tqdm.pandas()

import re

import seaborn as sns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Fonction pour convertir la représentation en chaine de caractères d'une liste vers une liste
def charListToList(charList):
  if charList == "None":
    y = "None"
  elif charList[0] != "[" :
    y = charList
  else :
    y = ast.literal_eval(charList)
    y = [n.strip() for n in y]
  return(y)



# Fonction pour récupérer la descirption d'un brevet à partir de son identifiant
def query_patentDescription(patentID):
  """
  Query the patent description from a patent ID

  Parameters
  ----------

  patentID : str
      ID number of patent

  Returns
  -------
  str
      Description of patent
  """
  try :
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/patent/{patentID}/JSON"
    r = requests.get(url)
    r.raise_for_status()
    key = r.json()["Record"]["RecordTitle"]
  except :
    key = "No title found"
  return(key)

# Fonction pour recoder les codes des brevets pour correspondre au format XX-X...X-XX
def recodePatentID(patentID):
  y = patentID[:2]+"-"+patentID[2:-2]+"-"+patentID[-2:]
  return(y)


def getPatentDescription(patentID):
  """
  Test to retrieve patent description from a patentID

  Parameters
  ----------
  patentID : str
      ID number of patent
  
  Returns
  -------
  str
      Patent description 

  """
  if patentID == "None" :
    y = "No title found"
  else :
    y = query_patentDescription(patentID)
    if y == "No title found" :
      patentID_recoded = recodePatentID(patentID)
      y = query_patentDescription(patentID_recoded)
    if y == "No title found" :
      y = requests.get("https://patents.google.com/patent/"+patentID+"/en")
      y_soup = bs(y.content, "html.parser")
      y = y_soup.find_all("title")[0]
      y = y.contents[0].split("\n")[0]
      #.split(" - ")[1]
      if y == "Error 404 (Not Found)!!1" :
        y = "No title found"
  
  return(y)

## Chargement des données

In [3]:
# Récupération de l'identifiant uniprot
with open("/content/drive/MyDrive/Colab Notebooks/these_exercice/output/01_compoundDataAcquisition/uniprot_id.txt", "rb") as file:
    uniprot_id = load(file)
print(uniprot_id)
# Récupération de toutes les molécules ayant des valeurs de pIC50 pour la protéine d'intérêt
molecules = pd.read_csv(f"/content/drive/MyDrive/Colab Notebooks/these_exercice/output/07_predictIC50/mol_pIC50sup9_{uniprot_id}.csv")


P08581


## EDA
### Nombre de molécules sous brevet 




In [4]:
print(f"{sum(molecules.patentID != 'None')} molécules sont sous brevet")

315 molécules sont sous brevet


### Nombres de brevet par molécules

In [5]:
molecules.patentID = molecules.patentID.apply(charListToList)

In [6]:
molecules["n_patent"] = molecules.patentID.apply(lambda x : len(x) if x != "None" else 0)

In [7]:
molecules[molecules.n_patent == max(molecules.n_patent)]

Unnamed: 0,smiles,molecular_weight,n_hba,n_hbd,logp,lipinski,fingerprint_df,predicted_pIC50,cid,patentID,n_patent
454,CCN(CC)CCN1C2=C(C=C(C=C2)[N+](=O)[O-])N=C1CC3=...,386.150954,5.0,0.0,4.5305,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,9.068846,62528,"[AT-11571-U1, AT-511581-A1, AU-1805901-A, AU-1...",15394


In [9]:
molecules.shape

(530, 11)

In [10]:
molecules.head(50)

Unnamed: 0,smiles,molecular_weight,n_hba,n_hbd,logp,lipinski,fingerprint_df,predicted_pIC50,cid,patentID,n_patent
0,C1OC2=C(O1)C=C(C=C2)OCC3=NN=C(N3C4=CC=CC=C4)SC...,462.099791,9.0,0.0,4.7755,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,11.304494,2196336,,0
1,CC1=NC2=CC=CC=C2C(=C1)CNC(=O)CN3C(=C(C(=N3)C)[...,353.148789,6.0,1.0,2.58116,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,11.02791,1504530,,0
2,CC=C(C1=CC(=CC=C1)[N+](=O)[O-])N2C(=NN=C2SCC3=...,410.104876,8.0,0.0,4.42482,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,10.971951,0,,0
3,CC1CC2=NN(C(=C2C3=NC(=NC=C13)NC4=[N+](NC(=C4)C...,420.169798,5.0,2.0,4.06172,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,10.967983,86765938,"[US-2012238542-A1, US-8735386-B2, US2012023854...",4
4,COC(=O)NC1=NC2=C(N1)C=C(C=C2)SC3=NN=C4N3N=C(C=...,435.091372,8.0,2.0,4.1362,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,10.868773,91262312,"[AU-2008320791-A1, AU-2008320791-B2, EP2178881...",16
5,C1OC2=C(O1)C=C(C=C2)OCC3=NN=C(N3C4=CC=CC=C4)SC...,485.115775,10.0,0.0,4.9173,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,10.832048,5301698,,0
6,CC1=CN=C(C=C1)NC(=O)C2=C(NC3=C(C2C4=CC(=C(C=C4...,480.156433,6.0,2.0,5.19432,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,10.77725,3682580,,0
7,CC1=C(C(C2=C(N1)CC(CC2=O)(C)C)C3=C(C=CC(=C3)[N...,466.140783,6.0,2.0,4.8859,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,10.77725,5057008,,0
8,C1OC2=C(O1)C=C(C=C2)OCC3=NN=C(N3C4=CC=CC=C4)SC...,463.100205,8.0,0.0,4.6891,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,10.767604,2104621,,0
9,CC1=CC(=NN1)NC2=CC(=NC3=NC(=NN23)SCC4=NC5=CC=C...,391.132763,8.0,3.0,3.37644,True,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,10.702793,162649451,,0


In [8]:
# Enregistrement des patentID restant à trouver leur description
with open("/content/drive/MyDrive/Colab Notebooks/these_exercice/input/08_exploreMolecules/remainingPatentDesc", "wb") as file:
    dump(molecules.patentID, file)
# Initialisation de la liste des descriptions
with open("/content/drive/MyDrive/Colab Notebooks/these_exercice/input/08_exploreMolecules/PatentDesc", "wb") as file:
      dump(list(), file)

In [None]:
# Load initial patentID list
with open("/content/drive/MyDrive/Colab Notebooks/these_exercice/input/08_exploreMolecules/remainingPatentDesc", "rb") as file:
    patentIDs = load(file)
with open("/content/drive/MyDrive/Colab Notebooks/these_exercice/input/08_exploreMolecules/PatentDesc", "rb") as file:
    listDescription = load(file)

remainingIDs = patentIDs.reset_index(drop = True)

for listID in tqdm(range(len(patentIDs))) :
  tmplistID = molecules.patentID[listID]
  print(f"NUmber fo patent to retrieve : {len(tmplistID)}")
  if tmplistID == "None" :
    listDescription.append("No title found")
    remainingIDs = remainingIDs.drop(listID)
  else :
    tmpDesc = []
    for ID in tmplistID :
      tmpDesc.append(getPatentDescription(ID))
    listDescription.append(tmpDesc)
    remainingIDs = remainingIDs.drop(listID)
    # Enregistrement des IDs restant
    with open("/content/drive/MyDrive/Colab Notebooks/these_exercice/input/08_exploreMolecules/remainingPatentDesc", "wb") as file:
      dump(remainingIDs, file)
    # Enregistrement de la description faite
    with open("/content/drive/MyDrive/Colab Notebooks/these_exercice/input/08_exploreMolecules/PatentDesc", "wb") as file:
      dump(listDescription, file)

  0%|          | 0/46 [00:00<?, ?it/s]

NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 4


  4%|▍         | 2/46 [00:13<04:55,  6.72s/it]

NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 132


  9%|▊         | 4/46 [05:02<1:01:22, 87.67s/it]

NUmber fo patent to retrieve : 2


 11%|█         | 5/46 [05:09<43:34, 63.77s/it]  

NUmber fo patent to retrieve : 49


 13%|█▎        | 6/46 [06:46<49:11, 73.79s/it]

NUmber fo patent to retrieve : 5


 15%|█▌        | 7/46 [06:50<34:27, 53.01s/it]

NUmber fo patent to retrieve : 9


 17%|█▋        | 8/46 [07:31<31:21, 49.52s/it]

NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 1


 28%|██▊       | 13/46 [07:32<08:43, 15.87s/it]

NUmber fo patent to retrieve : 4


 30%|███       | 14/46 [07:34<07:17, 13.68s/it]

NUmber fo patent to retrieve : 46


 33%|███▎      | 15/46 [08:04<08:39, 16.76s/it]

NUmber fo patent to retrieve : 14


 35%|███▍      | 16/46 [08:45<10:56, 21.87s/it]

NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 10


 46%|████▌     | 21/46 [08:48<03:46,  9.07s/it]

NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 17


 52%|█████▏    | 24/46 [09:27<03:48, 10.39s/it]

NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 76


 57%|█████▋    | 26/46 [10:18<04:40, 14.03s/it]

NUmber fo patent to retrieve : 43


 59%|█████▊    | 27/46 [10:48<05:13, 16.51s/it]

NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 11


 63%|██████▎   | 29/46 [11:37<05:20, 18.85s/it]

NUmber fo patent to retrieve : 3


 65%|██████▌   | 30/46 [11:40<04:17, 16.11s/it]

NUmber fo patent to retrieve : 9


 67%|██████▋   | 31/46 [11:54<03:53, 15.59s/it]

NUmber fo patent to retrieve : 10


 70%|██████▉   | 32/46 [12:34<04:54, 21.01s/it]

NUmber fo patent to retrieve : 27


 72%|███████▏  | 33/46 [12:57<04:39, 21.48s/it]

NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 1


 76%|███████▌  | 35/46 [12:57<02:20, 12.81s/it]

NUmber fo patent to retrieve : 217


 78%|███████▊  | 36/46 [15:30<07:24, 44.45s/it]

NUmber fo patent to retrieve : 17


 80%|████████  | 37/46 [15:49<05:43, 38.22s/it]

NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 54


 85%|████████▍ | 39/46 [16:10<03:07, 26.84s/it]

NUmber fo patent to retrieve : 7


 87%|████████▋ | 40/46 [16:19<02:16, 22.79s/it]

NUmber fo patent to retrieve : 94


 89%|████████▉ | 41/46 [17:30<02:52, 34.50s/it]

NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 4
NUmber fo patent to retrieve : 2


 98%|█████████▊| 45/46 [17:30<00:14, 14.51s/it]

NUmber fo patent to retrieve : 1


100%|██████████| 46/46 [17:31<00:00, 22.85s/it]


In [None]:
with open("/content/drive/MyDrive/Colab Notebooks/these_exercice/input/08_exploreMolecules/remainingPatentDesc", "rb") as file:
    patentIDs = load(file)
with open("/content/drive/MyDrive/Colab Notebooks/these_exercice/input/08_exploreMolecules/PatentDesc", "rb") as file:
    listDescription = load(file)

print(len(patentIDs))
print(listDescription)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

