# Project #2 Data Cleaning with Text in Pandas

### Importing Packages

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import re
from pathlib import Path

## Importing Data

In [2]:
Path = r"/Users/peterlyon/Downloads/Coding+Test+L3+Datafiles/patent_drawing.csv"
Patents = pd.read_csv(os.path.normpath(Path))
Patents

Unnamed: 0,uuid,patent_id,text
0,urfl2ulyjgez01g5selfflnz7,4491930,A better understanding of the invention may be...
1,nqdxwthotlcted3d961ao373x,4490979,A better understanding of the invention will b...
2,9mwinm7as0p0j3245tdxhfuiz,4491969,A better understanding of the invention will b...
3,l1n6w0ofqic6yow2t7qwmvqry,4490948,A better understanding of the present inventio...
4,86bndneq4omf3mfxi60dzr5mx,4491426,A better understanding of the present inventio...
...,...,...,...
8151,bs3gaqy956cbznfcscxl8zrrj,4491812,While the specification concludes with claims ...
8152,nyj0iok0vet86cdkt0h70f53r,4491485,"With particular reference to FIG. 2, it is not..."
8153,wpvtafbht8yhm3lcvlc53r7hv,4491167,With the above and other objects and advantage...
8154,jwmtihlklmnjnjaok71ngr8ia,4490920,"With the foregoing and other objects in view, ..."


In [3]:
Patents.describe()

Unnamed: 0,uuid,patent_id,text
count,8156,8156,8156
unique,8156,1096,7722
top,urfl2ulyjgez01g5selfflnz7,4491287,BRIEF DESCRIPTION OF THE DRAWING
freq,1,59,120


## Question 1

How many of the descriptions mention an embodiment or that they embody something? That is, in how many of the description does the stem “embod-” appear with any ending? (i.e., embody, embodiment, embodying, etc. should all count).

In [4]:
Patents.loc[(Patents["text"].str.contains(r'embod', case=False, regex=True))]

Unnamed: 0,uuid,patent_id,text
6,hi5rsi8g5wx87octi0c6yqzfs,4491821,A better understanding of the present inventio...
7,0pw76mclimgppe77x6lzmfoa1,4491934,A better understanding of the present inventio...
13,kowiz2l3dzwexqfik80zyk3dh,D276953,A front perspective view with the front top fl...
24,aij6voi71zrp8flvp50ng5zjn,4491267,A preferred embodiment is seen in the attached...
25,6frag6oj4fnsef5yz99cf177d,4491019,A preferred embodiment of the invention is des...
...,...,...,...
8140,yn4e3eaegufv7wfvun77zccp8,4491687,Two embodiments of the invention are described...
8141,b5eu7hz1hz0alu358kxk8arg2,4491066,Two embodiments of the invention will now be d...
8148,f1ueevkbtxs9n4x5tebzbkyo9,4491045,While the specification concludes with claims ...
8153,wpvtafbht8yhm3lcvlc53r7hv,4491167,With the above and other objects and advantage...


In [5]:
Patents["Embody"] = Patents["text"].str.contains(r'embod', case=False, regex=True)

In [6]:
Patents.head()

Unnamed: 0,uuid,patent_id,text,Embody
0,urfl2ulyjgez01g5selfflnz7,4491930,A better understanding of the invention may be...,False
1,nqdxwthotlcted3d961ao373x,4490979,A better understanding of the invention will b...,False
2,9mwinm7as0p0j3245tdxhfuiz,4491969,A better understanding of the invention will b...,False
3,l1n6w0ofqic6yow2t7qwmvqry,4490948,A better understanding of the present inventio...,False
4,86bndneq4omf3mfxi60dzr5mx,4491426,A better understanding of the present inventio...,False


In [7]:
Patents["Embody"].value_counts()

False    6839
True     1317
Name: Embody, dtype: int64

## Question 1 Answer

1317 Cases of “embod-” appear with various endings.

## Question 2

What if we are specifically interested in drawing descriptions that embody an invention? So, here, we would like to identify descriptions that contain phrases like the &quot;embodiment of the present invention, "embodies my invention", etc. How many descriptions have the word  “embod-” (with any ending) followed by "invention", even if the two terms are separated by words?

In [8]:
Patents.loc[(Patents["text"].str.contains('invention', case=False, regex=True))]

Unnamed: 0,uuid,patent_id,text,Embody
0,urfl2ulyjgez01g5selfflnz7,4491930,A better understanding of the invention may be...,False
1,nqdxwthotlcted3d961ao373x,4490979,A better understanding of the invention will b...,False
2,9mwinm7as0p0j3245tdxhfuiz,4491969,A better understanding of the invention will b...,False
3,l1n6w0ofqic6yow2t7qwmvqry,4490948,A better understanding of the present inventio...,False
4,86bndneq4omf3mfxi60dzr5mx,4491426,A better understanding of the present inventio...,False
...,...,...,...,...
8150,es2msvdn5pd8zxnxwrsmb3frd,4491306,While the specification concludes with claims ...,False
8151,bs3gaqy956cbznfcscxl8zrrj,4491812,While the specification concludes with claims ...,False
8153,wpvtafbht8yhm3lcvlc53r7hv,4491167,With the above and other objects and advantage...,True
8154,jwmtihlklmnjnjaok71ngr8ia,4490920,"With the foregoing and other objects in view, ...",True


In [9]:
Patents["invention"] = Patents["text"].str.contains('invention', case=False, regex=True)

In [10]:
Patents.head()

Unnamed: 0,uuid,patent_id,text,Embody,invention
0,urfl2ulyjgez01g5selfflnz7,4491930,A better understanding of the invention may be...,False,True
1,nqdxwthotlcted3d961ao373x,4490979,A better understanding of the invention will b...,False,True
2,9mwinm7as0p0j3245tdxhfuiz,4491969,A better understanding of the invention will b...,False,True
3,l1n6w0ofqic6yow2t7qwmvqry,4490948,A better understanding of the present inventio...,False,True
4,86bndneq4omf3mfxi60dzr5mx,4491426,A better understanding of the present inventio...,False,True


In [11]:
Patentsv1 = Patents[Patents["Embody"] & Patents["invention"]]
Patentsv1.head()

Unnamed: 0,uuid,patent_id,text,Embody,invention
6,hi5rsi8g5wx87octi0c6yqzfs,4491821,A better understanding of the present inventio...,True,True
7,0pw76mclimgppe77x6lzmfoa1,4491934,A better understanding of the present inventio...,True,True
25,6frag6oj4fnsef5yz99cf177d,4491019,A preferred embodiment of the invention is des...,True,True
26,iwrna7u45dat2kj8clsbwct5u,4491391,A preferred embodiment of the invention is sho...,True,True
27,fo6110e56ijjpu5f5x2jygqih,4491418,A preferred embodiment of the invention will n...,True,True


In [13]:
def is_embod_and_invention(text):
    return bool(re.search(r"embod", text) and re.search(r"invention", text) and
                re.search(r"embod", text).start() < re.search(r"invention", text).start())

Patents["is_embod_then_invention"] = Patents["text"].apply(is_embod_and_invention)
Patents

Unnamed: 0,uuid,patent_id,text,Embody,invention,is_embod_then_invention
0,urfl2ulyjgez01g5selfflnz7,4491930,A better understanding of the invention may be...,False,True,False
1,nqdxwthotlcted3d961ao373x,4490979,A better understanding of the invention will b...,False,True,False
2,9mwinm7as0p0j3245tdxhfuiz,4491969,A better understanding of the invention will b...,False,True,False
3,l1n6w0ofqic6yow2t7qwmvqry,4490948,A better understanding of the present inventio...,False,True,False
4,86bndneq4omf3mfxi60dzr5mx,4491426,A better understanding of the present inventio...,False,True,False
...,...,...,...,...,...,...
8151,bs3gaqy956cbznfcscxl8zrrj,4491812,While the specification concludes with claims ...,False,True,False
8152,nyj0iok0vet86cdkt0h70f53r,4491485,"With particular reference to FIG. 2, it is not...",False,False,False
8153,wpvtafbht8yhm3lcvlc53r7hv,4491167,With the above and other objects and advantage...,True,True,False
8154,jwmtihlklmnjnjaok71ngr8ia,4490920,"With the foregoing and other objects in view, ...",True,True,False


In [14]:
Patents.describe()

Unnamed: 0,uuid,patent_id,text,Embody,invention,is_embod_then_invention
count,8156,8156,8156,8156,8156,8156
unique,8156,1096,7722,2,2,2
top,urfl2ulyjgez01g5selfflnz7,4491287,BRIEF DESCRIPTION OF THE DRAWING,False,False,False
freq,1,59,120,6839,5859,7386


In [15]:
Patents["is_embod_then_invention"].value_counts()

False    7386
True      770
Name: is_embod_then_invention, dtype: int64

## Question 2 Answer

770 Cases of “embod-” appear with various endings with invention appearing afterword.