# Silver Layer (1) - Data Cleaning

In [1]:
# Stop <pre> output from wrapping so wide tables get a horizontal scroll bar
from IPython.display import HTML, display

display(
    HTML("<style>pre {white-space:pre !important;}</style> ")
)

In [2]:
from pyspark.sql.functions import lit
import findspark
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.types import StringType

In [3]:
# Let findspark set up the environment for the local Spark installation.
# NOTE(review): pyspark is already imported in the cell above, so this call
# runs after the import it is normally placed in front of -- confirm whether
# it is still required.
findspark.init()

# Reuse the active SparkSession if one exists, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [4]:
# Load the raw patent records; multiLine allows a JSON record to span several lines
patents = spark.read.option("multiLine", True).json('raw-data/patents.json')

In [5]:
patents.show()

+---------------------------------+--------------------+----------------+---------------------+------------------+-----------------------+------------------------+--------------+-----------+----------------------+-----------------------+----------+--------------------+--------------------+-------------+----------+------+--------------------+
|                    abstract_text|          applicants|application date|assignee_name_current|assignee_name_orig|backward_cite_no_family|backward_cite_yes_family|          code|filing_date|forward_cite_no_family|forward_cite_yes_family|grant_date|       inventor_name|                link|priority_date|  pub_date|source|               title|
+---------------------------------+--------------------+----------------+---------------------+------------------+-----------------------+------------------------+--------------+-----------+----------------------+-----------------------+----------+--------------------+--------------------+-------------+--------

In [6]:
patents.printSchema()

root
 |-- abstract_text: string (nullable = true)
 |-- applicants: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- application date: string (nullable = true)
 |-- assignee_name_current: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- assignee_name_orig: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- backward_cite_no_family: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- patent_number: string (nullable = true)
 |    |    |-- priority_date: string (nullable = true)
 |    |    |-- pub_date: string (nullable = true)
 |-- backward_cite_yes_family: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- patent_number: string (nullable = true)
 |    |    |-- priority_date: string (nullable = true)
 |    |    |-- pub_date: string (nullable = true)
 |-- code: string (nullable = true)
 |-- filing_date: string (nullable = true)
 |-- forward

In [12]:
patents.show(2)

+---------------------------------+--------------------+----------------+---------------------+------------------+-----------------------+------------------------+--------------+-----------+----------------------+-----------------------+----------+--------------------+--------------------+-------------+----------+------+--------------------+
|                    abstract_text|          applicants|application date|assignee_name_current|assignee_name_orig|backward_cite_no_family|backward_cite_yes_family|          code|filing_date|forward_cite_no_family|forward_cite_yes_family|grant_date|       inventor_name|                link|priority_date|  pub_date|source|               title|
+---------------------------------+--------------------+----------------+---------------------+------------------+-----------------------+------------------------+--------------+-----------+----------------------+-----------------------+----------+--------------------+--------------------+-------------+--------

In [48]:
# Expose the DataFrame to spark.sql(...) queries under the view name "patents".
patents.createOrReplaceTempView("patents")

In [76]:
# Row count per patent code, restricted to codes that occur MORE than twice
# (HAVING COUNT(*) > 2).
# NOTE(review): a code that occurs exactly twice is not reported here --
# confirm the threshold is intentional given the "duplicate" naming.
duplicate_counts = spark.sql("""
    SELECT code,count(*) AS count
    FROM patents
    GROUP BY code
    HAVING COUNT(*) > 2
""")

CN107918755A

In [77]:
duplicate_counts.show(100)

+--------------+-----+
|          code|count|
+--------------+-----+
|WO2022105607A1|    3|
|  CN107609009A|    3|
|  CN112818008A|    3|
|  CN111047094A|    3|
|  CN109308411B|    3|
|  CN109871660B|    3|
|  CN111537515B|    3|
|  CN115240093A|    6|
|  CN112364641A|    3|
|  CN114092769A|    4|
|  CN115018804A|    3|
|  CN112383052B|    3|
|  CN113868497A|    3|
|  CN111115727A|    3|
|  CN113362959A|    3|
| JP2019057160A|    4|
|  CN111143447A|    4|
|WO2022100357A1|    3|
|  TW202111613A|    3|
|  CN112580331A|    3|
|  CN114119058B|    3|
|  CN115563859A|    4|
|  CN116933175A|    3|
|  CN111325858B|    4|
|  CN115455811A|    3|
|  US10690646B2|   16|
|  CN107918755A|    3|
|WO2021164137A1|    3|
|  CN110717403B|    3|
|  CN110728151A|    3|
|  CN112101789A|    3|
|   RU2310237C1|    3|
|  CN110263794B|    3|
|  CN113034593A|    3|
|   JP6953508B2|    3|
|  CN114998934A|    4|
|  CN114462624A|    3|
| KR102095555B1|    3|
|WO2021103492A1|    5|
|  CN109741748A|    3|
|  CN115238

In [65]:
# Register the per-code counts as the SQL view "duplicated_ids".
# NOTE(review): none of the queries visible in this notebook read this view.
duplicate_counts.createOrReplaceTempView("duplicated_ids")

In [73]:
# Rows whose source is 'OPS', limited to codes that occur more than twice in
# the "patents" view (the sub-query repeats the COUNT(*) > 2 grouping).
ops_patents = spark.sql("""
    SELECT * 
    FROM patents 
    WHERE patents.code IN ( SELECT code 
                            FROM patents
                            GROUP BY code
                            HAVING COUNT(*) > 2)
    AND source = 'OPS'
""")


In [74]:
ops_patents.show()

+--------------------+--------------------+----------------+---------------------+------------------+-----------------------+------------------------+------------+-----------+----------------------+-----------------------+----------+--------------------+--------------------+-------------+----------+------+--------------------+
|       abstract_text|          applicants|application date|assignee_name_current|assignee_name_orig|backward_cite_no_family|backward_cite_yes_family|        code|filing_date|forward_cite_no_family|forward_cite_yes_family|grant_date|       inventor_name|                link|priority_date|  pub_date|source|               title|
+--------------------+--------------------+----------------+---------------------+------------------+-----------------------+------------------------+------------+-----------+----------------------+-----------------------+----------+--------------------+--------------------+-------------+----------+------+--------------------+
|The inventio

In [86]:
# Pull every row for the single code CN107918755A so its repeated entries
# can be inspected side by side.
t = spark.sql("""
    SELECT *
    FROM patents
    where code ='CN107918755A'
""")

In [88]:
t.show(truncate=True)

+--------------------+--------------------+----------------+---------------------+--------------------+-----------------------+------------------------+------------+-----------+----------------------+-----------------------+----------+-------------------------------+--------------------+-------------+----------+-------------+--------------------+
|       abstract_text|          applicants|application date|assignee_name_current|  assignee_name_orig|backward_cite_no_family|backward_cite_yes_family|        code|filing_date|forward_cite_no_family|forward_cite_yes_family|grant_date|                  inventor_name|                link|priority_date|  pub_date|       source|               title|
+--------------------+--------------------+----------------+---------------------+--------------------+-----------------------+------------------------+------------+-----------+----------------------+-----------------------+----------+-------------------------------+--------------------+-------------+

In [99]:
patents.count()

8426

In [101]:
# Remove the 'google patent' rows of CN107918755A and keep every other row.
#
# The condition uses the null-safe equality operator (<=>), which always
# yields TRUE or FALSE. The previous form
#     code != '...' OR source != '...'
# evaluates to NULL when one column is NULL and the other comparison is
# false, and a NULL predicate makes WHERE discard the row. With <=> only the
# rows that match BOTH values are removed; rows with a NULL code or source
# are kept.
patents = spark.sql("""
    SELECT *
    FROM patents
    WHERE NOT (code <=> 'CN107918755A' AND source <=> 'google patent')
""")

In [100]:
patents.count()

8424

In [104]:
patents.show(5)

+---------------------------------+--------------------+----------------+---------------------+------------------+-----------------------+------------------------+--------------+-----------+----------------------+-----------------------+----------+--------------------+--------------------+-------------+----------+------+--------------------+
|                    abstract_text|          applicants|application date|assignee_name_current|assignee_name_orig|backward_cite_no_family|backward_cite_yes_family|          code|filing_date|forward_cite_no_family|forward_cite_yes_family|grant_date|       inventor_name|                link|priority_date|  pub_date|source|               title|
+---------------------------------+--------------------+----------------+---------------------+------------------+-----------------------+------------------------+--------------+-----------+----------------------+-----------------------+----------+--------------------+--------------------+-------------+--------

In [112]:
# Keep a single row per patent code.
# NOTE(review): no ordering is applied before this call, so which of the
# rows sharing a code survives is not controlled here -- confirm that any
# row is acceptable (e.g. no preferred source).
patents = patents.dropDuplicates(['code'])

In [113]:
patents.count()

7938

In [114]:
patents.printSchema()

root
 |-- abstract_text: string (nullable = true)
 |-- applicants: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- application date: string (nullable = true)
 |-- assignee_name_current: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- assignee_name_orig: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- backward_cite_no_family: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- patent_number: string (nullable = true)
 |    |    |-- priority_date: string (nullable = true)
 |    |    |-- pub_date: string (nullable = true)
 |-- backward_cite_yes_family: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- patent_number: string (nullable = true)
 |    |    |-- priority_date: string (nullable = true)
 |    |    |-- pub_date: string (nullable = true)
 |-- code: string (nullable = true)
 |-- filing_date: string (nullable = true)
 |-- forward

#### Add the country column

In [118]:
from pyspark.sql.functions import substring

In [119]:
# Exploratory attempt: take the first two characters of the code as the country.
# The result is only inspected with printSchema(); the "country" column that is
# kept is built further down with regexp_extract instead.
df_with = patents.withColumn("country", substring(patents["code"], 1, 2))

In [121]:
df_with.printSchema()

root
 |-- abstract_text: string (nullable = true)
 |-- applicants: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- application date: string (nullable = true)
 |-- assignee_name_current: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- assignee_name_orig: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- backward_cite_no_family: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- patent_number: string (nullable = true)
 |    |    |-- priority_date: string (nullable = true)
 |    |    |-- pub_date: string (nullable = true)
 |-- backward_cite_yes_family: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- patent_number: string (nullable = true)
 |    |    |-- priority_date: string (nullable = true)
 |    |    |-- pub_date: string (nullable = true)
 |-- code: string (nullable = true)
 |-- filing_date: string (nullable = true)
 |-- forward

In [123]:
from pyspark.sql.functions import regexp_extract

In [130]:
# Derive "country" as the run of ASCII letters at the very start of the code
# (group 0 = the whole match of the anchored pattern).
patents = patents.withColumn(
    "country",
    regexp_extract("code", "^[a-zA-Z]+", 0),
)

In [131]:
patents.count()

7938

In [132]:
patents.show()

+--------------------+--------------------+----------------+---------------------+--------------------+-----------------------+------------------------+--------------+-----------+----------------------+-----------------------+----------+--------------------------+--------------------+-------------+----------+-------------+--------------------+-------+
|       abstract_text|          applicants|application date|assignee_name_current|  assignee_name_orig|backward_cite_no_family|backward_cite_yes_family|          code|filing_date|forward_cite_no_family|forward_cite_yes_family|grant_date|             inventor_name|                link|priority_date|  pub_date|       source|               title|country|
+--------------------+--------------------+----------------+---------------------+--------------------+-----------------------+------------------------+--------------+-----------+----------------------+-----------------------+----------+--------------------------+--------------------+-------

In [135]:
# Collapse the DataFrame to a single partition ahead of the write below.
patents_1 = patents.coalesce(1)

In [136]:
# Persist the cleaned patents as JSON. Spark creates a directory at this path.
# The save mode is set to "overwrite" so the notebook can be re-run from a
# fresh kernel: with the default mode the write fails as soon as the target
# path already exists from a previous run.
patents_1.write.mode("overwrite").json('transformed-data/cleaned_patents.json')

### Next step: data transformation