In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import itertools
import ast
from ast import literal_eval
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /nfshome/tc1767/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /nfshome/tc1767/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#output a directory for each patent's city and ipc section matched with its index number so that we can use the 
#index number to extract rows from dataframe for each city and ipc section
def get_directory(df):
    for i in range(len(df)):
        ipc = literal_eval(df['IPCs'][i])
        for j in range(len(ipc)):  
            sec = ipc[j]['ipc_section']
            try:
                asgn = literal_eval(df['assignees'][i])
                for k in range(len(asgn)):
                    yield (i, (asgn[k]['assignee_city'], sec))
            except SyntaxError:
                asgn = df.iloc[i]['assignees']
                for k in range(asgn.count('city')):
                    yield (i, (asgn.split(':')[2+k*3].split(',')[0][3:-1], sec))

In [None]:
#implement dataframe to get corresponding directory
directory = sorted(list(set(get_directory(df))), key=lambda x: x[0])

In [3]:
#output a list of tuples with unique values (city or ipc section) and corresponding number of patents
#num: 0-city, 1-ipc section
def count_unique(directory, num):
    counts = {}
    for tup in directory:
        counts[tup[1][num]] = counts.get(tup[1][num], 0) + 1
    return sorted(counts.items(), key=lambda x: x[1], reverse=True)

In [14]:
#df: compete dataframe with all patents
#directory: directory we get from above
#num: 0-city, 1-ipc section
#string: city or ipc section that we want to refine
def df_by_index(df, directory, num, string):
    return df.iloc[list(map(lambda x: x[0], filter(lambda x: x[1][num]==string, directory)))]

### Example using patents in 2008

In [4]:
pat08 = pd.read_csv("/projects/cps2019_funding/shared/Patents_Data/cityAnalysis/cleaned_patent_years/patents2008_full.csv")

#### Create a directory

In [5]:
directory = sorted(list(set(get_directory(pat08))), key=lambda x: x[0])

In [6]:
directory

[(0, (u'Atlanta', u'H')),
 (1, (u'Lake Zurich', u'B')),
 (2, (u'Richmond', u'G')),
 (3, (u'Hayward', u'A')),
 (4, (u'San Jose', u'H')),
 (4, (u'San Jose', u'C')),
 (5, (u'Bagsvaerd', u'C')),
 (5, (u'Franklinton', u'C')),
 (6, (u'San Rafael', u'G')),
 (7, (u'Milford', u'B')),
 (7, (u'Milford', u'G')),
 (7, (u'Milford', u'F')),
 (8, (u'San Jose', u'H')),
 (9, (u'Redlands', u'G')),
 (10, (u'Trophy Club', u'A')),
 (11, (u'Stanford', u'C')),
 (11, (u'Stanford', u'A')),
 (12, (u'San Francisco', u'H')),
 (12, (u'San Francisco', u'G')),
 (13, (u'Washington', u'A')),
 (14, (u'Santa Clara', u'G')),
 (15, (u'San Mateo', u'G')),
 (16, (u'Los Angeles', u'C')),
 (16, (u'Los Angeles', u'G')),
 (17, (u'San Diego', u'H')),
 (18, (u'Atlanta', u'H')),
 (19, (u'Kansas City', u'G')),
 (19, (u'Kansas City', u'B')),
 (20, (u'Austin', u'G')),
 (21, (u'Franklin Lakes', u'A')),
 (22, (u'Cincinnati', u'C')),
 (23, (u'Seattle', u'G')),
 (24, (u'Burbank', u'H')),
 (24, (u'Burbank', u'G')),
 (25, (u'San Diego', u'H

#### Count unique cities

In [7]:
count_unique(directory, 0)

[(u'Armonk', 9656),
 (u'Houston', 3845),
 (u'San Jose', 3513),
 (u'Santa Clara', 2989),
 (u'Redmond', 2853),
 (u'San Diego', 2618),
 (u'Sunnyvale', 2246),
 (u'Atlanta', 1783),
 (u'Wilmington', 1773),
 (u'Mountain View', 1592),
 (u'New York', 1501),
 (u'Detroit', 1492),
 (u'Chicago', 1361),
 (u'Irvine', 1349),
 (u'Cupertino', 1308),
 (u'Schenectady', 1252),
 (u'Austin', 1247),
 (u'St. Louis', 1108),
 (u'Washington', 1082),
 (u'Norwalk', 1082),
 (u'St. Paul', 1009),
 (u'Morristown', 995),
 (u'Dallas', 988),
 (u'Cambridge', 897),
 (u'Milpitas', 884),
 (u'Boise', 839),
 (u'Palo Alto', 810),
 (u'Minneapolis', 785),
 (u'Cincinnati', 756),
 (u'Reno', 712),
 (u'Rochester', 644),
 (u'San Francisco', 640),
 (u'Dearborn', 634),
 (u'Cleveland', 630),
 (u'Basking Ridge', 588),
 (u'Redwood City', 588),
 (u'Indianapolis', 588),
 (u'Boston', 569),
 (u'Oakland', 553),
 (u'Sugar Land', 549),
 (u'Fremont', 537),
 (u'Waltham', 496),
 (u'Seattle', 490),
 (u'Overland Park', 482),
 (u'Auburn Hills', 459),
 (

#### Count unique ipc sections

In [9]:
count_unique(directory, 1)

[(u'G', 41150),
 (u'H', 25955),
 (u'A', 16386),
 (u'B', 12088),
 (u'C', 10787),
 (None, 8537),
 (u'F', 6018),
 (u'E', 2680),
 (u'D', 511),
 (u'R', 1)]

#### Use directory to get desired dataframe

##### city='Armonk'

In [18]:
df_by_index(pat08, directory, 0, "Armonk")

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,IPCs,applications,assignees,cited_patents,cpcs,nbers,patent_abstract,patent_id,patent_type,uspcs,wipos,assignee_city,field,sector,ipc_section
28,28,28,"[{u'ipc_section': u'G', u'ipc_subclass': u'N',...","[{u'app_type': u'12', u'app_id': u'12/042434'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,[{u'cpc_subgroup_title': u'Digital computing o...,"[{u'nber_category_title': None, u'nber_subcate...",An instructional design tool is provided for d...,10095805,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
29,29,29,"[{u'ipc_section': u'G', u'ipc_subclass': u'Q',...","[{u'app_type': u'12', u'app_id': u'12/019004'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,[{u'cpc_subgroup_title': u'Administration; Man...,"[{u'nber_category_title': None, u'nber_subcate...",The present disclosure relates to methods and ...,10095990,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...",[{u'wipo_field_title': u'IT methods for manage...,Armonk,Electrical engineering,Electrical engineering,[u'G']
30,30,30,"[{u'ipc_section': u'G', u'ipc_subclass': u'Q',...","[{u'app_type': u'12', u'app_id': u'12/103472'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by examine...,"[{u'cpc_subgroup_title': u'Commerce, e.g. shop...","[{u'nber_category_title': None, u'nber_subcate...","In various embodiments, the proximity of an av...",10096032,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...",[{u'wipo_field_title': u'IT methods for manage...,Armonk,Electrical engineering,Electrical engineering,[u'G']
35,35,35,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'12', u'app_id': u'12/039690'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by examine...,[{u'cpc_subgroup_title': u'Arrangements for pr...,"[{u'nber_category_title': None, u'nber_subcate...",A method and system for integrated server-stor...,10108460,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
48,48,48,"[{u'ipc_section': u'H', u'ipc_subclass': u'L',...","[{u'app_type': u'12', u'app_id': u'12/031093'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by examine...,[{u'cpc_subgroup_title': u'Devices consisting ...,"[{u'nber_category_title': u'Elec', u'nber_subc...",A method for fabricating silicon-on-insulator ...,7384842,utility,[{u'uspc_mainclass_title': u'Active solid-stat...,"[{u'wipo_field_title': u'Semiconductors', u'wi...",Armonk,Electrical engineering,Electrical engineering,[u'H']
49,49,49,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'12', u'app_id': u'12/027015'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by other'}],[{u'cpc_subgroup_title': u'Administration; Man...,"[{u'nber_category_title': u'Cmp&Cmm', u'nber_s...",A system is provided for remotely configuring ...,7386595,utility,[{u'uspc_mainclass_title': u'Electrical comput...,[{u'wipo_field_title': u'Digital communication...,Armonk,Electrical engineering,Electrical engineering,[u'G']
50,50,50,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'11', u'app_id': u'11/969413'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by examine...,[{u'cpc_subgroup_title': u'Arrangements for pr...,"[{u'nber_category_title': u'Cmp&Cmm', u'nber_s...",A system for implementing dynamic lifetime rel...,7386851,utility,[{u'uspc_mainclass_title': u'Electrical comput...,"[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
51,51,51,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'12', u'app_id': u'12/045406'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by other'}],[{u'cpc_subgroup_title': u'Digital computing o...,"[{u'nber_category_title': u'Cmp&Cmm', u'nber_s...",A scale-out supercomputing environment include...,7389310,utility,"[{u'uspc_mainclass_title': u'Unclassified', u'...","[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
53,53,53,"[{u'ipc_section': u'G', u'ipc_subclass': u'K',...","[{u'app_type': u'11', u'app_id': u'11/971376'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by other'}...,[{u'cpc_subgroup_title': u'Details of televisi...,"[{u'nber_category_title': u'Cmp&Cmm', u'nber_s...",A digital camera system includes an input memb...,7391886,utility,"[{u'uspc_mainclass_title': u'Television', u'us...",[{u'wipo_field_title': u'Audio-visual technolo...,Armonk,Electrical engineering,Electrical engineering,[u'G']
54,54,54,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'11', u'app_id': u'11/970092'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by examine...,[{u'cpc_subgroup_title': u'Administration; Man...,"[{u'nber_category_title': u'Cmp&Cmm', u'nber_s...",A method for providing transparent participati...,7392290,utility,[{u'uspc_mainclass_title': u'Electrical comput...,[{u'wipo_field_title': u'IT methods for manage...,Armonk,Electrical engineering,Electrical engineering,[u'G']


##### ipc_section = 'A'

In [19]:
df_by_index(pat08, directory, 1, "A")

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,IPCs,applications,assignees,cited_patents,cpcs,nbers,patent_abstract,patent_id,patent_type,uspcs,wipos,assignee_city,field,sector,ipc_section
3,3,3,"[{u'ipc_section': u'A', u'ipc_subclass': u'F',...","[{u'app_type': u'12', u'app_id': u'12/112442'}]","[{u'assignee_key_id': u'64836', u'assignee_cit...",[{u'cited_patent_category': u'cited by applica...,[{u'cpc_subgroup_title': u'Surgical instrument...,"[{u'nber_category_title': None, u'nber_subcate...",Various embodiments are directed to base compo...,10022154,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Medical technology', ...",Hayward,Instruments,Instruments,[u'A']
10,10,10,"[{u'ipc_section': u'A', u'ipc_subclass': u'N',...","[{u'app_type': u'12', u'app_id': u'12/449358'}]","[{u'assignee_key_id': u'126165', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,"[{u'cpc_subgroup_title': u'Biocides, pest repe...","[{u'nber_category_title': None, u'nber_subcate...","Novel, environmentally safe compositions for t...",10045542,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...",[{u'wipo_field_title': u'Basic materials chemi...,Trophy Club,Chemistry,Chemistry,[u'A']
11,11,11,"[{u'ipc_section': u'C', u'ipc_subclass': u'K',...","[{u'app_type': u'12', u'app_id': u'12/522528'}]","[{u'assignee_key_id': u'184189', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,[{u'cpc_subgroup_title': u'Radiation therapy -...,"[{u'nber_category_title': None, u'nber_subcate...","Stimulation of target cells using light, e.g.,...",10052497,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Pharmaceuticals', u'w...",Stanford,Chemistry,Chemistry,"[u'A', u'C']"
13,13,13,"[{u'ipc_section': u'A', u'ipc_subclass': u'K',...","[{u'app_type': u'12', u'app_id': u'12/040610'}]","[{u'assignee_key_id': u'209055', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,[{u'cpc_subgroup_title': u'Medicinal preparati...,"[{u'nber_category_title': None, u'nber_subcate...",The present invention relates to a method of t...,10058520,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Pharmaceuticals', u'w...",Washington,Chemistry,Chemistry,[u'A']
21,21,21,"[{u'ipc_section': u'A', u'ipc_subclass': u'B',...","[{u'app_type': u'12', u'app_id': u'12/044469'}]","[{u'assignee_key_id': u'304189', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,"[{u'cpc_subgroup_title': None, u'cpc_category'...","[{u'nber_category_title': None, u'nber_subcate...",A needle assembly is disclosed. The needle ass...,10085680,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Medical technology', ...",Franklin Lakes,Instruments,Instruments,[u'A']
37,37,37,"[{u'ipc_section': u'A', u'ipc_subclass': u'B',...","[{u'app_type': u'12', u'app_id': u'12/102855'}]","[{u'assignee_key_id': u'284213', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,"[{u'cpc_subgroup_title': u'Detecting, measurin...","[{u'nber_category_title': None, u'nber_subcate...",Methods and apparatus for providing data proce...,10111608,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Medical technology', ...",Alameda,Instruments,Instruments,"[u'A', u'G']"
38,38,38,"[{u'ipc_section': u'A', u'ipc_subclass': u'M',...","[{u'app_type': u'12', u'app_id': u'12/047739'}]","[{u'assignee_key_id': u'184189', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,"[{u'cpc_subgroup_title': None, u'cpc_category'...","[{u'nber_category_title': None, u'nber_subcate...",The tissue therapy device includes a sealant l...,10117977,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': None, u'wipo_sector_tit...",Palo Alto,,,[u'A']
42,42,42,"[{u'ipc_section': u'A', u'ipc_subclass': u'B',...","[{u'app_type': u'12', u'app_id': u'12/072208'}]","[{u'assignee_key_id': u'391147', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,"[{u'cpc_subgroup_title': None, u'cpc_category'...","[{u'nber_category_title': None, u'nber_subcate...",An electrode array supports multiple electrode...,10130415,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': None, u'wipo_sector_tit...",Madison,,,[u'A']
45,45,45,"[{u'ipc_section': u'A', u'ipc_subclass': u'K',...","[{u'app_type': u'12', u'app_id': u'12/007902'}]","[{u'assignee_key_id': u'345704', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,"[{u'cpc_subgroup_title': None, u'cpc_category'...","[{u'nber_category_title': None, u'nber_subcate...",The present invention provides ophthalmic form...,10137083,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': None, u'wipo_sector_tit...",Tampa,,,[u'A']
60,60,60,"[{u'ipc_section': u'A', u'ipc_subclass': u'F',...","[{u'app_type': u'11', u'app_id': u'11/968703'}]","[{u'assignee_key_id': u'316699', u'assignee_ci...",[{u'cited_patent_category': u'cited by examine...,[{u'cpc_subgroup_title': u'Tobacco pipes -Hook...,"[{u'nber_category_title': u'Others', u'nber_su...",Disclosed is a hookah containment device that ...,7404405,utility,"[{u'uspc_mainclass_title': u'Tobacco', u'uspc_...",[{u'wipo_field_title': u'Other consumer goods'...,Sterling,Other fields,Other fields,[u'A']


In [1]:
! pwd

/projects/cps2019_funding/shared/Patents_Data/textAnalysis
