In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import itertools
import ast
from ast import literal_eval
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /nfshome/tc1767/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /nfshome/tc1767/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /nfshome/tc1767/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#output a directory for each patent's city and ipc section matched with its index number so that we can use the 
#index number to extract rows from dataframe for each city and ipc section
def get_directory(df):
    for i in range(len(df)):
        ipc = literal_eval(df['IPCs'][i])
        for j in range(len(ipc)):  
            sec = ipc[j]['ipc_section']
            try:
                asgn = literal_eval(df['assignees'][i])
                for k in range(len(asgn)):
                    yield (i, (asgn[k]['assignee_city'], sec))
            except SyntaxError:
                asgn = df.iloc[i]['assignees']
                for k in range(asgn.count('city')):
                    yield (i, (asgn.split(':')[2+k*3].split(',')[0][3:-1], sec))

In [3]:
#implement dataframe to get corresponding directory
directory = sorted(list(set(get_directory(df))), key=lambda x: x[0])

NameError: name 'df' is not defined

In [7]:
#output a list of tuples with unique values (city or ipc section) and corresponding number of patents
#num: 0-city, 1-ipc section
def count_unique(directory, num):
    counts = {}
    for tup in directory:
        counts[tup[1][num]] = counts.get(tup[1][num], 0) + 1
    return sorted(counts.items(), key=lambda x: x[1], reverse=True)

In [8]:
#df: compete dataframe with all patents
#directory: directory we get from above
#num: 0-city, 1-ipc section
#string: city or ipc section that we want to refine
def df_by_index(df, directory, num, string):
    return df.iloc[list(map(lambda x: x[0], filter(lambda x: x[1][num]==string, directory)))]

In [9]:
def df_by_2index(df, directory, city, field):
    cities = list(map(lambda x: x[0], filter(lambda x: x[1][0]==city, directory)))
    fields = list(map(lambda x: x[0], filter(lambda x: x[1][1]==field, directory)))
    return df.iloc[list(set(cities) & set(fields))]

#### IPC Sections:
A = Human Necessitites,  
B = Performing Operations; Transporint,   
C = Chemistry; Metallurgy,   
D = Textiles; paper,   
E = Fixed Constructing,   
F = Mechanical Engineering; Lighting; Heating; Weapons; Blasting Engines; Pumps,   
G = Physics,   
H = Electricity

In [10]:
def tokenize_words(df):
    stop_words = stopwords.words('english')
    allwords = []
    for i in list(df.index):
        try:
            words = [x.lower() for x in nltk.word_tokenize(df.loc[i]['patent_abstract']) if x.isalnum()]
            allwords.append([word for word in words if not word in stop_words])
        except:
            continue
    return list(itertools.chain.from_iterable(allwords))

In [11]:
def words_dict(df):
    rawcounts = dict(Counter([x for x in tokenize_words(df)]))
    pos = dict(nltk.pos_tag(rawcounts.keys()))    
    d = {}
    for key in rawcounts.keys():
        try:
            d.setdefault(key,[]).append(rawcounts[key])        
        except KeyError:
            pass

        try:
            d.setdefault(key,[]).append(pos[key])          
        except KeyError:
            pass
    return d

### For 2008

In [12]:
pat08 = pd.read_csv("/projects/cps2019_funding/shared/Patents_Data/cityAnalysis/cleaned_patent_years/patents2008_full.csv")

In [13]:
dir08 = sorted(list(set(get_directory(pat08))), key=lambda x: x[0])

In [14]:
top10 = list(map(lambda x: x[0], count_unique(dir08, 0)[:10]))

In [15]:
top10

['Armonk',
 'Houston',
 'San Jose',
 'Santa Clara',
 'Redmond',
 'San Diego',
 'Sunnyvale',
 'Atlanta',
 'Wilmington',
 'Mountain View']

In [16]:
ipcs = list(map(lambda x: x[0], count_unique(dir08, 1)))

In [17]:
ipcs

['G', 'H', 'A', 'B', 'C', None, 'F', 'E', 'D', 'R']

In [18]:
Armonk_G_08 = words_dict(df_by_2index(pat08, dir08, top10[0], ipcs[0]))

In [19]:
sorted(Armonk_G_08.items(),key=lambda x: x[1][0], reverse=True)

[('data', [9296, 'NNS']),
 ('system', [7377, 'NN']),
 ('one', [6189, 'CD']),
 ('method', [5956, 'NN']),
 ('first', [5224, 'RB']),
 ('second', [3549, 'JJ']),
 ('includes', [3441, 'VBZ']),
 ('user', [3240, 'RBR']),
 ('computer', [3069, 'NN']),
 ('least', [2858, 'JJS']),
 ('memory', [2837, 'NN']),
 ('information', [2788, 'NN']),
 ('may', [2687, 'MD']),
 ('plurality', [2487, 'NN']),
 ('set', [2476, 'VBD']),
 ('device', [2470, 'NN']),
 ('processing', [2295, 'NN']),
 ('storage', [2252, 'NN']),
 ('associated', [2245, 'VBN']),
 ('program', [2058, 'NN']),
 ('provided', [2055, 'VBD']),
 ('application', [2049, 'NN']),
 ('based', [2009, 'VBN']),
 ('request', [1528, 'NN']),
 ('server', [1503, 'NN']),
 ('within', [1468, 'IN']),
 ('using', [1461, 'VBG']),
 ('virtual', [1446, 'JJ']),
 ('object', [1404, 'NN']),
 ('access', [1403, 'NN']),
 ('processor', [1351, 'NN']),
 ('process', [1348, 'NN']),
 ('value', [1336, 'NN']),
 ('unit', [1336, 'NN']),
 ('response', [1322, 'NN']),
 ('time', [1314, 'NN']),
 ('c

In [21]:
df_by_2index(pat08, dir08, top10[0], ipcs[0])

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,IPCs,applications,assignees,cited_patents,cpcs,inventors,nbers,patent_abstract,patent_id,patent_type,uspcs,wipos,assignee_city,field,sector,ipc_section
65556,65556,1620,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'12', u'app_id': u'12/189594'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,"[{u'cpc_subgroup_title': u'Accessing, addressi...","[{u'inventor_key_id': u'20711', u'inventor_cit...","[{u'nber_category_title': None, u'nber_subcate...","A method, system, and computer program product...",9430395,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
98325,98325,9200,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'12', u'app_id': u'12/343021'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,[{u'cpc_subgroup_title': u'Digital computing o...,"[{u'inventor_key_id': u'10669', u'inventor_cit...","[{u'nber_category_title': u'Cmp&Cmm', u'nber_s...",A method and system are disclosed for use with...,8744994,utility,[{u'uspc_mainclass_title': u'Data processing: ...,"[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
28,28,28,"[{u'ipc_section': u'G', u'ipc_subclass': u'N',...","[{u'app_type': u'12', u'app_id': u'12/042434'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,[{u'cpc_subgroup_title': u'Digital computing o...,"[{u'inventor_key_id': u'9872', u'inventor_city...","[{u'nber_category_title': None, u'nber_subcate...",An instructional design tool is provided for d...,10095805,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
29,29,29,"[{u'ipc_section': u'G', u'ipc_subclass': u'Q',...","[{u'app_type': u'12', u'app_id': u'12/019004'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,[{u'cpc_subgroup_title': u'Administration; Man...,"[{u'inventor_key_id': u'202058', u'inventor_ci...","[{u'nber_category_title': None, u'nber_subcate...",The present disclosure relates to methods and ...,10095990,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...",[{u'wipo_field_title': u'IT methods for manage...,Armonk,Electrical engineering,Electrical engineering,[u'G']
98334,98334,9209,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'12', u'app_id': u'12/334088'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by applica...,[{u'cpc_subgroup_title': u'Arrangements for pr...,"[{u'inventor_key_id': u'5805', u'inventor_city...","[{u'nber_category_title': u'Cmp&Cmm', u'nber_s...",A method is presented for executing complex op...,8745510,utility,[{u'uspc_mainclass_title': u'Data processing: ...,"[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
30,30,30,"[{u'ipc_section': u'G', u'ipc_subclass': u'Q',...","[{u'app_type': u'12', u'app_id': u'12/103472'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by examine...,"[{u'cpc_subgroup_title': u'Commerce, e.g. shop...","[{u'inventor_key_id': u'9507', u'inventor_city...","[{u'nber_category_title': None, u'nber_subcate...","In various embodiments, the proximity of an av...",10096032,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...",[{u'wipo_field_title': u'IT methods for manage...,Armonk,Electrical engineering,Electrical engineering,[u'G']
35,35,35,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'12', u'app_id': u'12/039690'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by examine...,[{u'cpc_subgroup_title': u'Arrangements for pr...,"[{u'inventor_key_id': u'222902', u'inventor_ci...","[{u'nber_category_title': None, u'nber_subcate...",A method and system for integrated server-stor...,10108460,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
65580,65580,1644,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'12', u'app_id': u'12/181908'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by examine...,[{u'cpc_subgroup_title': u'Arrangements for pr...,"[{u'inventor_key_id': u'1323365', u'inventor_c...","[{u'nber_category_title': None, u'nber_subcate...",Schema information is provided with transmissi...,9448812,utility,"[{u'uspc_mainclass_title': None, u'uspc_sequen...","[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
49,49,49,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'12', u'app_id': u'12/027015'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by other'}],[{u'cpc_subgroup_title': u'Administration; Man...,"[{u'inventor_key_id': u'2412288', u'inventor_c...","[{u'nber_category_title': u'Cmp&Cmm', u'nber_s...",A system is provided for remotely configuring ...,7386595,utility,[{u'uspc_mainclass_title': u'Electrical comput...,[{u'wipo_field_title': u'Digital communication...,Armonk,Electrical engineering,Electrical engineering,[u'G']
50,50,50,"[{u'ipc_section': u'G', u'ipc_subclass': u'F',...","[{u'app_type': u'11', u'app_id': u'11/969413'}]","[{u'assignee_key_id': u'276821', u'assignee_ci...",[{u'cited_patent_category': u'cited by examine...,[{u'cpc_subgroup_title': u'Arrangements for pr...,"[{u'inventor_key_id': u'181551', u'inventor_ci...","[{u'nber_category_title': u'Cmp&Cmm', u'nber_s...",A system for implementing dynamic lifetime rel...,7386851,utility,[{u'uspc_mainclass_title': u'Electrical comput...,"[{u'wipo_field_title': u'Computer technology',...",Armonk,Electrical engineering,Electrical engineering,[u'G']
