# Cleaning Merged Patents Data and Split for Model Training/Testing

### Outline:

- Drop redundant columns
- Rename columns
- Add key features
- Clean University Assignment Features
- Data Dictionary
- Split Data
- Save Data

In [45]:
# Display settings: render every column and up to 200 rows of a DataFrame.
# pandas is imported here as well because this cell sits above the main import
# cell; without it a fresh "Restart & Run All" fails with NameError on `pd`.
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os

In [2]:
#os.getcwd()

In [3]:
# Directory that holds the input CSV. Set the PATENTS_DATA_DIR environment
# variable to point somewhere else; when it is unset the original hard-coded
# location is used, so existing setups keep working unchanged.
file_location = os.environ.get('PATENTS_DATA_DIR', '/Users/chewy2.0/Capstone/data')
# The next cell opens the CSV by bare file name, so make this the working directory.
os.chdir(file_location)

In [4]:
# Load the merged patents table. GEOID is read as a string column, and
# low_memory=False turns off pandas' chunked (lower-memory) parsing of the file.
patents_full = pd.read_csv(
    'PATENTS_DATA_ei.csv',
    low_memory=False,
    dtype={'GEOID': 'str'},
)

In [5]:
# Display the column index of the freshly loaded frame.
patents_full.columns

Index(['patent_number', 'assignee', 'grant_year', 'application_year',
       'application_number', 'GEOID', 'ipc_section', 'team_size', 'inventors',
       'men_inventors', 'women_inventors', 'already_granted',
       'assignee_uni_clean2', 'Institution', 'Control', 'level_r1', 'level_r2',
       'special_focus', 'Perc_Over25_LessNinthGrade',
       'Perc_Over25_SomeHighSchool', 'Perc_Over25_HighSchoolGrad',
       'Perc_Over25_SomeCollege', 'Perc_Over25_Assosciates',
       'Perc_Over25_Bachelors', 'Perc_Over25_Graduate', 'bea_region',
       'Agriculture_Forestry_Fishing_Hunting',
       'Mining_Quarrying_and_Oil_Gas_Extraction', 'Utilities', 'Construction',
       'Manufacturing', 'Wholesale_Trade', 'Retail_Trade',
       'Transportation_Warehousing', 'Information', 'Finance_Insurance',
       'Real_Estate_Rental_Leasing',
       'Professional_Scientific_and_Technical_Services',
       'Management_of_Companies_Enterprises',
       'Administrative_Support_Waste_Management_Remediation

In [6]:
# Display the dtype pandas assigned to each column.
patents_full.dtypes

patent_number                  object
assignee                       object
grant_year                    float64
application_year                int64
application_number              int64
                               ...   
Wage_and_salary_employment      int64
Wages_and_salaries              int64
Population_Estimate             int64
Applications                    int64
Award_Amount                  float64
Length: 98, dtype: object

## Dropping Redundant Columns

In [6]:
# Remove the 'year' and 'inventors' columns from the frame in place.
patents_full.drop(columns=['year', 'inventors'], inplace=True)

## Renaming Columns

In [7]:
# Shorten the verbose source column names. The mapping reads
# old name -> new name and is applied to the frame in place.
patents_full.rename(
    columns={
        # patent identifiers and dates
        'patent_number': 'patent_num',
        'grant_year': 'grant_yr',
        'application_year': 'app_yr',
        'application_number': 'app_num',
        'ipc_section': 'ipc',
        # research-university level flags
        'level_r1': 'r1',
        'level_r2': 'r2',
        # educational attainment shares
        'Perc_Over25_LessNinthGrade': 'Over25_Less9Grade',
        'Perc_Over25_SomeHighSchool': 'Over25_SomeHS',
        'Perc_Over25_HighSchoolGrad': 'Over25_HSGrad',
        'Perc_Over25_SomeCollege': 'Over25_SomeCollege',
        'Perc_Over25_Assosciates': 'Over25_Assosc',
        'Perc_Over25_Bachelors': 'Over25_Bach',
        'Perc_Over25_Graduate': 'Over25_Grad',
        # cleaned university assignee mapping
        'assignee_uni_clean2': 'assignee_univ_map',
    },
    inplace=True,
)

## Creating Necessary Features

### Women Involvement in Patent

In [8]:
# Flag each patent: 1 when it lists at least one woman inventor, otherwise 0.
patents_full['women_involved'] = (patents_full['women_inventors'] > 0).astype(int)

In [9]:
# Number of rows for each value of the new 0/1 flag.
patents_full['women_involved'].value_counts()

0    1303441
1     412278
Name: women_involved, dtype: int64

### Normalize some columns as ratios (THIS IS NOT CENTERING AND SCALING)

#### GDP by labor force

In [10]:
# Fixing an issue with how mhk managed the bea/census enumerations...Should go back and fix in the merges...
# Rows for GEOID '15005' get their labor-force count overwritten with 87051;
# every other row keeps its existing value.
# NOTE(review): the origin of the 87051 figure is not recorded in this notebook - confirm.
patents_full.loc[patents_full['GEOID'] == '15005', 'pop_gt_16_lf'] = 87051
# GDP divided by the labor-force count (a plain ratio, not centered or scaled).
patents_full['GDP_by_labor_force'] = patents_full['GDP']/patents_full['pop_gt_16_lf']

### Annual Payroll by number of establishments

In [11]:
# Annual payroll per establishment: the `ap` column divided by the `est` column.
patents_full['ap_by_est'] = patents_full['ap'].div(patents_full['est'])
# Exploratory checks, left disabled:
# patents_full.hist(column = 'ap_by_est', bins = 100)
# Show all columns in pandas
# pd.set_option('display.max_columns', None)
# patents_full.sort_values(by = 'ap_by_est', ascending = False).head(5)

### Establishments by Labor Force > 16

In [12]:
# Establishments per person in the labor force: `est` divided by `pop_gt_16_lf`.
patents_full['est_by_pop_gt_16_lf'] = patents_full['est'].div(patents_full['pop_gt_16_lf'])
# Exploratory checks, left disabled:
# patents_full.hist(column = 'est_by_pop_gt_16_lf', bins = 100)
# patents_full.boxplot(column = 'est_by_pop_gt_16_lf')
# patents_full.plot.scatter(x = 'est', y = 'pop_gt_16_lf')
# patents_full.sort_values(by = 'est_by_pop_gt_16_lf', ascending = False).head(5)
# patents_full[['est_by_pop_gt_16_lf', 'est', 'pop_gt_16_lf', 'GEOID']].sort_values(by = 'est_by_pop_gt_16_lf', ascending = False).head(5)

In [14]:
# patents_full.plot.scatter(x = 'Pop_Est', y = 'Over25_Grad')
# patents_full.boxplot(column = 'Over25_Grad')
# patents_full.groupby(["GEOID",'Over25_Grad']).size().reset_index().sort_values(by = 'Over25_Grad', ascending = False).head(20)
# sort_values(by = 'Over25_Grad', ascending = False).head(10)

## Cleaning University Assignments

Binary (1/0) flags indicating whether the assignee is a research university, with one flag for each of the three types: `r1`, `r2`, and `special_focus`.

In [13]:
# Missing values in the three university-type columns become 0.
# The result is assigned back to the frame explicitly: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# pandas warns about and which leaves the frame unchanged under Copy-on-Write.
patents_full[['r1','r2','special_focus']] = patents_full[['r1','r2','special_focus']].fillna(0)

In [14]:
# Cast the three university-type indicator columns to integer dtype together.
flag_cols = ['r1', 'r2', 'special_focus']
patents_full[flag_cols] = patents_full[flag_cols].astype('int')

In [17]:
#patents_full.columns

In [18]:
#patents_full.already_granted

In [15]:
# patents_full[['GEOID', 'app_yr', 'Pop_Est' ,'Population_Estimate', 'Total_employment', 'pop_gt_16_lf']]
# Gap between the two population columns: Pop_Est minus Population_Estimate.
patents_full['diff'] = patents_full['Pop_Est'].sub(patents_full['Population_Estimate'])
# Ordered from largest to smallest gap, so the tail shows the rows at the
# bottom of that ordering (missing values, if any, sort last).
patents_full.sort_values(by='diff', ascending=False).tail(5)

Unnamed: 0,patent_num,assignee,grant_yr,app_yr,app_num,GEOID,ipc,team_size,men_inventors,women_inventors,...,Wage_and_salary_employment,Wages_and_salaries,Population_Estimate,Applications,Award_Amount,women_involved,GDP_by_labor_force,ap_by_est,est_by_pop_gt_16_lf,diff
584082,,"HALLIBURTON ENERGY SERVICES, INC.",,2015,15748674,48201,,3,3,0,...,2436066,167629149,4557846,46890,36531223.0,0,165.235782,1352.285046,0.044016,-201484
584083,,"Hewlett-Packard Development Company, L.P.",,2015,15748614,48201,,1,0,1,...,2436066,167629149,4557846,46890,36531223.0,1,165.235782,1352.285046,0.044016,-201484
584084,,"Hewlett-Packard Development Company, L.P.",,2015,15748611,48201,,3,2,1,...,2436066,167629149,4557846,46890,36531223.0,1,165.235782,1352.285046,0.044016,-201484
584085,,"Hewlett-Packard Development Company, L.P.",,2015,15748870,48201,,3,3,0,...,2436066,167629149,4557846,46890,36531223.0,0,165.235782,1352.285046,0.044016,-201484
583570,,"Hewlett-Packard Development Company, L.P.",,2015,14864292,48201,,4,2,2,...,2436066,167629149,4557846,46890,36531223.0,1,165.235782,1352.285046,0.044016,-201484


## Data Dictionary

- patent_num: Patent Number
- assignee: Assignee
- grant_yr: Grant Year
- app_yr: Application Year
- app_num: Application Number
- GEOID: GEOID
- ipc: International Patent Classification (IPC) section(s)
- team_size: Number of inventors
- men_inventors: Number of men inventors for patent
- women_inventors: Number of women inventors for patent
- already_granted: 1/0 classification, 1 is patent has been granted
- assignee_univ_map: If assignee is a research university, this is the official university name it is mapped to
- Institution: If assignee is a research university, the name of the university
- Control: Public or private university
- r1: 1/0 classification, 1 if assignee is r1 research university: Very high research activity
- r2: 1/0 classification, 1 if assignee is r2 research university: High research activity
- special_focus: 1/0 classification, 1 if research university that only awards degrees in one area
- Over25_Less9Grade: % of GEOID over 25 years with less than 9th grade education
- Over25_SomeHS: % of GEOID over 25 years with some high school education
- Over25_HSGrad: % of GEOID over 25 years with high school diploma or equivalent
- Over25_SomeCollege: % of GEOID over 25 years with some college education
- Over25_Assosc: % of GEOID over 25 years with associate's degree
- Over25_Bach: % of GEOID over 25 years with bachelor's degree
- Over25_Grad: % of GEOID over 25 years with graduate degree
- bea_region: Bureau of Economic Analysis Region (https://www.icip.iastate.edu/maps/refmaps/bea)
- North American Industrial Classification (NAICS) Code Location Quotient: (https://www.census.gov/programs-surveys/cbp/data/datasets.html)
  - Agriculture_Forestry_Fishing_Hunting
  - Mining_Quarrying_and_Oil_Gas_Extraction
  - Utilities
  - Construction   
  - Manufacturing  
  - Wholesale_Trade  
  - Retail_Trade
  - Transportation_Warehousing
  - Information
  - Finance_Insurance
  - Real_Estate_Rental_Leasing
  - Professional_Scientific_and_Technical_Services
  - Management_of_Companies_Enterprises
  - Administrative_Support_Waste_Management_Remediation_Services
  - Educational_Services
  - Health_Care_Social_Assistance
  - Arts_Entertainment_and_Recreation 
  - Accommodation_Food_Services
  - Other_Services_except_Public_Administration
  - Agriculture_Forestry_Fishing_Hunting_base (1 in base 0 in non-base)
  - Mining_Quarrying_and_Oil_Gas_Extraction_base (1 in base 0 in non-base)
  - Utilities_base (1 in base 0 in non-base)
  - Construction_base (1 in base 0 in non-base)
  - Manufacturing_base (1 in base 0 in non-base)
  - Wholesale_Trade_base (1 in base 0 in non-base)
  - Retail_Trade_base (1 in base 0 in non-base)
  - Transportation_Warehousing_base (1 in base 0 in non-base)
  - Information_base (1 in base 0 in non-base)
  - Finance_Insurance_base (1 in base 0 in non-base)
  - Real_Estate_Rental_Leasing_base (1 in base 0 in non-base)
  - Professional_Scientific_and_Technical_Services_base (1 in base 0 in non-base)
  - Management_of_Companies_Enterprises_base (1 in base 0 in non-base)
  - Administrative_Support_Waste_Management_Remediation_Services_base (1 in base 0 in non-base)
  - Educational_Services_base  (1 in base 0 in non-base)
  - Health_Care_Social_Assistance_base (1 in base 0 in non-base)
  - Arts_Entertainment_and_Recreation_base (1 in base 0 in non-base)
  - Accommodation_Food_Services_base (1 in base 0 in non-base)
  - Other_Services_except_Public_Administration_base (1 in base 0 in non-base)
- Other County Business Patterns Data (https://www.census.gov/programs-surveys/cbp/data/datasets.html)
  - qp1: County Business Patterns (CBP) 1st Quarter Payroll
  - ap: County Business Patterns (CBP) annual Payroll
  - est: County Business Patterns (CBP) number of establishments
  - ap_by_est: ap/est
  - est_by_pop_gt_16_lf: est/labor force over 16 years
- GDP: Gross Domestic Product (GDP): All industry total in current dollars (Thousands of dollars) (https://apps.bea.gov/itable/iTable.cfm?ReqID=70&step=1&acrdn=5#reqid=70)
- ACS DP03 SELECTED ECONOMIC CHARACTERISTICS (ACS DP03 2010-2022) (https://data.census.gov/cedsci/table?q=labor%20force&t=Employment%20and%20Labor%20Force%20Status&g=0100000US%240500000&tid=ACSDP5Y2020.DP03)
  - pop_gt_16: American Community Survey (ACS) population over 16 years (field:DP03_0001E)
  - pop_gt_16_lf: American Community Survey (ACS) population over 16 years in the labor force (field:DP03_0002E )
  - pop_gt_16_lf_c: American Community Survey (ACS) population over 16 years in the civilian labor force (field:DP03_0003E)
- Pop_Est: American Community Survey (ACS) population estimate (ACS DT5Y 2010-2022 , Field:B01003, Source : https://data.census.gov/cedsci/table?q=population&t=Populations%20and%20People&g=0100000US%240500000&tid=ACSST5Y2020.S0101)
- women_involved: 1/0 classification, 1 if woman is on the team
- Economic Indicators:
  - Earnings_by_place_of_work
  - Employee_and_self-employed_contributions_for_government_social_insurance
  - Employer_contributions_for_employee_pension_and_insurance_funds
  - Employer_contributions_for_government_social_insurance
  - Equals_Net_earnings_by_place_of_residence
  - Farm_income
  - Farm_proprietors_income
  - Less_Contributions_for_government_social_insurance
  - Nonfarm_personal_income
  - Nonfarm_proprietors_income
  - Per_capita_personal_income_dollars
  - Personal_income_thousands_of_dollars
  - Plus_Adjustment_for_residence
  - Plus_Dividends_interest_and_rent
  - Plus_Personal_current_transfer_receipts
  - Population_persons
  - Proprietors_employment
  - Proprietors_income
  - Supplements_to_wages_and_salaries
  - Total_employment
  - Wage_and_salary_employment
  - Wages_and_salaries
  - Population_Estimate
  - Applications
  - Award_Amount

- Categorical:
	- ipc - Multiple IPC sections in one column (e.g. [b,c,d]); this data is only available for granted patents.
	- already_granted - Done
	- Control - Public  = 0 : Private = 1
	- r1
	- r2
	- special_focus
	- bea_region
	- Agriculture_Forestry_Fishing_Hunting_base
	- Mining_Quarrying_and_Oil_Gas_Extraction_base
	- Utilities_base
	- Construction_base
	- Manufacturing_base
	- Wholesale_Trade_base
	- Retail_Trade_base
	- Transportation_Warehousing_base
	- Information_base
	- Finance_Insurance_base
	- Real_Estate_Rental_Leasing_base
	- Professional_Scientific_and_Technical_Services_base
	- Management_of_Companies_Enterprises_base
	- Administrative_Support_Waste_Management_Remediation_Services_base
	- Educational_Services_base
	- Health_Care_Social_Assistance_base
	- Arts_Entertainment_and_Recreation_base
	- Accommodation_Food_Services_base
	- Other_Services_except_Public_Administration_base

- Numeric:
	- team_size
	- men_inventors
	- women_inventors
	- Over25_Less9Grade
	- Over25_SomeHS
	- Over25_HSGrad
	- Over25_SomeCollege
	- Over25_Assosc
	- Over25_Bach
	- Over25_Grad	
	- ap_by_est
	- pop_gt_16_lf
	- est_by_pop_gt_16_lf
	- Total_employment: A count of jobs, both full-time and part-time. It includes wage and salary jobs, sole proprietorships, and individual general partners, but not unpaid family workers nor volunteers.
	- Pop_Est
	- Population_Estimate
	- Population_persons: The number of individuals (both civilian and military) who reside in a given area.
	- Earnings_by_place_of_work: Consists of compensation of employees and proprietors' income.
	- Employee_and_self-employed_contributions_for_government_social_insurance: Consists of the contributions, or payments, by employees, by the self-employed, and by other individuals who participate in the following government programs: old-age, survivors, and disability insurance (Social Security); hospital insurance (Medicare Part A); supplementary medical insurance (Medicare Parts B and D); unemployment insurance; railroad retirement; veterans' life insurance; and temporary disability insurance.
	- Employer_contributions_for_employee_pension_and_insurance_funds: Consists of employer payments to private and government pension plans and to private insurance funds such as for group health and life insurance; workers' compensation; and supplemental unemployment insurance.
	- Employer_contributions_for_government_social_insurance: Consists of employer payments under the following federal, state, and local government programs: old-age, survivors, and disability insurance; hospital insurance; unemployment insurance; railroad retirement; pension benefit guaranty; veterans' life insurance; publicly administered workers' compensation; military medical insurance; and temporary disability insurance.
	- Equals_Net_earnings_by_place_of_residence: Consists of earnings by place of work less contributions for government social insurance plus the adjustment for residence.
	- Farm_income: Consists of wages and salaries, employer contributions for employee pension and insurance funds, and proprietors' income in the farm industry (NAICS subsectors 111-Crop Production and 112-Animal Production). Farm personal income comprises the net personal income of sole proprietors, partners, and hired laborers arising directly from the current production of agricultural commodities, both livestock and crops. It excludes corporate farm income.
	- Farm_proprietors_income: Farm proprietors' income consists of the income that is received by the sole proprietorships and the partnerships that operate farms. It excludes the income that is received by corporate farms.
	- Less_Contributions_for_government_social_insurance
	- Nonfarm_personal_income: Nonfarm personal income is personal income minus farm income.
	- Nonfarm_proprietors_income: Nonfarm Proprietors' Income consists of the income that is received by nonfarm sole proprietorships and partnerships and the income that is received by tax-exempt cooperatives.  The national estimates of nonfarm proprietors' income are primarily derived from income tax data. Because these data do not always reflect current production and because they are incomplete, the estimates also include four major adjustments--the inventory valuation adjustment, the capital consumption adjustment, the "misreporting" adjustment, and the adjustment for the net margins on owner-built housing. The inventory valuation adjustment offsets the effects of the gains and the losses that result from changes in the prices of products withdrawn from inventories; this adjustment for recent years has been small, but it is important to the definition of proprietors' income. The capital consumption adjustment changes the value of the consumption, or depreciation, of fixed capital from the historical-cost basis used in the source data to a replacement-cost basis. The "misreporting" adjustment adds an estimate of the income of sole proprietors and partnerships that is not reported on tax returns. The adjustment for the net margins on owner-built housing is an addition to the estimate for the construction industry. It is the imputed net income of individuals from the construction or renovation of their own dwellings.  The source data necessary to prepare these adjustments are available only at the national level. Therefore, the national estimates of nonfarm proprietors' income that include the adjustments are allocated to states, and these state estimates are allocated to the counties, in proportion to tax return data that do not reflect the adjustments.  
In addition, the national estimates include adjustments made to reflect decreases in monetary and imputed income that result from damage to fixed capital and to inventories that is caused by disasters, such as hurricanes, floods, and earthquakes. These adjustments are attributed to states and counties on the basis of information from the Federal Emergency Management Agency.
	- Per_capita_personal_income_dollars: The personal income of a given area divided by the resident population of the area. See "personal income."
	- Personal_income_thousands_of_dollars: Consists of the income that persons receive in return for their provision of labor, land, and capital used in current production as well as other income, such as personal current transfer receipts. In the state and local personal income accounts the personal income of an area represents the income received by or on behalf of the persons residing in that area. It is calculated as the sum of wages and salaries, supplements to wages and salaries, proprietors' income with inventory valuation (IVA) and capital consumption adjustments (CCAdj), rental income of persons with capital consumption adjustment (CCAdj), personal dividend income, personal interest income, and personal current transfer receipts, less contributions for government social insurance plus the adjustment for residence.
	- Plus_Adjustment_for_residence: An adjustment made to those components of earnings and employee contributions to social insurance programs (income subject to adjustment) that are reported on a place-of-work basis to convert them to a place-of-residence basis reflecting the net flow of income of interarea commuters. For example, the source data for wages and salaries represent the wages paid by the establishments located in an area. The wages and salaries that the establishments of a given area pay to workers who live outside that area are treated as an outflow and the wages and salaries that the residents of that area receive from establishments located outside that area are treated as an inflow. The adjustment for residence for an area, then, is the net of the inflows to that area and the outflows from that area.
	- Plus_Dividends_interest_and_rent: Consists of personal dividend income, personal interest income, and rental income of persons with capital consumption adjustment (CCAdj).
	- Plus_Personal_current_transfer_receipts: Receipts of persons from government and business for which no current services are performed. Current transfer receipts from government include Social Security benefits, medical benefits, veterans' benefits, and unemployment insurance benefits. Current transfer receipts from business include liability payments for personal injury and corporate gifts to nonprofit institutions.	
	- Proprietors_employment: Consists of farm proprietors employment and nonfarm proprietors employment.
	- Proprietors_income
	- Supplements_to_wages_and_salaries: Consists of employer contributions for government social insurance and employer contributions for employee pension and insurance funds.
	- Wage_and_salary_employment: Wage and salary employment, also referred to as wage and salary jobs, measures the average annual number of full-time and part-time jobs in each area by place of work. All jobs for which wages and salaries are paid are counted. Although compensation paid to jurors, expert legal witnesses, prisoners, and justices of the peace (for marriage fees), is counted in wages and salaries, these activities are not counted as jobs in wage and salary employment. Corporate directorships are counted as self-employment. The following description of the sources and methods used in estimating wage and salary employment is divided into two sections: Employment in industries covered by unemployment insurance (UI) programs, and employment in industries not covered by UI.
	- Wages_and_salaries: The remuneration receivable by employees (including corporate officers) from employers for the provision of labor services. It includes commissions, tips, and bonuses; employee gains from exercising stock options; and pay-in-kind. Judicial fees paid to jurors and witnesses are classified as wages and salaries. Wages and salaries are measured before deductions, such as social security contributions, union dues, and voluntary employee contributions to defined contribution pension plans.	
	- Applications: Number of business applications in a year in a county
	- Award_Amount: Amount of federal funding county received for the CDBG
	
- Dependent:
	- women_involved
	- GDP_by_labor_force

## First attempt -> expanded and separated -> melted into rows, but this made them over-represented in the data

In [36]:
#patents_full[['ipc1', 'ipc2', 'ipc3', 'ipc4', 'ipc5', 'ipc6', 'ipc7', 'ipc8']] = patents_full.ipc.str.split(' ', expand = True)
#patents_full

In [35]:
#patents_full.describe(include = 'all')


In [25]:
#cols_to_keep = [name  for name in patents_full.columns if name not in ['ipc', 'ipc1', 'ipc2', 'ipc3', 'ipc4', 'ipc5','ipc6', 'ipc7', 'ipc8']]
#cols_to_keep

In [37]:
# patents_clean = pd.melt(patents_full, id_vars = cols_to_keep+['ipc'], 
#         value_vars = ['ipc1', 'ipc2', 'ipc3', 'ipc4'])
# patents_clean

In [38]:

# patents_with_ipc = patents_clean.query("(ipc.notnull() & value.notnull())", engine = 'python')
# patents_with_ipc

In [39]:
#patents_with_ipc.drop_duplicates(inplace = True)

In [40]:
#patents_with_ipc.shape

In [30]:
#patents_with_ipc.query(" patent_num == '10387698'")

In [41]:
#patents_with_ipc.value.value_counts(dropna = False)

In [42]:
# no_ipcs = patents_clean.query("ipc.isnull()", engine = 'python')\
#              .groupby(['patent_num', 'assignee', 'app_yr', 'app_num', 'ipc'], dropna = False, as_index = False).first()
# no_ipcs

In [43]:
# patents_clean = pd.concat([patents_with_ipc, no_ipcs])
# patents_clean

In [44]:
# #drop ipc & variable
# patents_clean = patents_clean.drop(['ipc', 'variable'], axis = 1)
# #rename value
# patents_clean = patents_clean.rename({'value':'IPC_type'}, axis=1)
# patents_clean

In [35]:
#patents_full = patents_clean.reset_index(drop = True)

## Second try 

- will make a separate boolean column for each of the categories
- will also add a column for the number of ipc sections a patent has

In [48]:
# IPC section letters that may appear in the `ipc` column
ipc_list = list('ABCDEFGH')
ipc_list

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']

In [None]:
# add the indicator columns first -> if ipc is null then the columns are null
# if ipc is not null then 0 / 1 depending on whether the letter is included

In [65]:
# Rows of patents_full whose `ipc` value is present (non-null).
ipcs = patents_full.loc[patents_full['ipc'].notnull()]
ipcs.shape

(945194, 101)

In [72]:
# Add one 0/1 indicator column per IPC section letter (ipc_A ... ipc_H).
# The columns are generated from ipc_list so the set of sections is defined
# in one place instead of eight copy-pasted lines; regex = False makes the
# match a plain substring test on the section letter.
ipcs = ipcs.assign(**{
    f'ipc_{section}': ipcs.ipc.str.contains(section, regex = False).astype('int')
    for section in ipc_list
})
ipcs

Unnamed: 0,patent_num,assignee,grant_yr,app_yr,app_num,GEOID,ipc,team_size,men_inventors,women_inventors,already_granted,assignee_univ_map,Institution,Control,r1,r2,special_focus,Over25_Less9Grade,Over25_SomeHS,Over25_HSGrad,Over25_SomeCollege,Over25_Assosc,Over25_Bach,Over25_Grad,bea_region,Agriculture_Forestry_Fishing_Hunting,Mining_Quarrying_and_Oil_Gas_Extraction,Utilities,Construction,Manufacturing,Wholesale_Trade,Retail_Trade,Transportation_Warehousing,Information,Finance_Insurance,Real_Estate_Rental_Leasing,Professional_Scientific_and_Technical_Services,Management_of_Companies_Enterprises,Administrative_Support_Waste_Management_Remediation_Services,Educational_Services,Health_Care_Social_Assistance,Arts_Entertainment_and_Recreation,Accommodation_Food_Services,Other_Services_except_Public_Administration,qp1,ap,est,Agriculture_Forestry_Fishing_Hunting_base,Mining_Quarrying_and_Oil_Gas_Extraction_base,Utilities_base,Construction_base,Manufacturing_base,Wholesale_Trade_base,Retail_Trade_base,Transportation_Warehousing_base,Information_base,Finance_Insurance_base,Real_Estate_Rental_Leasing_base,Professional_Scientific_and_Technical_Services_base,Management_of_Companies_Enterprises_base,Administrative_Support_Waste_Management_Remediation_Services_base,Educational_Services_base,Health_Care_Social_Assistance_base,Arts_Entertainment_and_Recreation_base,Accommodation_Food_Services_base,Other_Services_except_Public_Administration_base,GDP,pop_gt_16,pop_gt_16_lf,pop_gt_16_lf_c,Pop_Est,Earnings_by_place_of_work,Employee_and_self-employed_contributions_for_government_social_insurance,Employer_contributions_for_employee_pension_and_insurance_funds,Employer_contributions_for_government_social_insurance,Equals_Net_earnings_by_place_of_residence,Farm_income,Farm_proprietors_income,Less_Contributions_for_government_social_insurance,Nonfarm_personal_income,Nonfarm_proprietors_income,Per_capita_personal_income_dollars,Personal_income_thousands_of_dollars,Plus_Adjustment_for_resid
ence,Plus_Dividends_interest_and_rent,Plus_Personal_current_transfer_receipts,Population_persons,Proprietors_employment,Proprietors_income,Supplements_to_wages_and_salaries,Total_employment,Wage_and_salary_employment,Wages_and_salaries,Population_Estimate,Applications,Award_Amount,women_involved,GDP_by_labor_force,ap_by_est,est_by_pop_gt_16_lf,diff,ipc_A,ipc_B,ipc_C,ipc_D,ipc_E,ipc_F,ipc_G,ipc_H
0,7834652,LATTICE SEMICONDUCTOR CORPORATION,2010.0,2010,12709685,41067,H,3,2,1,1,lattice semiconductor corporation,,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,1,124.452907,716.222120,0.050116,-14973,0,0,0,0,0,0,0,1
43,7862449,"NIKE, INC.",2011.0,2010,12775718,41067,A,2,2,0,1,"nike, inc.",,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,1,0,0,0,0,0,0,0
44,7864541,RADISYS CORPORATION,2011.0,2010,12686255,41067,H,4,4,0,1,radisys corporation,,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,0,0,0,0,0,0,0,1
45,7867104,"NIKE, INC.",2011.0,2010,12652592,41067,A,3,3,0,1,"nike, inc.",,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,1,0,0,0,0,0,0,0
46,7868646,LATTICE SEMICONDUCTOR CORPORATION,2011.0,2010,12818544,41067,H,2,2,0,1,lattice semiconductor corporation,,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1715688,8813686,"DAKOTA FISHERIES, INC.",2014.0,2010,12791793,46055,A,1,1,0,1,"dakota fisheries, inc.",,,0,0,0,3.6,3.9,47.2,18.0,7.3,16.1,3.9,Plains,0.000000,0.00000,0.0,0.000000,0.000000,3.172204,1.236753,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.568177,0.000000,1392,6162,86,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,74849,1566,1017,1017,1886,48157,2346,4395,1816,43194,11740,9591,4162,61974,9035,38194,73714,-801,18321,12199,1930,672,18626,6211,1528,856,23320,1931,13,,0,73.597837,71.651163,0.084562,-45,1,0,0,0,0,0,0,0
1715690,9045205,"GLOBAL POLYMER INDUSTRIES, INC.",2015.0,2013,13829492,46079,B E,2,1,1,1,"global polymer industries, inc.",,,0,0,0,4.1,4.1,39.8,19.8,7.4,15.9,8.9,Plains,0.000000,0.00000,0.0,1.074330,2.479011,1.478441,1.059347,0.000000,0.768365,0.644363,0.000000,0.000000,0.0,0.311885,0.000000,1.087989,0.240654,1.053294,0.932061,25640,109071,363,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,558401,9402,6563,6563,11564,365389,19316,32733,13348,357249,67842,65061,32664,540075,77965,50995,607917,24524,153377,97291,11921,2381,143026,46081,7776,5395,176282,11910,266,,1,85.083194,300.471074,0.055310,-346,0,1,0,0,1,0,0,0
1715691,9051134,"DAKOTA ETHANOL, L.L.C.",2015.0,2011,13004065,46079,B,1,1,0,1,"dakota ethanol, l.l.c.",,,0,0,0,5.6,5.1,37.7,19.6,7.0,14.8,10.1,Plains,0.000000,0.00000,0.0,0.892775,2.271091,1.139140,1.124762,0.536272,0.737102,0.608596,0.462769,0.584411,0.0,0.133011,0.000000,1.217579,0.328162,1.194534,0.888004,21228,95874,348,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,524655,9102,6156,6156,11193,355581,14872,31771,12163,359014,73484,71252,27035,491824,83497,48889,565308,30468,118531,87763,11563,2193,154749,43934,7245,5052,156898,11559,85,,0,85.226608,275.500000,0.056530,-366,0,1,0,0,0,0,0,0
1715694,9902575,JEROME I. MACK,2018.0,2015,14965306,46089,A B,2,2,0,1,jerome i. mack,,,0,0,0,14.3,6.3,30.2,24.6,9.2,13.1,2.3,Plains,0.000000,0.00000,0.0,0.706532,1.941441,2.364646,1.476327,0.000000,0.000000,2.660907,0.000000,0.000000,0.0,0.000000,0.000000,0.781450,0.439132,0.420646,2.055536,2156,8800,81,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,137410,1917,1090,1090,2263,65775,2521,3850,1413,65633,34587,32753,3934,76729,8563,46401,111316,3792,26057,19626,2399,699,41316,5263,1351,652,19196,2397,5,,0,126.064220,108.641975,0.074312,-134,1,1,0,0,0,0,0,0


In [94]:
# added the number of ipc sections in case that's meaningful?
# `ipc` holds whitespace-separated section letters (e.g. "B E"), so count the
# tokens directly instead of inferring the count from the string length; this
# stays correct if the spacing is irregular and avoids a float intermediate.
# Every row of `ipcs` has a non-null `ipc`, so the integer cast is safe.
ipcs = ipcs.assign( num_ipcs = ipcs.ipc.str.split().str.len().astype('int'))
ipcs

Unnamed: 0,patent_num,assignee,grant_yr,app_yr,app_num,GEOID,ipc,team_size,men_inventors,women_inventors,already_granted,assignee_univ_map,Institution,Control,r1,r2,special_focus,Over25_Less9Grade,Over25_SomeHS,Over25_HSGrad,Over25_SomeCollege,Over25_Assosc,Over25_Bach,Over25_Grad,bea_region,Agriculture_Forestry_Fishing_Hunting,Mining_Quarrying_and_Oil_Gas_Extraction,Utilities,Construction,Manufacturing,Wholesale_Trade,Retail_Trade,Transportation_Warehousing,Information,Finance_Insurance,Real_Estate_Rental_Leasing,Professional_Scientific_and_Technical_Services,Management_of_Companies_Enterprises,Administrative_Support_Waste_Management_Remediation_Services,Educational_Services,Health_Care_Social_Assistance,Arts_Entertainment_and_Recreation,Accommodation_Food_Services,Other_Services_except_Public_Administration,qp1,ap,est,Agriculture_Forestry_Fishing_Hunting_base,Mining_Quarrying_and_Oil_Gas_Extraction_base,Utilities_base,Construction_base,Manufacturing_base,Wholesale_Trade_base,Retail_Trade_base,Transportation_Warehousing_base,Information_base,Finance_Insurance_base,Real_Estate_Rental_Leasing_base,Professional_Scientific_and_Technical_Services_base,Management_of_Companies_Enterprises_base,Administrative_Support_Waste_Management_Remediation_Services_base,Educational_Services_base,Health_Care_Social_Assistance_base,Arts_Entertainment_and_Recreation_base,Accommodation_Food_Services_base,Other_Services_except_Public_Administration_base,GDP,pop_gt_16,pop_gt_16_lf,pop_gt_16_lf_c,Pop_Est,Earnings_by_place_of_work,Employee_and_self-employed_contributions_for_government_social_insurance,Employer_contributions_for_employee_pension_and_insurance_funds,Employer_contributions_for_government_social_insurance,Equals_Net_earnings_by_place_of_residence,Farm_income,Farm_proprietors_income,Less_Contributions_for_government_social_insurance,Nonfarm_personal_income,Nonfarm_proprietors_income,Per_capita_personal_income_dollars,Personal_income_thousands_of_dollars,Plus_Adjustment_for_resid
ence,Plus_Dividends_interest_and_rent,Plus_Personal_current_transfer_receipts,Population_persons,Proprietors_employment,Proprietors_income,Supplements_to_wages_and_salaries,Total_employment,Wage_and_salary_employment,Wages_and_salaries,Population_Estimate,Applications,Award_Amount,women_involved,GDP_by_labor_force,ap_by_est,est_by_pop_gt_16_lf,diff,ipc_A,ipc_B,ipc_C,ipc_D,ipc_E,ipc_F,ipc_G,ipc_H,num_ipcs
0,7834652,LATTICE SEMICONDUCTOR CORPORATION,2010.0,2010,12709685,41067,H,3,2,1,1,lattice semiconductor corporation,,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,1,124.452907,716.222120,0.050116,-14973,0,0,0,0,0,0,0,1,1
43,7862449,"NIKE, INC.",2011.0,2010,12775718,41067,A,2,2,0,1,"nike, inc.",,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,1,0,0,0,0,0,0,0,1
44,7864541,RADISYS CORPORATION,2011.0,2010,12686255,41067,H,4,4,0,1,radisys corporation,,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,0,0,0,0,0,0,0,1,1
45,7867104,"NIKE, INC.",2011.0,2010,12652592,41067,A,3,3,0,1,"nike, inc.",,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,1,0,0,0,0,0,0,0,1
46,7868646,LATTICE SEMICONDUCTOR CORPORATION,2011.0,2010,12818544,41067,H,2,2,0,1,lattice semiconductor corporation,,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1715688,8813686,"DAKOTA FISHERIES, INC.",2014.0,2010,12791793,46055,A,1,1,0,1,"dakota fisheries, inc.",,,0,0,0,3.6,3.9,47.2,18.0,7.3,16.1,3.9,Plains,0.000000,0.00000,0.0,0.000000,0.000000,3.172204,1.236753,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.568177,0.000000,1392,6162,86,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,74849,1566,1017,1017,1886,48157,2346,4395,1816,43194,11740,9591,4162,61974,9035,38194,73714,-801,18321,12199,1930,672,18626,6211,1528,856,23320,1931,13,,0,73.597837,71.651163,0.084562,-45,1,0,0,0,0,0,0,0,1
1715690,9045205,"GLOBAL POLYMER INDUSTRIES, INC.",2015.0,2013,13829492,46079,B E,2,1,1,1,"global polymer industries, inc.",,,0,0,0,4.1,4.1,39.8,19.8,7.4,15.9,8.9,Plains,0.000000,0.00000,0.0,1.074330,2.479011,1.478441,1.059347,0.000000,0.768365,0.644363,0.000000,0.000000,0.0,0.311885,0.000000,1.087989,0.240654,1.053294,0.932061,25640,109071,363,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,558401,9402,6563,6563,11564,365389,19316,32733,13348,357249,67842,65061,32664,540075,77965,50995,607917,24524,153377,97291,11921,2381,143026,46081,7776,5395,176282,11910,266,,1,85.083194,300.471074,0.055310,-346,0,1,0,0,1,0,0,0,2
1715691,9051134,"DAKOTA ETHANOL, L.L.C.",2015.0,2011,13004065,46079,B,1,1,0,1,"dakota ethanol, l.l.c.",,,0,0,0,5.6,5.1,37.7,19.6,7.0,14.8,10.1,Plains,0.000000,0.00000,0.0,0.892775,2.271091,1.139140,1.124762,0.536272,0.737102,0.608596,0.462769,0.584411,0.0,0.133011,0.000000,1.217579,0.328162,1.194534,0.888004,21228,95874,348,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,524655,9102,6156,6156,11193,355581,14872,31771,12163,359014,73484,71252,27035,491824,83497,48889,565308,30468,118531,87763,11563,2193,154749,43934,7245,5052,156898,11559,85,,0,85.226608,275.500000,0.056530,-366,0,1,0,0,0,0,0,0,1
1715694,9902575,JEROME I. MACK,2018.0,2015,14965306,46089,A B,2,2,0,1,jerome i. mack,,,0,0,0,14.3,6.3,30.2,24.6,9.2,13.1,2.3,Plains,0.000000,0.00000,0.0,0.706532,1.941441,2.364646,1.476327,0.000000,0.000000,2.660907,0.000000,0.000000,0.0,0.000000,0.000000,0.781450,0.439132,0.420646,2.055536,2156,8800,81,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,137410,1917,1090,1090,2263,65775,2521,3850,1413,65633,34587,32753,3934,76729,8563,46401,111316,3792,26057,19626,2399,699,41316,5263,1351,652,19196,2397,5,,0,126.064220,108.641975,0.074312,-134,1,1,0,0,0,0,0,0,2


In [95]:
# Rows of patents_full whose `ipc` value is missing.
no_ipcs = patents_full.loc[patents_full['ipc'].isnull()]


In [96]:
# Stack the rows with IPC sections on top of the rows without them and
# renumber the index from 0. Rows coming from no_ipcs have no ipc_* / num_ipcs
# values, so those columns are NaN for them.
patents_ipc_cols = pd.concat([ipcs, no_ipcs], ignore_index = True)
patents_ipc_cols

Unnamed: 0,patent_num,assignee,grant_yr,app_yr,app_num,GEOID,ipc,team_size,men_inventors,women_inventors,already_granted,assignee_univ_map,Institution,Control,r1,r2,special_focus,Over25_Less9Grade,Over25_SomeHS,Over25_HSGrad,Over25_SomeCollege,Over25_Assosc,Over25_Bach,Over25_Grad,bea_region,Agriculture_Forestry_Fishing_Hunting,Mining_Quarrying_and_Oil_Gas_Extraction,Utilities,Construction,Manufacturing,Wholesale_Trade,Retail_Trade,Transportation_Warehousing,Information,Finance_Insurance,Real_Estate_Rental_Leasing,Professional_Scientific_and_Technical_Services,Management_of_Companies_Enterprises,Administrative_Support_Waste_Management_Remediation_Services,Educational_Services,Health_Care_Social_Assistance,Arts_Entertainment_and_Recreation,Accommodation_Food_Services,Other_Services_except_Public_Administration,qp1,ap,est,Agriculture_Forestry_Fishing_Hunting_base,Mining_Quarrying_and_Oil_Gas_Extraction_base,Utilities_base,Construction_base,Manufacturing_base,Wholesale_Trade_base,Retail_Trade_base,Transportation_Warehousing_base,Information_base,Finance_Insurance_base,Real_Estate_Rental_Leasing_base,Professional_Scientific_and_Technical_Services_base,Management_of_Companies_Enterprises_base,Administrative_Support_Waste_Management_Remediation_Services_base,Educational_Services_base,Health_Care_Social_Assistance_base,Arts_Entertainment_and_Recreation_base,Accommodation_Food_Services_base,Other_Services_except_Public_Administration_base,GDP,pop_gt_16,pop_gt_16_lf,pop_gt_16_lf_c,Pop_Est,Earnings_by_place_of_work,Employee_and_self-employed_contributions_for_government_social_insurance,Employer_contributions_for_employee_pension_and_insurance_funds,Employer_contributions_for_government_social_insurance,Equals_Net_earnings_by_place_of_residence,Farm_income,Farm_proprietors_income,Less_Contributions_for_government_social_insurance,Nonfarm_personal_income,Nonfarm_proprietors_income,Per_capita_personal_income_dollars,Personal_income_thousands_of_dollars,Plus_Adjustment_for_resid
ence,Plus_Dividends_interest_and_rent,Plus_Personal_current_transfer_receipts,Population_persons,Proprietors_employment,Proprietors_income,Supplements_to_wages_and_salaries,Total_employment,Wage_and_salary_employment,Wages_and_salaries,Population_Estimate,Applications,Award_Amount,women_involved,GDP_by_labor_force,ap_by_est,est_by_pop_gt_16_lf,diff,ipc_A,ipc_B,ipc_C,ipc_D,ipc_E,ipc_F,ipc_G,ipc_H,num_ipcs
0,7834652,LATTICE SEMICONDUCTOR CORPORATION,2010.0,2010,12709685,41067,H,3,2,1,1,lattice semiconductor corporation,,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.076330,0.000000,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.000000,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,1,124.452907,716.222120,0.050116,-14973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,7862449,"NIKE, INC.",2011.0,2010,12775718,41067,A,2,2,0,1,"nike, inc.",,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.076330,0.000000,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.000000,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,7864541,RADISYS CORPORATION,2011.0,2010,12686255,41067,H,4,4,0,1,radisys corporation,,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.076330,0.000000,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.000000,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,7867104,"NIKE, INC.",2011.0,2010,12652592,41067,A,3,3,0,1,"nike, inc.",,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.076330,0.000000,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.000000,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,7868646,LATTICE SEMICONDUCTOR CORPORATION,2011.0,2010,12818544,41067,H,2,2,0,1,lattice semiconductor corporation,,,0,0,0,5.0,5.3,17.6,22.5,8.3,26.1,15.2,Far West,1.128043,0.076330,0.000000,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.000000,0.905091,0.900785,0.701960,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,282343,281948,516665,17966052,1084692,1789577,1130559,15547889,101805,21372,2215251,21929695,1449535,41440,22031500,-202912,3414377,3069234,531652,64061,1470907,2920136,313441,249380,13575009,531638,4470,3558868.0,0,124.452907,716.222120,0.050116,-14973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1715714,,"HFW SOLUTIONS, INC.",,2019,16249413,46103,,3,2,0,0,"hfw solutions, inc.",,,0,0,0,1.3,5.1,27.5,23.9,10.6,20.7,10.8,Plains,0.335158,1.047255,3.764740,1.470535,0.618523,0.841453,1.454527,0.542038,0.786652,1.032308,0.828517,0.550699,0.333446,0.461596,0.554630,1.314346,1.031802,1.292662,1.175905,495501,2016745,3770,0,1,1,1,0,0,1,0,0,1,0,0,0,0,0,1,1,1,1,5861546,87474,57792,56885,110685,4111774,259465,510579,211576,3300267,-1376,-4602,471041,6169455,544812,53913,6168079,-340466,1726459,1141353,114407,16208,540210,722155,80521,64313,2849409,113775,1180,509455.0,0,101.424868,534.945623,0.065234,-3090,,,,,,,,,
1715715,,Parwan Electronics Corporation,,2019,16730499,46013,,1,0,0,0,parwan electronics corporation,,,0,0,0,3.0,4.0,31.6,20.2,11.8,21.1,8.3,Plains,0.706049,0.000000,1.199586,0.749512,2.197832,1.415697,1.265150,0.520982,0.514309,1.056086,0.846099,0.301134,0.457576,0.265964,0.926628,1.101582,1.049688,1.086369,0.948585,185589,767226,1251,0,0,1,0,1,1,1,0,0,1,0,0,0,0,0,1,1,1,0,2226958,30911,21466,21433,38915,1512983,91289,177379,71029,1304811,35345,23702,162318,2134172,271627,55796,2169517,-45854,556186,308520,38883,6487,295329,248408,28624,22137,969246,38839,226,,0,103.743501,613.290168,0.058278,76,,,,,,,,,
1715716,,"NOVITA NUTRITION, LLC",,2019,17285460,46011,,4,3,0,0,"novita nutrition, llc",,,0,0,0,1.4,3.6,25.2,19.2,10.2,26.4,14.0,Plains,0.000000,0.240786,0.911292,0.712355,3.728416,0.532536,1.103460,0.448192,0.427719,1.019014,1.039964,0.611556,0.000000,0.132055,0.203060,0.754466,0.809712,1.171469,0.995348,147218,580400,892,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,2130129,28284,20224,20194,34601,1283789,72321,186471,64291,1041978,72782,57569,136612,1581264,98361,46641,1654046,-105199,410107,201961,35463,4197,155930,250762,24920,20723,877097,35077,230,,0,105.326790,650.672646,0.044106,-476,,,,,,,,,
1715717,,NETSEC CONCEPTS LLC,,2019,16362274,46093,,3,2,1,0,netsec concepts llc,,,0,0,0,2.0,3.6,28.7,24.1,14.4,18.9,8.3,Plains,0.000000,0.350217,0.836127,2.226278,0.479316,0.568159,1.327167,2.543280,0.191155,0.561963,0.914501,0.449179,0.000000,0.243072,0.089644,1.694865,0.516121,0.969314,1.354125,61525,266743,703,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,857782,21868,15525,14529,27717,579402,40466,88821,28428,775577,-26348,-30172,68894,1306884,121208,44980,1280536,265069,269932,235027,28469,4731,91036,117249,13378,8647,371117,28332,225,,1,55.251659,379.435277,0.045282,-615,,,,,,,,,


In [108]:
# Rebind patents_full to the concatenated frame that carries the ipc_A..ipc_H
# and num_ipcs columns, so the cells below keep working with `patents_full`.
patents_full = patents_ipc_cols

## Trent's stuff starts here

- separating variables by type 

In [109]:
# Names of the columns treated as categorical: the raw `ipc` string and its
# ipc_A..ipc_H indicator columns, a handful of other flag/label columns, and
# the `<sector>_base` column for every industry sector.
categorical = [
    'ipc',
    *[f'ipc_{section}' for section in 'ABCDEFGH'],
    'already_granted',
    'Control',
    'r1',
    'r2',
    'special_focus',
    'bea_region',
    *[f'{sector}_base' for sector in (
        'Agriculture_Forestry_Fishing_Hunting',
        'Mining_Quarrying_and_Oil_Gas_Extraction',
        'Utilities',
        'Construction',
        'Manufacturing',
        'Wholesale_Trade',
        'Retail_Trade',
        'Transportation_Warehousing',
        'Information',
        'Finance_Insurance',
        'Real_Estate_Rental_Leasing',
        'Professional_Scientific_and_Technical_Services',
        'Management_of_Companies_Enterprises',
        'Administrative_Support_Waste_Management_Remediation_Services',
        'Educational_Services',
        'Health_Care_Social_Assistance',
        'Arts_Entertainment_and_Recreation',
        'Accommodation_Food_Services',
        'Other_Services_except_Public_Administration',
    )],
]

# Names of the columns treated as numeric (these are the columns that get
# standardized further down). Built from grouped sub-lists; the resulting
# order is the same as a single flat list.
numeric = (
    # Patent-level counts
    ['num_ipcs', 'team_size', 'men_inventors', 'women_inventors']
    # Over25_* columns
    + ['Over25_' + level for level in
       ('Less9Grade', 'SomeHS', 'HSGrad', 'SomeCollege', 'Assosc', 'Bach', 'Grad')]
    # Ratio, labor-force, employment and population columns
    + ['ap_by_est', 'pop_gt_16_lf', 'est_by_pop_gt_16_lf', 'Total_employment',
       'Pop_Est', 'Population_Estimate', 'Population_persons']
    # Earnings, income and contribution columns
    + ['Earnings_by_place_of_work',
       'Employee_and_self-employed_contributions_for_government_social_insurance',
       'Employer_contributions_for_employee_pension_and_insurance_funds',
       'Employer_contributions_for_government_social_insurance',
       'Equals_Net_earnings_by_place_of_residence',
       'Farm_income',
       'Farm_proprietors_income',
       'Less_Contributions_for_government_social_insurance',
       'Nonfarm_personal_income',
       'Nonfarm_proprietors_income',
       'Per_capita_personal_income_dollars',
       'Personal_income_thousands_of_dollars',
       'Plus_Adjustment_for_residence',
       'Plus_Dividends_interest_and_rent',
       'Plus_Personal_current_transfer_receipts',
       'Proprietors_employment',
       'Proprietors_income',
       'Supplements_to_wages_and_salaries',
       'Wage_and_salary_employment',
       'Wages_and_salaries']
    # Remaining county-level columns
    + ['Applications', 'Award_Amount', 'GDP_by_labor_force']
)

## Split Data

- Consider splitting the data into a training and test set randomly, not by time for Logistic Regression.
- Consider grouping by county, year and then randomly splitting data into train/test.

In [110]:
### Train & Validation Data: Application Years 2010-2017 (both ends inclusive)
patents_train_val = patents_full.loc[patents_full['app_yr'].between(2010, 2017)]
patents_train_val.shape

(1591150, 110)

In [111]:
## Test Data: Application Years 2018-2019 (both ends inclusive)

patents_test_val = patents_full.loc[patents_full['app_yr'].between(2018, 2019)]
patents_test_val.shape

(124569, 110)

## Scale and Center (Standardize) Train Data in the context of a pipeline.


In [112]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Work on a copy so that `patents_train_val` keeps the original, unscaled
# values; only `train` is overwritten by the standardization step.
# (The scaler itself is created and fitted in the next cell, so it is not
# instantiated here.)
train = patents_train_val.copy()

In [113]:
# Fit the scaler on the training rows only, then replace the numeric columns
# of `train` with their standardized values. `scaler` stays bound to the
# fitted StandardScaler and `features` to the scaled array.
scaler = StandardScaler()
features = scaler.fit_transform(train[numeric].values)
train[numeric] = features

In [114]:
# Preview the first five rows of the standardized training frame.
train.head()

Unnamed: 0,patent_num,assignee,grant_yr,app_yr,app_num,GEOID,ipc,team_size,men_inventors,women_inventors,already_granted,assignee_univ_map,Institution,Control,r1,r2,special_focus,Over25_Less9Grade,Over25_SomeHS,Over25_HSGrad,Over25_SomeCollege,Over25_Assosc,Over25_Bach,Over25_Grad,bea_region,Agriculture_Forestry_Fishing_Hunting,Mining_Quarrying_and_Oil_Gas_Extraction,Utilities,Construction,Manufacturing,Wholesale_Trade,Retail_Trade,Transportation_Warehousing,Information,Finance_Insurance,Real_Estate_Rental_Leasing,Professional_Scientific_and_Technical_Services,Management_of_Companies_Enterprises,Administrative_Support_Waste_Management_Remediation_Services,Educational_Services,Health_Care_Social_Assistance,Arts_Entertainment_and_Recreation,Accommodation_Food_Services,Other_Services_except_Public_Administration,qp1,ap,est,Agriculture_Forestry_Fishing_Hunting_base,Mining_Quarrying_and_Oil_Gas_Extraction_base,Utilities_base,Construction_base,Manufacturing_base,Wholesale_Trade_base,Retail_Trade_base,Transportation_Warehousing_base,Information_base,Finance_Insurance_base,Real_Estate_Rental_Leasing_base,Professional_Scientific_and_Technical_Services_base,Management_of_Companies_Enterprises_base,Administrative_Support_Waste_Management_Remediation_Services_base,Educational_Services_base,Health_Care_Social_Assistance_base,Arts_Entertainment_and_Recreation_base,Accommodation_Food_Services_base,Other_Services_except_Public_Administration_base,GDP,pop_gt_16,pop_gt_16_lf,pop_gt_16_lf_c,Pop_Est,Earnings_by_place_of_work,Employee_and_self-employed_contributions_for_government_social_insurance,Employer_contributions_for_employee_pension_and_insurance_funds,Employer_contributions_for_government_social_insurance,Equals_Net_earnings_by_place_of_residence,Farm_income,Farm_proprietors_income,Less_Contributions_for_government_social_insurance,Nonfarm_personal_income,Nonfarm_proprietors_income,Per_capita_personal_income_dollars,Personal_income_thousands_of_dollars,Plus_Adjustment_for_resid
ence,Plus_Dividends_interest_and_rent,Plus_Personal_current_transfer_receipts,Population_persons,Proprietors_employment,Proprietors_income,Supplements_to_wages_and_salaries,Total_employment,Wage_and_salary_employment,Wages_and_salaries,Population_Estimate,Applications,Award_Amount,women_involved,GDP_by_labor_force,ap_by_est,est_by_pop_gt_16_lf,diff,ipc_A,ipc_B,ipc_C,ipc_D,ipc_E,ipc_F,ipc_G,ipc_H,num_ipcs
0,7834652,LATTICE SEMICONDUCTOR CORPORATION,2010.0,2010,12709685,41067,H,0.003283,-0.279788,1.034332,1,lattice semiconductor corporation,,,0,0,0,-0.300986,-0.6683,-0.732322,1.081626,0.937232,0.61756,-0.255381,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.70196,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,-0.607708,281948,-0.603084,-0.768141,-0.730388,-0.751363,-0.736896,-0.792005,0.179634,-0.211473,-0.736789,-0.789977,-0.674211,-0.929987,-0.789411,0.367527,-0.85077,-0.606752,-0.604693,-0.624054,-0.675387,-0.75,-0.692295,-0.708948,-0.773832,-0.604407,-0.63909,-0.469308,1,-0.387222,-0.740226,-0.123591,-14973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.64189
1,7862449,"NIKE, INC.",2011.0,2010,12775718,41067,A,-0.481316,-0.279788,-0.479849,1,"nike, inc.",,,0,0,0,-0.300986,-0.6683,-0.732322,1.081626,0.937232,0.61756,-0.255381,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.70196,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,-0.607708,281948,-0.603084,-0.768141,-0.730388,-0.751363,-0.736896,-0.792005,0.179634,-0.211473,-0.736789,-0.789977,-0.674211,-0.929987,-0.789411,0.367527,-0.85077,-0.606752,-0.604693,-0.624054,-0.675387,-0.75,-0.692295,-0.708948,-0.773832,-0.604407,-0.63909,-0.469308,0,-0.387222,-0.740226,-0.123591,-14973,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.64189
2,7864541,RADISYS CORPORATION,2011.0,2010,12686255,41067,H,0.487883,0.830779,-0.479849,1,radisys corporation,,,0,0,0,-0.300986,-0.6683,-0.732322,1.081626,0.937232,0.61756,-0.255381,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.70196,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,-0.607708,281948,-0.603084,-0.768141,-0.730388,-0.751363,-0.736896,-0.792005,0.179634,-0.211473,-0.736789,-0.789977,-0.674211,-0.929987,-0.789411,0.367527,-0.85077,-0.606752,-0.604693,-0.624054,-0.675387,-0.75,-0.692295,-0.708948,-0.773832,-0.604407,-0.63909,-0.469308,0,-0.387222,-0.740226,-0.123591,-14973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.64189
3,7867104,"NIKE, INC.",2011.0,2010,12652592,41067,A,0.003283,0.275496,-0.479849,1,"nike, inc.",,,0,0,0,-0.300986,-0.6683,-0.732322,1.081626,0.937232,0.61756,-0.255381,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.70196,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,-0.607708,281948,-0.603084,-0.768141,-0.730388,-0.751363,-0.736896,-0.792005,0.179634,-0.211473,-0.736789,-0.789977,-0.674211,-0.929987,-0.789411,0.367527,-0.85077,-0.606752,-0.604693,-0.624054,-0.675387,-0.75,-0.692295,-0.708948,-0.773832,-0.604407,-0.63909,-0.469308,0,-0.387222,-0.740226,-0.123591,-14973,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.64189
4,7868646,LATTICE SEMICONDUCTOR CORPORATION,2011.0,2010,12818544,41067,H,-0.481316,-0.279788,-0.479849,1,lattice semiconductor corporation,,,0,0,0,-0.300986,-0.6683,-0.732322,1.081626,0.937232,0.61756,-0.255381,Far West,1.128043,0.07633,0.0,1.026322,1.227994,2.381687,0.975335,0.374296,1.403992,0.902561,1.256738,1.056455,0.0,0.905091,0.900785,0.70196,0.743885,0.729915,0.764135,2574212,10134543,14150,1,0,0,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,35138407,397577,-0.607708,281948,-0.603084,-0.768141,-0.730388,-0.751363,-0.736896,-0.792005,0.179634,-0.211473,-0.736789,-0.789977,-0.674211,-0.929987,-0.789411,0.367527,-0.85077,-0.606752,-0.604693,-0.624054,-0.675387,-0.75,-0.692295,-0.708948,-0.773832,-0.604407,-0.63909,-0.469308,0,-0.387222,-0.740226,-0.123591,-14973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.64189


## Save Data

In [115]:
# Write the standardized training frame. `train` is the copy whose numeric
# columns were rescaled above; `patents_train_val` still holds the raw values,
# so saving it under this filename would not match the file's name.
train.to_csv('train_standardized.csv')

# The held-out 2018-2019 rows are written exactly as split.
# NOTE(review): the fitted `scaler` is not applied to the test rows anywhere in
# this section -- confirm that downstream code scales them with the same
# training-set statistics before scoring.
patents_test_val.to_csv('patents_full_test.csv')