# Cleaning Merged Patents Data and Splitting It for Model Training/Testing

### Outline:

- Drop redundant columns
- Rename columns
- Add key features
- Clean University Assignment Features
- Data Dictionary
- Split Data
- Save Data

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os



In [2]:
# Directory that holds the merged patents data. Set the PATENTS_DATA_DIR
# environment variable to point somewhere else; when it is unset, the
# original author's local path is used as the default (change as necessary).
file_location = os.environ.get(
    'PATENTS_DATA_DIR',
    r'C:\Users\micha\Documents\GitHub\patents22\data\merged',
)
# Later cells read and write CSV files by bare filename, i.e. relative to
# this working directory.
os.chdir(file_location)

In [3]:
# Load the merged patents table. GEOID is read as text rather than as a
# number (presumably to keep leading zeros in the county codes — confirm).
patents_full = pd.read_csv(
    'PATENTS_DATA.csv',
    low_memory=False,
    dtype={'GEOID': 'str'},
)

In [4]:
# List every column name in the freshly loaded frame.
patents_full.columns

Index(['patent_number', 'assignee', 'grant_year', 'application_year',
       'application_number', 'GEOID', 'ipc_section', 'team_size', 'inventors',
       'men_inventors', 'women_inventors', 'already_granted',
       'assignee_uni_clean2', 'Institution', 'Control', 'level_r1', 'level_r2',
       'special_focus', 'Perc_Over25_LessNinthGrade',
       'Perc_Over25_SomeHighSchool', 'Perc_Over25_HighSchoolGrad',
       'Perc_Over25_SomeCollege', 'Perc_Over25_Assosciates',
       'Perc_Over25_Bachelors', 'Perc_Over25_Graduate', 'bea_region',
       'Agriculture_Forestry_Fishing_Hunting',
       'Mining_Quarrying_and_Oil_Gas_Extraction', 'Utilities', 'Construction',
       'Manufacturing', 'Wholesale_Trade', 'Retail_Trade',
       'Transportation_Warehousing', 'Information', 'Finance_Insurance',
       'Real_Estate_Rental_Leasing',
       'Professional_Scientific_and_Technical_Services',
       'Management_of_Companies_Enterprises',
       'Administrative_Support_Waste_Management_Remediation

In [5]:
# Show the dtype pandas assigned to each column.
patents_full.dtypes

patent_number                  object
assignee                       object
grant_year                    float64
application_year                int64
application_number              int64
                               ...   
Wage_and_salary_employment    float64
Wages_and_salaries            float64
Population_Estimate           float64
Applications                  float64
Award_Amount                  float64
Length: 98, dtype: object

## Dropping Redundant Columns

In [6]:
# Remove the columns this notebook treats as redundant.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' makes the cell safe to re-run: a second execution no longer
# raises KeyError because the columns are already gone.
patents_full = patents_full.drop(columns=['year', 'inventors'], errors='ignore')

## Renaming Columns

In [7]:
# Map the verbose source column names to the shorter names used in the rest
# of the notebook. Columns not listed here keep their original names.
column_renames = {
    'patent_number': 'patent_num',
    'grant_year': 'grant_yr',
    'application_year': 'app_yr',
    'application_number': 'app_num',
    'ipc_section': 'ipc',
    'level_r1': 'r1',
    'level_r2': 'r2',
    'Perc_Over25_LessNinthGrade': 'Over25_Less9Grade',
    'Perc_Over25_SomeHighSchool': 'Over25_SomeHS',
    'Perc_Over25_HighSchoolGrad': 'Over25_HSGrad',
    'Perc_Over25_SomeCollege': 'Over25_SomeCollege',
    'Perc_Over25_Assosciates': 'Over25_Assosc',
    'Perc_Over25_Bachelors': 'Over25_Bach',
    'Perc_Over25_Graduate': 'Over25_Grad',
    'assignee_uni_clean2': 'assignee_univ_map',
}
patents_full.rename(columns=column_renames, inplace=True)

## Creating Necessary Features

### Women Involvement in Patent

In [8]:
# Flag patents with at least one woman inventor:
# 1 when women_inventors > 0, otherwise 0.
has_woman_inventor = patents_full['women_inventors'] > 0
patents_full['women_involved'] = has_woman_inventor.astype(int)

In [9]:
# Count patents with (1) and without (0) a woman inventor.
patents_full['women_involved'].value_counts()

0    1303467
1     412290
Name: women_involved, dtype: int64

### Normalize some columns (ratio features — this is NOT centering and scaling)

#### GDP by labor force

In [10]:
# Fixing an issue with how mhk managed the bea/census enumerations...Should go back and fix in the merges...
patents_full['pop_gt_16_lf'] = np.where(patents_full['GEOID'] == '15005', 87051, patents_full['pop_gt_16_lf'])
patents_full.query("GEOID != '15005'" )['pop_gt_16_lf']
patents_full['GDP_by_labor_force'] = patents_full['GDP']/patents_full['pop_gt_16_lf']

### Annual Payroll by Number of Establishments

In [11]:
# Annual payroll per establishment: ap divided element-wise by est.
patents_full['ap_by_est'] = patents_full['ap'].div(patents_full['est'])

# Optional diagnostics, left disabled (uncomment to explore the new feature):
# patents_full.hist(column = 'ap_by_est', bins = 100)
# pd.set_option('display.max_columns', None)  # show all columns in pandas
# patents_full.sort_values(by = 'ap_by_est', ascending = False).head(5)

### Establishments by Labor Force (Population Over 16)

In [28]:
# Establishments per person in the labor force: est divided element-wise by
# pop_gt_16_lf.
patents_full['est_by_pop_gt_16_lf'] = patents_full['est'].div(patents_full['pop_gt_16_lf'])

# Optional diagnostics, left disabled (uncomment to explore the new feature):
# patents_full.hist(column = 'est_by_pop_gt_16_lf', bins = 100)
# patents_full.boxplot(column = 'est_by_pop_gt_16_lf')
# patents_full.plot.scatter(x = 'est', y = 'pop_gt_16_lf')
# patents_full.sort_values(by = 'est_by_pop_gt_16_lf', ascending = False).head(5)
# patents_full[['est_by_pop_gt_16_lf', 'est', 'pop_gt_16_lf', 'GEOID']].sort_values(by = 'est_by_pop_gt_16_lf', ascending = False).head(5)

Unnamed: 0,patent_num,assignee,grant_yr,app_yr,app_num,GEOID,ipc,team_size,men_inventors,women_inventors,already_granted,assignee_univ_map,Institution,Control,r1,r2,special_focus,Over25_Less9Grade,Over25_SomeHS,Over25_HSGrad,Over25_SomeCollege,Over25_Assosc,Over25_Bach,Over25_Grad,bea_region,Agriculture_Forestry_Fishing_Hunting,Mining_Quarrying_and_Oil_Gas_Extraction,Utilities,Construction,Manufacturing,Wholesale_Trade,Retail_Trade,Transportation_Warehousing,Information,Finance_Insurance,Real_Estate_Rental_Leasing,Professional_Scientific_and_Technical_Services,Management_of_Companies_Enterprises,Administrative_Support_Waste_Management_Remediation_Services,Educational_Services,Health_Care_Social_Assistance,Arts_Entertainment_and_Recreation,Accommodation_Food_Services,Other_Services_except_Public_Administration,qp1,ap,est,Agriculture_Forestry_Fishing_Hunting_base,Mining_Quarrying_and_Oil_Gas_Extraction_base,Utilities_base,Construction_base,Manufacturing_base,Wholesale_Trade_base,Retail_Trade_base,Transportation_Warehousing_base,Information_base,Finance_Insurance_base,Real_Estate_Rental_Leasing_base,Professional_Scientific_and_Technical_Services_base,Management_of_Companies_Enterprises_base,Administrative_Support_Waste_Management_Remediation_Services_base,Educational_Services_base,Health_Care_Social_Assistance_base,Arts_Entertainment_and_Recreation_base,Accommodation_Food_Services_base,Other_Services_except_Public_Administration_base,GDP,pop_gt_16,pop_gt_16_lf,pop_gt_16_lf_c,Pop_Est,Earnings_by_place_of_work,Employee_and_self-employed_contributions_for_government_social_insurance,Employer_contributions_for_employee_pension_and_insurance_funds,Employer_contributions_for_government_social_insurance,Equals_Net_earnings_by_place_of_residence,Farm_income,Farm_proprietors_income,Less_Contributions_for_government_social_insurance,Nonfarm_personal_income,Nonfarm_proprietors_income,Per_capita_personal_income_dollars,Personal_income_thousands_of_dollars,Plus_Adjustment_for_resid
ence,Plus_Dividends_interest_and_rent,Plus_Personal_current_transfer_receipts,Population_persons,Proprietors_employment,Proprietors_income,Supplements_to_wages_and_salaries,Total_employment,Wage_and_salary_employment,Wages_and_salaries,Population_Estimate,Applications,Award_Amount,women_involved,GDP_by_labor_force,ap_by_est,est_by_pop_gt_16_lf
1129670,,"Wisconsin Film & Bag, Inc.",,2013,14063045,55078,,6,5,0,0,"wisconsin film & bag, inc.",,,,,,4.1,7.9,47.1,17.9,8.9,9.8,4.3,Great Lakes,0.0,0.0,0.0,0.772533,2.270449,1.006049,1.169815,0.708381,0.320765,0.562964,0.405885,0.291603,0.0,0.206873,0.321632,0.964554,0.0,0.999034,1.085172,61244,263268,898,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,158755,3165,1672,1664,4292,118650.0,6227.0,32688.0,5206.0,57630.0,0.0,0.0,11433.0,121452.0,6447.0,27703.0,121452.0,-49587.0,20560.0,43262.0,4384.0,163.0,6447.0,37894.0,2468.0,2305.0,74309.0,4383.0,15.0,,0,94.949163,293.171492,0.537081
1129669,,"Wisconsin Film & Bag, Inc.",,2013,14029951,55078,,5,4,0,0,"wisconsin film & bag, inc.",,,,,,4.1,7.9,47.1,17.9,8.9,9.8,4.3,Great Lakes,0.0,0.0,0.0,0.772533,2.270449,1.006049,1.169815,0.708381,0.320765,0.562964,0.405885,0.291603,0.0,0.206873,0.321632,0.964554,0.0,0.999034,1.085172,61244,263268,898,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,158755,3165,1672,1664,4292,118650.0,6227.0,32688.0,5206.0,57630.0,0.0,0.0,11433.0,121452.0,6447.0,27703.0,121452.0,-49587.0,20560.0,43262.0,4384.0,163.0,6447.0,37894.0,2468.0,2305.0,74309.0,4383.0,15.0,,0,94.949163,293.171492,0.537081
1129668,9398941.0,MOFA GROUP LLC,2016.0,2013,13911618,55078,A B,1,1,0,1,mofa group llc,,,,,,4.1,7.9,47.1,17.9,8.9,9.8,4.3,Great Lakes,0.0,0.0,0.0,0.772533,2.270449,1.006049,1.169815,0.708381,0.320765,0.562964,0.405885,0.291603,0.0,0.206873,0.321632,0.964554,0.0,0.999034,1.085172,61244,263268,898,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,158755,3165,1672,1664,4292,118650.0,6227.0,32688.0,5206.0,57630.0,0.0,0.0,11433.0,121452.0,6447.0,27703.0,121452.0,-49587.0,20560.0,43262.0,4384.0,163.0,6447.0,37894.0,2468.0,2305.0,74309.0,4383.0,15.0,,0,94.949163,293.171492,0.537081
1129667,9138749.0,"WISCONSIN FILM & BAG, INC.",2015.0,2013,14063045,55078,B,6,4,1,1,"wisconsin film & bag, inc.",,,,,,4.1,7.9,47.1,17.9,8.9,9.8,4.3,Great Lakes,0.0,0.0,0.0,0.772533,2.270449,1.006049,1.169815,0.708381,0.320765,0.562964,0.405885,0.291603,0.0,0.206873,0.321632,0.964554,0.0,0.999034,1.085172,61244,263268,898,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,158755,3165,1672,1664,4292,118650.0,6227.0,32688.0,5206.0,57630.0,0.0,0.0,11433.0,121452.0,6447.0,27703.0,121452.0,-49587.0,20560.0,43262.0,4384.0,163.0,6447.0,37894.0,2468.0,2305.0,74309.0,4383.0,15.0,,1,94.949163,293.171492,0.537081
1129666,8820666.0,"WISCONSIN FILM & BAG, INC.",2014.0,2013,13796143,55078,B,6,4,1,1,"wisconsin film & bag, inc.",,,,,,4.1,7.9,47.1,17.9,8.9,9.8,4.3,Great Lakes,0.0,0.0,0.0,0.772533,2.270449,1.006049,1.169815,0.708381,0.320765,0.562964,0.405885,0.291603,0.0,0.206873,0.321632,0.964554,0.0,0.999034,1.085172,61244,263268,898,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,158755,3165,1672,1664,4292,118650.0,6227.0,32688.0,5206.0,57630.0,0.0,0.0,11433.0,121452.0,6447.0,27703.0,121452.0,-49587.0,20560.0,43262.0,4384.0,163.0,6447.0,37894.0,2468.0,2305.0,74309.0,4383.0,15.0,,1,94.949163,293.171492,0.537081


## Cleaning University Assignments

Binary (1/0) indicators for whether the assignee is a research university, one for each of three types of research university: R1, R2, and special focus.

In [None]:
# Missing university flags become 0.
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on a selected column: that form is chained
# assignment, which emits a FutureWarning on pandas 2.x and leaves the frame
# unchanged under Copy-on-Write.
university_flag_cols = ['r1', 'r2', 'special_focus']
patents_full[university_flag_cols] = patents_full[university_flag_cols].fillna(0)

In [None]:
# Cast the three university indicator columns to integer dtype.
university_flag_cols = ['r1', 'r2', 'special_focus']
flags_as_int = patents_full[university_flag_cols].astype('int')
patents_full[university_flag_cols] = flags_as_int

In [None]:
# List the column names again after the renames and the added features.
patents_full.columns

In [None]:
# Display the already_granted column.
patents_full['already_granted']

In [39]:
# Compare the two population estimates carried in the frame
# (Pop_Est minus Population_Estimate).
# The original cell sorted by a 'diff' column whose creation was commented
# out, so it raised KeyError on a fresh kernel. The difference is now computed
# here and attached with assign(), which returns a new frame, so patents_full
# itself does not gain a 'diff' column.
# With a descending sort, any NaN differences are placed last and therefore
# show up in tail(5).
# patents_full[['GEOID', 'app_yr', 'Pop_Est' ,'Population_Estimate', 'Total_employment', 'pop_gt_16_lf']]
population_gap = patents_full['Pop_Est'] - patents_full['Population_Estimate']
patents_full.assign(diff=population_gap).sort_values(by='diff', ascending=False).tail(5)




Unnamed: 0,patent_num,assignee,grant_yr,app_yr,app_num,GEOID,ipc,team_size,men_inventors,women_inventors,already_granted,assignee_univ_map,Institution,Control,r1,r2,special_focus,Over25_Less9Grade,Over25_SomeHS,Over25_HSGrad,Over25_SomeCollege,Over25_Assosc,Over25_Bach,Over25_Grad,bea_region,Agriculture_Forestry_Fishing_Hunting,Mining_Quarrying_and_Oil_Gas_Extraction,Utilities,Construction,Manufacturing,Wholesale_Trade,Retail_Trade,Transportation_Warehousing,Information,Finance_Insurance,Real_Estate_Rental_Leasing,Professional_Scientific_and_Technical_Services,Management_of_Companies_Enterprises,Administrative_Support_Waste_Management_Remediation_Services,Educational_Services,Health_Care_Social_Assistance,Arts_Entertainment_and_Recreation,Accommodation_Food_Services,Other_Services_except_Public_Administration,qp1,ap,est,Agriculture_Forestry_Fishing_Hunting_base,Mining_Quarrying_and_Oil_Gas_Extraction_base,Utilities_base,Construction_base,Manufacturing_base,Wholesale_Trade_base,Retail_Trade_base,Transportation_Warehousing_base,Information_base,Finance_Insurance_base,Real_Estate_Rental_Leasing_base,Professional_Scientific_and_Technical_Services_base,Management_of_Companies_Enterprises_base,Administrative_Support_Waste_Management_Remediation_Services_base,Educational_Services_base,Health_Care_Social_Assistance_base,Arts_Entertainment_and_Recreation_base,Accommodation_Food_Services_base,Other_Services_except_Public_Administration_base,GDP,pop_gt_16,pop_gt_16_lf,pop_gt_16_lf_c,Pop_Est,Earnings_by_place_of_work,Employee_and_self-employed_contributions_for_government_social_insurance,Employer_contributions_for_employee_pension_and_insurance_funds,Employer_contributions_for_government_social_insurance,Equals_Net_earnings_by_place_of_residence,Farm_income,Farm_proprietors_income,Less_Contributions_for_government_social_insurance,Nonfarm_personal_income,Nonfarm_proprietors_income,Per_capita_personal_income_dollars,Personal_income_thousands_of_dollars,Plus_Adjustment_for_resid
ence,Plus_Dividends_interest_and_rent,Plus_Personal_current_transfer_receipts,Population_persons,Proprietors_employment,Proprietors_income,Supplements_to_wages_and_salaries,Total_employment,Wage_and_salary_employment,Wages_and_salaries,Population_Estimate,Applications,Award_Amount,women_involved,GDP_by_labor_force,ap_by_est,est_by_pop_gt_16_lf,diff
1714745,,"ADVENTURE PRODUCTS, INC.",,2012,13419239,15005,,5,5,0,0,"adventure products, inc.",,,,,,4.2,6.0,33.7,24.6,8.2,16.2,7.2,Far West,0.0,0.0,1.37907,0.868287,0.171008,0.39962,1.13788,1.367791,0.36952,0.248807,1.868415,0.355106,0.0,0.72745,0.439712,0.612537,1.78159,3.292359,0.96116,498927,2056053,4343,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,0,7612076,66,87051,54,66,,,,,,,,,,,,,,,,,,,,,,,,,,0,87.443866,473.417684,0.04989,
1714746,8353968.0,"KING FAMILY KINGETICS, LLC",2013.0,2010,12919320,15005,A,2,2,0,1,"king family kingetics, llc",,,,,,4.6,7.0,33.3,22.7,7.7,17.1,7.6,Far West,0.0,0.0,1.367302,0.984683,0.194582,0.432979,1.202,1.394714,0.349856,0.258421,2.045549,0.334334,0.155699,0.825059,0.460276,0.680276,1.804049,3.416506,1.009152,453422,1933215,4332,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,6939000,74,87051,60,74,,,,,,,,,,,,,,,,,,,,,,,,,,0,79.711893,446.26385,0.049764,
1714747,8719353.0,SEASEER RESEARCH AND DEVELOPMENT LLC,2014.0,2010,12807115,15005,G,3,2,0,1,seaseer research and development llc,,,,,,4.6,7.0,33.3,22.7,7.7,17.1,7.6,Far West,0.0,0.0,1.367302,0.984683,0.194582,0.432979,1.202,1.394714,0.349856,0.258421,2.045549,0.334334,0.155699,0.825059,0.460276,0.680276,1.804049,3.416506,1.009152,453422,1933215,4332,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,6939000,74,87051,60,74,,,,,,,,,,,,,,,,,,,,,,,,,,0,79.711893,446.26385,0.049764,
1714748,,SeaSeer Research and Development LLC,,2010,12807115,15005,,3,1,1,0,seaseer research and development llc,,,,,,4.6,7.0,33.3,22.7,7.7,17.1,7.6,Far West,0.0,0.0,1.367302,0.984683,0.194582,0.432979,1.202,1.394714,0.349856,0.258421,2.045549,0.334334,0.155699,0.825059,0.460276,0.680276,1.804049,3.416506,1.009152,453422,1933215,4332,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,6939000,74,87051,60,74,,,,,,,,,,,,,,,,,,,,,,,,,,1,79.711893,446.26385,0.049764,
1714749,10398091.0,"MAUI GREENS, INC.",2019.0,2018,15948797,15005,A B,1,1,0,1,"maui greens, inc.",,,,,,3.6,4.4,31.8,23.7,9.9,18.0,8.6,Far West,0.071111,0.0,1.448131,1.091185,0.154837,0.478044,1.210943,1.424565,0.368122,0.247322,1.944841,0.368181,0.102444,0.769532,0.481855,0.698334,1.913674,3.127558,0.981248,686117,2831617,4749,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,1,1,0,10473501,71,87051,57,75,,,,,,,,,,,,,,,,,,,,,,,,,,0,120.31454,596.255422,0.054554,


## Data Dictionary

- patent_num: Patent Number
- assignee: Assignee
- grant_yr: Grant Year
- app_yr: Application Year
- app_num: Application Number
- GEOID: GEOID
- ipc: International Patent Classification (IPC) section
- team_size: Number of inventors
- men_inventors: Number of men inventors for patent
- women_inventors: Number of women inventors for patent
- already_granted: 1/0 classification, 1 is patent has been granted
- assignee_univ_map: If the assignee is a research university, the official university name it is mapped to
- Institution: If the assignee is a research university, the name of the university
- Control: Public or private university
- r1: 1/0 classification, 1 if assignee is r1 research university: Very high research activity
- r2: 1/0 classification, 1 if assignee is r2 research university: High research activity
- special_focus: 1/0 classification, 1 if research university that only awards degrees in one area
- Over25_Less9Grade: % of GEOID over 25 years with less than 9th grade education
- Over25_SomeHS: % of GEOID over 25 years with some high school education
- Over25_HSGrad: % of GEOID over 25 years with high school diploma or equivalent
- Over25_SomeCollege: % of GEOID over 25 years with some college education
- Over25_Assosc: % of GEOID over 25 years with associate's degree
- Over25_Bach: % of GEOID over 25 years with bachelor's degree
- Over25_Grad: % of GEOID over 25 years with graduate degree
- bea_region: Bureau of Economic Analysis Region (https://www.icip.iastate.edu/maps/refmaps/bea)
- North American Industrial Classification (NAICS) Code Location Quotient: (https://www.census.gov/programs-surveys/cbp/data/datasets.html)
  - Agriculture_Forestry_Fishing_Hunting
  - Mining_Quarrying_and_Oil_Gas_Extraction
  - Utilities
  - Construction   
  - Manufacturing  
  - Wholesale_Trade  
  - Retail_Trade
  - Transportation_Warehousing
  - Information
  - Finance_Insurance
  - Real_Estate_Rental_Leasing
  - Professional_Scientific_and_Technical_Services
  - Management_of_Companies_Enterprises
  - Administrative_Support_Waste_Management_Remediation_Services
  - Educational_Services
  - Health_Care_Social_Assistance
  - Arts_Entertainment_and_Recreation 
  - Accommodation_Food_Services
  - Other_Services_except_Public_Administration
  - Agriculture_Forestry_Fishing_Hunting_base (1 in base 0 in non-base)
  - Mining_Quarrying_and_Oil_Gas_Extraction_base (1 in base 0 in non-base)
  - Utilities_base (1 in base 0 in non-base)
  - Construction_base (1 in base 0 in non-base)
  - Manufacturing_base (1 in base 0 in non-base)
  - Wholesale_Trade_base (1 in base 0 in non-base)
  - Retail_Trade_base (1 in base 0 in non-base)
  - Transportation_Warehousing_base (1 in base 0 in non-base)
  - Information_base (1 in base 0 in non-base)
  - Finance_Insurance_base (1 in base 0 in non-base)
  - Real_Estate_Rental_Leasing_base (1 in base 0 in non-base)
  - Professional_Scientific_and_Technical_Services_base (1 in base 0 in non-base)
  - Management_of_Companies_Enterprises_base (1 in base 0 in non-base)
  - Administrative_Support_Waste_Management_Remediation_Services_base (1 in base 0 in non-base)
  - Educational_Services_base  (1 in base 0 in non-base)
  - Health_Care_Social_Assistance_base (1 in base 0 in non-base)
  - Arts_Entertainment_and_Recreation_base (1 in base 0 in non-base)
  - Accommodation_Food_Services_base (1 in base 0 in non-base)
  - Other_Services_except_Public_Administration_base (1 in base 0 in non-base)
- Other County Business Patterns Data (https://www.census.gov/programs-surveys/cbp/data/datasets.html)
  - qp1: County Business Patterns (CBP) 1st Quarter Payroll
  - ap: County Business Patterns (CBP) annual Payroll
  - est: County Business Patterns (CBP) number of establishments
  - ap_by_est: ap/est
  - est_by_pop_gt_16_lf: est/labor force over 16 years
- GDP: Gross Domestic Product (GDP): All industry total in current dollars (Thousands of dollars) (https://apps.bea.gov/itable/iTable.cfm?ReqID=70&step=1&acrdn=5#reqid=70)
- ACS DP03 SELECTED ECONOMIC CHARACTERISTICS (ACS DP03 2010-2022) (https://data.census.gov/cedsci/table?q=labor%20force&t=Employment%20and%20Labor%20Force%20Status&g=0100000US%240500000&tid=ACSDP5Y2020.DP03)
  - pop_gt_16: American Community Survey (ACS) population over 16 years (field:DP03_0001E)
  - pop_gt_16_lf: American Community Survey (ACS) population over 16 years in the labor force (field:DP03_0002E )
  - pop_gt_16_lf_c: American Community Survey (ACS) population over 16 years in the civilian labor force (field:DP03_0003E)
- Pop_Est: American Community Survey (ACS) population estimate (ACS DT5Y 2010-2022 , Field:B01003, Source : https://data.census.gov/cedsci/table?q=population&t=Populations%20and%20People&g=0100000US%240500000&tid=ACSST5Y2020.S0101)
- women_involved: 1/0 classification, 1 if woman is on the team
- Economic Indicators:
  - Earnings_by_place_of_work
  - Employee_and_self-employed_contributions_for_government_social_insurance
  - Employer_contributions_for_employee_pension_and_insurance_funds
  - Employer_contributions_for_government_social_insurance
  - Equals_Net_earnings_by_place_of_residence
  - Farm_income
  - Farm_proprietors_income
  - Less_Contributions_for_government_social_insurance
  - Nonfarm_personal_income
  - Nonfarm_proprietors_income
  - Per_capita_personal_income_dollars
  - Personal_income_thousands_of_dollars
  - Plus_Adjustment_for_residence
  - Plus_Dividends_interest_and_rent
  - Plus_Personal_current_transfer_receipts
  - Population_persons
  - Proprietors_employment
  - Proprietors_income
  - Supplements_to_wages_and_salaries
  - Total_employment
  - Wage_and_salary_employment
  - Wages_and_salaries
  - Population_Estimate
  - Applications
  - Award_Amount

## Split Data

- Consider splitting the data into a training and test set randomly, not by time for Logistic Regression.
- Consider grouping by county, year and then randomly splitting data into train/test.

In [None]:
# Train & validation data: application years 2010 through 2017, both inclusive.
in_train_val_window = patents_full['app_yr'].between(2010, 2017)
patents_train_val = patents_full[in_train_val_window]
patents_train_val.shape

In [None]:
# Test data: application years 2018 through 2019, both inclusive.
in_test_window = patents_full['app_yr'].between(2018, 2019)
patents_test_val = patents_full[in_test_window]
patents_test_val.shape

## Save Data

In [None]:
# Write each split to its own CSV in the current working directory.
# The DataFrame index is written too (to_csv default).
splits_to_save = (
    (patents_train_val, 'patents_full_train.csv'),
    (patents_test_val, 'patents_full_test.csv'),
)
for split_frame, csv_name in splits_to_save:
    split_frame.to_csv(csv_name)

In [None]:
# Print the name of every column in the training split, one per line.
# Iterating a DataFrame directly yields its column labels.
for column_name in patents_train_val:
    print(column_name)