## Notebook Plan

This notebook runs logistic regressions across the years 2001 - 2012 and outputs plots displaying said regressions.

1. Read in Data
2. Define Functions
3. Plot Outputs
4. Create Coefficient Values DataFrame

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statistics
import collections
from IPython.display import display

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.graphics.api import abline_plot


from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_predict, GridSearchCV, cross_val_score, train_test_split, KFold
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
%matplotlib inline


## 1. Read in Data
- Dataset: `data/regression_ready_df.csv`

In [63]:
# Read the regression-ready dataset and discard the 'Unnamed: 0' column in one step.
df_for_regressions = (
    pd.read_csv('data/regression_ready_df.csv')
    .drop(columns=['Unnamed: 0'])
)

In [64]:
# Distinct values of the Year column, in order of first appearance; the functions
# defined later in this notebook iterate over this module-level list.
years = list(df_for_regressions['Year'].unique())

In [65]:
# Remove the columns that are not used as regression features.
df_for_regressions = df_for_regressions.drop(
    columns=[
        'award_sum',
        'company_count',
        'creative_employees',
        'creative_establishments',
        'performance_amount',
        'performance_count',
        'recipient_amount',
        'recipient_count',
        'regular_employees',
        'regular_establishments',
        'percent_creative_class',
        'total_earned_bachelor',
        'total_earned_graduate_degree',
        'total_foreign',
        'total_less_than_bachelor',
        'total_native',
        'total_population',
        'scaled_population',
    ]
)

In [66]:
# Keep a single row per (city_state, Year) pair, preferring the last occurrence.
df_for_regressions = df_for_regressions.drop_duplicates(
    subset=['city_state', 'Year'],
    keep='last',
)

## 2. Define Regression and Plot Functions

1.  #### coef_logistic_reg
    
    1. Parameters:
        1. score: 'Score_assigned' or 'Score_invented'
        2. df: Cleaned DataFrame passed from by_year_plots
    
    2. Returns:
        1. List of coefficients to plot by year and feature
    
2. #### by_year_plots
    1. Parameters:
        1. score: 'Score_assigned' or 'Score_invented'
        2. df: df_for_regressions
    2. Returns:
        1. Nothing; displays one regression plot per feature (faceted by city group) and saves each plot as a PNG file


In [67]:
#Reads in scoring method and dataframe, returns list of coefficients to plot

def coef_logistic_reg(score, df):
    """Fit a logistic regression for one slice of data and return its coefficients.

    The target is 1 for rows whose ``score`` value is strictly above the 75th
    percentile of ``df[score]`` and 0 otherwise. The features are every column
    of ``df`` except 'Score_assigned', 'Score_invented', 'city_state' and 'Year'.

    Parameters
    ----------
    score : str
        Name of the score column used to build the target
        ('Score_assigned' or 'Score_invented').
    df : pandas.DataFrame
        Rows to fit on; must contain the four columns listed above.

    Returns
    -------
    list of float
        The first row of the fitted model's ``coef_``, one value per feature
        column, in the column order of the feature matrix.
    """
    # quantile(0.75) is the value describe() reports as '75%'; asking for it
    # directly avoids positional [-2] indexing on a label-indexed Series.
    score_75_perc = df[score].quantile(0.75)

    # Strictly-above-threshold rows are the positive class. Comparisons with
    # NaN evaluate to False, so missing scores land in class 0.
    y = (df[score] > score_75_perc).astype(int)

    X = df.drop(['Score_assigned', 'Score_invented', 'city_state', 'Year'], axis=1)

    # Only the training split is used; the held-out third is discarded.
    x_train, _, y_train, _ = train_test_split(X, y, test_size=0.33, random_state=3)

    # NOTE(review): `multi_class` is reportedly deprecated in recent scikit-learn
    # releases; it is kept here so the fitted coefficients do not change --
    # confirm against the installed version before upgrading.
    clf = LogisticRegression(random_state=0, solver='lbfgs',
                             multi_class='multinomial')
    clf.fit(x_train, y_train)
    return clf.coef_[0].tolist()

In [78]:
#Reads in scoring method and dataframe, returns all the plots divided into All cities, Bottom Cities and Top Cities

def by_year_plots(score, df):
    """Plot per-year logistic-regression coefficients for three groups of cities.

    For every year in the module-level ``years`` list, ``coef_logistic_reg`` is
    fitted on three slices of that year's rows: all cities, cities whose
    ``score`` is below the year's 75th percentile ("Bottom Cities"), and cities
    at or above it ("Top Cities"). For each feature, a seaborn ``lmplot`` of
    coefficient versus year (one facet per group) is saved as
    'logistic_regressions_<score>_<feature>.png' and then displayed.

    Parameters
    ----------
    score : str
        'Score_assigned' or 'Score_invented'. The same column is used both to
        split cities into groups and to build the regression target.
    df : pandas.DataFrame
        The regression-ready frame (e.g. ``df_for_regressions``).

    Returns
    -------
    None
    """
    feature_columns = ['award_mean', 'scaled_perc_creative_class',
                       'creative_establishment_ratio', 'company_count_perc',
                       'percent_bachelors', 'percent_graduate', 'percent_foreign_born',
                       'recipient_mean', 'performance_mean', 'empowerment_zone',
                       'median_household_income']

    # Each selector receives one year's rows and returns the slice to fit on.
    # The split uses `score` itself (not a fixed column), so the groups match
    # the scoring method that was requested.
    city_groups = {
        "All Cities": lambda year_df: year_df,
        "Bottom Cities": lambda year_df: year_df.loc[
            year_df[score] < year_df[score].quantile(0.75)],
        "Top Cities": lambda year_df: year_df.loc[
            year_df[score] >= year_df[score].quantile(0.75)],
    }

    group_frames = []
    for label, select in city_groups.items():
        yearly_coefs = [
            coef_logistic_reg(score, select(df.loc[df.Year == year]))
            for year in years
        ]
        group_df = pd.DataFrame.from_records(yearly_coefs, columns=feature_columns)
        group_df['Year'] = years
        group_df['which_cities'] = label
        group_frames.append(group_df)

    # Stack the three groups into one long frame for faceted plotting.
    coef_df = pd.concat(group_frames, ignore_index=True)

    coef_df = coef_df.rename(columns={
        'award_mean': 'SBIR Award Mean',
        'scaled_perc_creative_class': 'Percent Creative Class',
        'creative_establishment_ratio': 'Creative Establishment Ratio',
        'company_count_perc': 'Company Count Percent',
        'percent_bachelors': 'Percent Bachelors',
        'percent_graduate': 'Percent Graduate',
        'percent_foreign_born': 'Percent Foreign Born',
        'recipient_mean': 'Federal Funding Recipient Mean',
        'performance_mean': 'Federal Funding Performance Mean',
        'empowerment_zone': 'Empowerment Zone',
        'median_household_income': 'Median Household Income',
        'which_cities': 'Cities',
    })

    # Only the coefficient columns are plotted; 'Year' is the x axis and
    # 'Cities' is the facet/hue, so neither is a y variable.
    plot_columns = [col for col in coef_df.columns if col not in ('Year', 'Cities')]

    for col in plot_columns:
        try:
            grid = sns.lmplot(x="Year", y=col, data=coef_df, hue="Cities", col="Cities", aspect=1)
            y_axis = col + " Feature Importance"
            grid.set(xlabel='Year', ylabel=y_axis)
            filename = 'logistic_regressions_{0}_{1}.png'.format(score, col)
            # Save before showing so the file is written from the live figure.
            grid.savefig(filename)
            plt.show()
        except ValueError as err:
            # Best effort: keep plotting the remaining features, but report
            # which one was skipped instead of hiding the failure.
            print('Skipping plot for {0!r}: {1}'.format(col, err))
            continue
        
    

## 3. Plots

In [80]:
#by_year_plots('Score_assigned', df_for_regressions)

In [74]:
#by_year_plots('Score_invented', df_for_regressions)

## 4. Create Coefficient Values DataFrame

1.  #### get_coef_vals
    
    1. Parameters:
        1. score: 'Score_assigned' or 'Score_invented'
        2. df: df_for_regressions
    
    2. Returns:
        1. A DataFrame with one row per year and one column per feature coefficient, plus `Year` and `score` columns

In [33]:
def get_coef_vals(score, df):
    """Collect the all-cities logistic-regression coefficients for every year.

    Parameters
    ----------
    score : str
        'Score_assigned' or 'Score_invented'; forwarded to ``coef_logistic_reg``
        and recorded in the 'score' column of the result.
    df : pandas.DataFrame
        The regression-ready frame (e.g. ``df_for_regressions``).

    Returns
    -------
    pandas.DataFrame
        One row per entry of the module-level ``years`` list, one column per
        feature coefficient, plus 'Year' and 'score' columns.
    """
    feature_names = [
        'award_mean',
        'scaled_perc_creative_class',
        'creative_establishment_ratio',
        'company_count_perc',
        'percent_bachelors',
        'percent_graduate',
        'percent_foreign_born',
        'recipient_mean',
        'performance_mean',
        'empowerment_zone',
        'median_household_income',
    ]

    # One coefficient list per year, fitted on that year's rows only.
    yearly_coefs = [
        coef_logistic_reg(score, df.loc[df['Year'] == yr])
        for yr in years
    ]

    result = pd.DataFrame.from_records(yearly_coefs, columns=feature_names)
    result['Year'] = years
    result['score'] = score
    return result

In [34]:
# Compute the per-year coefficients for each scoring method, then stack both
# results into a single frame.
coef_vals_sa, coef_vals_si = (
    get_coef_vals(score_name, df_for_regressions)
    for score_name in ('Score_assigned', 'Score_invented')
)
coef_vals_df = pd.concat([coef_vals_sa, coef_vals_si], ignore_index=True)

In [35]:
# Persist the combined coefficients; the row index is written as the first
# column (index=True is the to_csv default, spelled out here).
coef_vals_df.to_csv('regression_results.csv', index=True)