In [None]:
# Use seaborn's white-grid theme for the figure
sns.set_style('whitegrid')

# Work on a copy of the dataframe, ordered by the RUCC column
df_copy = df.copy().sort_values(by=['RUCC'])

# Reshape to long format: one row per original row and education level,
# with the level's name in 'education_level' and its value in 'percentage'
education_columns = ['less_than_high_school', 'high_school_diploma',
                     'college/associate_degree', 'bachelors/higher']
df_melt = df_copy.melt(id_vars='RUCC', value_vars=education_columns,
                       var_name='education_level', value_name='percentage')

# Draw one group of boxes per education level, with one box per RUCC inside each group
figure, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=df_melt, x='education_level', y='percentage', hue='RUCC',
            width=0.6, fliersize=2, ax=ax)

# Legend, axis range, title and label formatting
ax.legend(title='RUCC', fontsize=16, loc='lower left', bbox_to_anchor=(0.47, 0.55))
ax.set_ylim(0, 80)
ax.set_title(
    'Box plot of the four education levels of counties across the nine rural urban continuum codes (RUCC) in US',
    fontsize=20, y=1.02)
ax.set_xlabel('education level', fontsize=20)
ax.set_ylabel('percentage', fontsize=20)
ax.tick_params(axis='both', labelsize=20)
plt.show()

In [None]:
# In the Story Telling section, it was shown that the correlations between poverty and the majority of the rural-urban
# continuum codes, i.e. all except RUCC 1 and 6, are close to each other. Therefore, any of these seven RUCC (2-5, 7-9)
# could be picked as the base category to simplify the interpretation. I pick RUCC 9 and remove it.

In [None]:
# Build the linear model with OLS command
X2 = sm.add_constant(X) #OLS by default does not have intercept
model = sm.OLS(y,X2).fit()

# Significance threshold: only predictors whose p-value is below it are kept
P_VALUE_THRESHOLD = 0.001

# Two rounds of elimination. In each round keep only the predictors whose p-value in the
# current model is below the threshold (i.e. drop those with p-value >= threshold), refit
# on the reduced design matrix, and print the summary of the refitted model.
# NOTE(review): the intercept column added by add_constant goes through the same filter and
# would be dropped if its p-value is not below the threshold - confirm this is intended.
for _ in range(2):
    significant = model.pvalues[model.pvalues < P_VALUE_THRESHOLD].index
    X_new = X2.loc[:, significant]
    model = sm.OLS(y, X_new).fit()
    # A bare `model.summary()` in the middle of a cell displays nothing, so print it explicitly
    print(model.summary())

# Plot residuals vs. fitted values
figure,ax = plt.subplots(figsize=(10,6))
ax.scatter(model.fittedvalues,model.resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax.set_title('Residuals vs. fitted values',y=1.02)
ax.axis([0,45,-35,35])

# Plot quantile plot of the residuals
figure,ax = plt.subplots(figsize=(10,6))
stats.probplot(model.resid,plot=ax)
ax.set_ylabel('Residual quantiles')
ax.set_title('Quantile-Quantile plot',y=1.02)
ax.axis([-4,4,-35,35])

# Show plots
plt.show()

In [None]:
# Get the influence measures of the fitted model and wrap the externally studentized
# residuals in a pandas Series (built without an explicit index, so its labels are 0..n-1)
infl = model.get_influence()
p = pd.Series(infl.resid_studentized_external)

# Split the labels by the magnitude of the externally studentized residual:
# more than 3 in absolute value -> outlier, at most 3 -> kept
abs_resid = np.abs(p)
indices = p[abs_resid > 3].index
inlier_positions = p[abs_resid <= 3].index

# Rows of the original dataframe flagged as outliers, and the predictors / response
# restricted to the remaining rows (all selected by position)
# NOTE(review): this assumes df, X_new and y share the same row order - confirm.
df_outliers = df.iloc[indices]
X_reduced = X_new.iloc[inlier_positions]
y_reduced = y.iloc[inlier_positions]

In [1]:
# Import DummyRegressor
from sklearn.dummy import DummyRegressor

# Build the baseline (dummy) regressor; with strategy='mean' it is configured to
# predict the mean of the training targets. Note: despite the name `clf`, this is
# a regressor, not a classifier.
clf = DummyRegressor(strategy = 'mean')

In [None]:
# Create the dataframe of dummy variables representing the nine RUCC and concatenate it
# column-wise to df_copy (the copy of the original dataframe)
ohe = OneHotEncoder()
RUCC_matrix = ohe.fit_transform(df.RUCC.values.reshape(-1,1)).toarray()

# The encoded rows are in the row order of df, so label them with df's index.
# pd.concat(axis=1) aligns on index labels; without this the dummies would get a fresh
# 0..n-1 index and be misaligned (or produce NaN rows) whenever df's index is not the
# default RangeIndex.
# NOTE(review): the column names assume the encoder finds exactly the codes 1..9 - confirm.
RUCC_df = pd.DataFrame(RUCC_matrix,
                       columns = ['RUCC_'+str(i) for i in np.arange(1,10)],
                       index = df.index)
df_concat = pd.concat([df_copy,RUCC_df],axis = 1)

In [None]:
# Turn `region` into an ordered categorical (South < Northeast < Midwest < West)
# and sort the dataframe in place by that order.
region_order = ["South","Northeast","Midwest","West"]
df['region'] = pd.Categorical(df['region'], categories=region_order, ordered=True)
df.sort_values(by='region', inplace=True)

In [None]:
# Rows of df_concat with no value in '2013 Rural-urban Continuum Code', renumbered from 0
missing_rucc = df_concat[df_concat['2013 Rural-urban Continuum Code'].isnull()].reset_index(drop=True)

# Temporarily lift the row/column display limits so the whole frame is printed
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(missing_rucc)

In [6]:
# Reshape from long to wide: one row per month, one column per year, passenger counts as values.
# The arguments are passed by keyword because positional arguments to DataFrame.pivot
# are deprecated and rejected by pandas 2.x.
# Note: this rebinds `flights`, so the cell cannot be re-run without reloading the long-format data.
flights = flights.pivot(index="month", columns="year", values="passengers")
flights.head()

year,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
January,112,115,145,171,196,204,242,284,315,340,360,417
February,118,126,150,180,196,188,233,277,301,318,342,391
March,132,141,178,193,236,235,267,317,356,362,406,419
April,129,135,163,181,235,227,269,313,348,348,396,461
May,121,125,172,183,229,234,270,318,355,363,420,472


In [None]:
# Median of more_than_high_school for every (region, RUCC) pair: regions as rows, RUCC as columns
print('Cross tabulation: median of more_than_high_school in each region-RUCC pair')
df_copy.pivot_table(values='more_than_high_school', index='region', columns='RUCC', aggfunc='median')

In [None]:
# Group by RUCC and take the average of the three indicators.
# The columns are selected before aggregating so that only these three are averaged;
# averaging every column first fails on non-numeric columns in pandas 2.x and does needless work.
indicator_columns = ['high_school_or_less','poverty','unemployment']
df_group = df.groupby('RUCC')[indicator_columns].mean()

# Plot the average of high school or less, poverty, and unemployment across the nine RUCC
plt.plot(df_group,'--D')
plt.legend(df_group.columns,loc='lower left',bbox_to_anchor=(1,0))
plt.xlabel('RUCC',fontsize = 20)
plt.ylabel('percentage',fontsize = 20)
plt.title('Average percentage of population with high school diploma or less,\n '
          'poverty and unemployment across nine RUCC',fontsize = 20)
plt.show()

In [None]:
#Define clustering function
def _append_cluster_dummies(X, labels, n_cluster):
    """Return X extended with dummy columns for `labels`, with the last cluster's dummy dropped."""
    # Declare every possible cluster id as a category so the same dummy columns are produced
    # even when some cluster has no member in X (otherwise train and test could end up with
    # different columns, or the drop below could raise KeyError). The categories are sorted
    # as strings to keep the column order of encoding a plain string column.
    categories = sorted(str(c) for c in range(n_cluster))
    cluster = pd.Series(
        pd.Categorical([str(label) for label in labels], categories=categories),
        index=X.index,
        name='cluster',
    )
    extended = pd.get_dummies(pd.concat([X, cluster], axis=1))
    # Drop the dummy of the last cluster (n_cluster - 1)
    return extended.drop(columns=['cluster_{}'.format(n_cluster - 1)])


def clustering(X_train,X_test,n_cluster):
    """Add KMeans cluster membership as dummy features to the train and test sets.

    A KMeans model with `n_cluster` clusters (random_state=21) is fitted on `X_train` only.
    The predicted cluster of every row of `X_train` and `X_test` is then appended as dummy
    columns named ``cluster_<id>``; the dummy of cluster ``n_cluster - 1`` is dropped.
    Both outputs always carry the same cluster columns, even if a cluster has no member
    in one of the sets.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables; their indexes are preserved in the outputs.
    n_cluster : int
        Number of KMeans clusters.

    Returns
    -------
    (X_train_extended, X_test_extended) : tuple of pandas.DataFrame
        The inputs with the cluster dummy columns appended.
    """
    # Fit the clusters on the train set only, so the test set does not influence them
    kmeans = KMeans(n_clusters=n_cluster,random_state=21)
    kmeans.fit(X_train)

    X_train_extended = _append_cluster_dummies(X_train, kmeans.predict(X_train), n_cluster)
    X_test_extended = _append_cluster_dummies(X_test, kmeans.predict(X_test), n_cluster)

    # Return the new features DataFrame for both train and test sets
    return X_train_extended,X_test_extended

In [None]:
def cluster_forest_SearchCV(cluster_numbers,n_estimators,min_samples_leaf,criterion,max_features,X_train,y_train):
    """Grid-search the number of clusters and random-forest settings with 5-fold cross-validation.

    For every fold and every combination of the candidate values, the fold's train and
    held-out parts are extended with cluster dummy features via ``clustering``, a
    ``RandomForestRegressor`` (random_state=21) is fitted on the train part, and its
    ``score`` on the held-out part is recorded. The scores are averaged over the folds,
    and the best average score and the parameters that produced it are printed.

    Parameters
    ----------
    cluster_numbers, n_estimators, min_samples_leaf, criterion, max_features : sequence
        Candidate values for the number of clusters and for the random-forest
        parameters of the same names.
    X_train, y_train : pandas objects
        Training features and target; rows are selected by position with ``.iloc``.
    """
    # Number of cross-validation folds; also the first dimension of the score array
    n_splits = 5
    kf = KFold(n_splits = n_splits)

    # One score per (fold, n_cluster, n_estimators, min_samples_leaf, criterion, max_features)
    grid_shape = (len(cluster_numbers),len(n_estimators),len(min_samples_leaf),len(criterion),len(max_features))
    scores = np.zeros(shape = (n_splits,) + grid_shape)

    # Loop over the folds of the train set
    for fold, (train_index, test_index) in enumerate(kf.split(X_train)):
        X_train_kf, X_test_kf = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_kf, y_test_kf = y_train.iloc[train_index], y_train.iloc[test_index]

        for ci, n_cluster in enumerate(cluster_numbers):
            # The cluster features depend only on the fold and the number of clusters,
            # so they are built once here and reused for every forest configuration
            X_train_kf_extended, X_test_kf_extended = clustering(X_train_kf, X_test_kf, n_cluster)

            for ei, n_estimator in enumerate(n_estimators):
                for li, min_sample_leaf in enumerate(min_samples_leaf):
                    for ki, criteria in enumerate(criterion):
                        for fi, max_feature in enumerate(max_features):
                            # Fit the forest on the fold's train part and score it on the held-out part
                            model = RandomForestRegressor(
                                n_estimators = n_estimator,
                                min_samples_leaf = min_sample_leaf,
                                criterion = criteria,
                                max_features = max_feature,
                                random_state=21,
                            )
                            model.fit(X_train_kf_extended, y_train_kf)
                            scores[fold, ci, ei, li, ki, fi] = model.score(X_test_kf_extended, y_test_kf)

    # Average the scores over the folds and take the highest average
    average_score = scores.mean(axis = 0)
    max_score = average_score.max()

    # Map the position of the best average score back to the parameter values
    best_index = np.unravel_index(average_score.argmax(), average_score.shape)
    cluster_number = cluster_numbers[best_index[0]]
    n_estimator = n_estimators[best_index[1]]
    min_sample_leaf = min_samples_leaf[best_index[2]]
    criteria = criterion[best_index[3]]
    max_feature = max_features[best_index[4]]

    # Print the best score of parameter tuning and best parameters
    print('The best score is %.3f'%max_score)
    print('\nThe best parameters are:')
    print('Number of clusters: ',cluster_number)
    print('n_estimators: ',n_estimator)
    print('min_samples_leaf: ',min_sample_leaf)
    print('criterion: ',criteria)
    print('max_features: ',max_feature)