## Collect and further clean the dataset for Machine Learning purposes.

In [1]:
# Import Pandas Dependencies.
import pandas as pd
import numpy as np
from pathlib import Path

# Import Matplotlib Dependencies.
import matplotlib.pyplot as plt

# Import Sklearn Dependencies.
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Read the cleaned geographic data CSV into a DataFrame, then show its first rows.
csv_path = Path('Working Resources/Cleaned_geographic_Data.csv')
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,GeoName,Description,2016,2017,2018,2019,2020
0,Alabama (Metropolitan Portion),Wage and salary employment,1643706,1664641,1686588,1711156,1643053
1,Alabama (Metropolitan Portion),Proprietors employment Farm proprietors employ...,17653,17142,16771,16389,16466
2,Alabama (Metropolitan Portion),Proprietors employmentNonfarm proprietors empl...,423880,432197,447698,442971,428638
3,Alabama (Metropolitan Portion),Farm employment,21352,21534,20490,19416,19872
4,Alabama (Metropolitan Portion),Nonfarm employment Private nonfarm employment ...,8693,7939,7353,7000,6494


In [3]:
# Count how many rows carry each distinct GeoName label.
geo_name_counts = df['GeoName'].value_counts()
geo_name_counts

Iowa (Metropolitan Portion)                 27
Kansas (Metropolitan Portion)               27
Michigan (Metropolitan Portion)             27
Texas (Metropolitan Portion)                27
New Jersey (Metropolitan Portion)           27
                                            ..
Oklahoma (Nonmetropolitan Portion) *        15
South Dakota (Nonmetropolitan Portion) *    14
Idaho (Nonmetropolitan Portion) *           14
Maryland (Nonmetropolitan Portion) *        11
North Dakota (Nonmetropolitan Portion) *    10
Name: GeoName, Length: 98, dtype: int64

In [4]:
# Split labels such as "Alabama (Metropolitan Portion)" at the first "(" into
# a state name and a portion label.
# n=1 caps the split at two pieces, so a label containing a second "(" cannot
# produce a third column.
geo_parts = df['GeoName'].str.split('(', n=1, expand=True)

# The text before "(" ends with a space ("Alabama "); strip it so State values
# (and anything later derived from them) carry no stray whitespace.
df['State'] = geo_parts[0].str.strip()

# Keep the portion label exactly as the split yields it,
# e.g. "Metropolitan Portion)" -- the trailing ")" is intentionally left in place.
df['GeoName'] = geo_parts[1]
df.head()

Unnamed: 0,GeoName,Description,2016,2017,2018,2019,2020,State
0,Metropolitan Portion),Wage and salary employment,1643706,1664641,1686588,1711156,1643053,Alabama
1,Metropolitan Portion),Proprietors employment Farm proprietors employ...,17653,17142,16771,16389,16466,Alabama
2,Metropolitan Portion),Proprietors employmentNonfarm proprietors empl...,423880,432197,447698,442971,428638,Alabama
3,Metropolitan Portion),Farm employment,21352,21534,20490,19416,19872,Alabama
4,Metropolitan Portion),Nonfarm employment Private nonfarm employment ...,8693,7939,7353,7000,6494,Alabama


In [5]:
# Put State first, followed by GeoName, Description and the yearly columns.
column_order = ['State', 'GeoName', 'Description', '2016', '2017', '2018', '2019', '2020']
df = df[column_order]
df

Unnamed: 0,State,GeoName,Description,2016,2017,2018,2019,2020
0,Alabama,Metropolitan Portion),Wage and salary employment,1643706,1664641,1686588,1711156,1643053
1,Alabama,Metropolitan Portion),Proprietors employment Farm proprietors employ...,17653,17142,16771,16389,16466
2,Alabama,Metropolitan Portion),Proprietors employmentNonfarm proprietors empl...,423880,432197,447698,442971,428638
3,Alabama,Metropolitan Portion),Farm employment,21352,21534,20490,19416,19872
4,Alabama,Metropolitan Portion),Nonfarm employment Private nonfarm employment ...,8693,7939,7353,7000,6494
...,...,...,...,...,...,...,...,...
2254,Wyoming,Nonmetropolitan Portion) *,Nonfarm employment Private nonfarm employment ...,12381,12314,12727,12546,11896
2255,Wyoming,Nonmetropolitan Portion) *,Government and government enterprises Federal ...,4166,4120,4101,4155,4285
2256,Wyoming,Nonmetropolitan Portion) *,Government and government enterprises Military,2118,2124,2092,2106,2075
2257,Wyoming,Nonmetropolitan Portion) *,Government and government enterprises State an...,10827,10529,10454,10090,9722


In [6]:
# Count the rows for each portion label now held in GeoName.
portion_counts = df['GeoName'].value_counts()
portion_counts

Metropolitan Portion)         1275
Nonmetropolitan Portion) *     984
Name: GeoName, dtype: int64

In [7]:
# Encode the portion label held in GeoName as an integer flag:
#   0 = 'Metropolitan Portion)', 1 = 'Nonmetropolitan Portion) *'.
portion_codes = {'Metropolitan Portion)': 0, 'Nonmetropolitan Portion) *': 1}

# Build the encoded column and bind it back explicitly. Calling
# .replace(..., inplace=True) on df['GeoName'] is a chained in-place update on a
# column selection: it can warn, and under pandas copy-on-write it leaves df
# unchanged. assign() returns a new frame, so the result is unambiguous.
df = df.assign(GeoName=df['GeoName'].replace(portion_codes))

# Display.
df

Unnamed: 0,State,GeoName,Description,2016,2017,2018,2019,2020
0,Alabama,0,Wage and salary employment,1643706,1664641,1686588,1711156,1643053
1,Alabama,0,Proprietors employment Farm proprietors employ...,17653,17142,16771,16389,16466
2,Alabama,0,Proprietors employmentNonfarm proprietors empl...,423880,432197,447698,442971,428638
3,Alabama,0,Farm employment,21352,21534,20490,19416,19872
4,Alabama,0,Nonfarm employment Private nonfarm employment ...,8693,7939,7353,7000,6494
...,...,...,...,...,...,...,...,...
2254,Wyoming,1,Nonfarm employment Private nonfarm employment ...,12381,12314,12727,12546,11896
2255,Wyoming,1,Government and government enterprises Federal ...,4166,4120,4101,4155,4285
2256,Wyoming,1,Government and government enterprises Military,2118,2124,2092,2106,2075
2257,Wyoming,1,Government and government enterprises State an...,10827,10529,10454,10090,9722


## Determine which variable to predict: y (the dependent variable) and X (the independent variables).

In [8]:
# Use the 2020 column as the target; every remaining column becomes a feature.
target_column = '2020'
y = df[target_column]
X = df.drop(columns=target_column)

In [9]:
# One-hot encode the State and Description columns of the feature frame.
categorical_columns = ['State', 'Description']
X = pd.get_dummies(X, columns=categorical_columns)

# Report the (rows, columns) shape after encoding.
print(X.shape)

# Preview the encoded feature frame that will be passed to the model.
X.head()

(2259, 83)


Unnamed: 0,GeoName,2016,2017,2018,2019,State_Alabama,State_Alaska,State_Arizona,State_Arkansas,State_California,...,Description_Nonfarm employment Private nonfarm employment Other services (except government and government enterprises),"Description_Nonfarm employment Private nonfarm employment Professional, scientific, and technical services",Description_Nonfarm employment Private nonfarm employment Real estate and rental and leasing,Description_Nonfarm employment Private nonfarm employment Retail trade,Description_Nonfarm employment Private nonfarm employment Transportation and warehousing,Description_Nonfarm employment Private nonfarm employment Utilities,Description_Nonfarm employment Private nonfarm employment Wholesale trade,Description_Proprietors employment Farm proprietors employment,Description_Proprietors employmentNonfarm proprietors employment 2/,Description_Wage and salary employment
0,0,1643706,1664641,1686588,1711156,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,17653,17142,16771,16389,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,423880,432197,447698,442971,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,21352,21534,20490,19416,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,8693,7939,7353,7000,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Write the encoded feature frame to a CSV file for later reference.
ml_export_path = 'MachineLearning_GDP_By_State_geographic_data.csv'
X.to_csv(ml_export_path)

In [11]:
# Split features and target into training and testing subsets;
# the fixed random_state keeps the split repeatable between runs.
split_parts = train_test_split(X, y, random_state=66)
X_train, X_test, y_train, y_test = split_parts

In [12]:
# Print the shape of each subset, in the order X_train, X_test, y_train, y_test.
for subset in (X_train, X_test, y_train, y_test):
    print(subset.shape)

(1694, 83)
(565, 83)
(1694,)
(565,)


In [13]:
# Standardize the features.
# The scaler is fitted on the training split only and kept in a variable, so the
# test split is transformed with the same parameters learned from the training
# data rather than being refit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Peek at the first five rows of the scaled training array.
X_train_scaled[0:5]

array([[ 1.13134706, -0.27226636, -0.27328396, -0.271182  , -0.26900395,
        -0.147353  , -0.13653208, -0.1409523 , -0.147353  , -0.13875846,
        -0.14943062, -0.13875846, -0.10068341, -0.08794028, -0.13653208,
        -0.1555114 , -0.1409523 , -0.13427154, -0.15350881, -0.14943062,
        -0.14524822, -0.14524822, -0.14524822, -0.14311509, -0.147353  ,
        -0.11203705, -0.14524822, -0.1409523 , -0.1555114 , -0.1409523 ,
        -0.15749091, -0.12964074, -0.13875846, -0.15350881, -0.14524822,
        -0.10650485, -0.13653208, -0.14943062, -0.147353  , -0.12726649,
        -0.14943062, -0.14311509, -0.1409523 , -0.14943062, -0.10930431,
        -0.1409523 , -0.12726649,  6.34957268, -0.15749091, -0.14524822,
        -0.13427154, -0.1555114 , -0.16329932, -0.147353  , -0.14524822,
        -0.13875846, -0.21821789, -0.20606198, -0.20606198, -0.19815096,
        -0.18996217, -0.20134682, -0.19327328, -0.18996217, -0.1997543 ,
        -0.19491074, -0.20292877, -0.13427154, -0.1

In [17]:
# Sanity check on the first scaled feature column: print its mean, then its
# standard deviation (expected to be close to 0 and 1 after standardization).
first_feature = X_train_scaled[:, 0]
print(first_feature.mean())
print(first_feature.std())

5.662530656883915e-17
0.9999999999999999
