In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier

In [2]:
fuel_data = pd.read_csv('C:\\Users\\Chukwunonso\\Documents\\Python\\HAMOYE\Week 1\\fuel_ferc1.csv')

In [3]:
fuel_data

Unnamed: 0,record_id,utility_id_ferc1,report_year,plant_name_ferc1,fuel_type_code_pudl,fuel_unit,fuel_qty_burned,fuel_mmbtu_per_unit,fuel_cost_per_unit_burned,fuel_cost_per_unit_delivered,fuel_cost_per_mmbtu
0,f1_fuel_1994_12_1_0_7,1,1994,rockport,coal,ton,5377489.0,16.590,18.59,18.53,1.121
1,f1_fuel_1994_12_1_0_10,1,1994,rockport total plant,coal,ton,10486945.0,16.592,18.58,18.53,1.120
2,f1_fuel_1994_12_2_0_1,2,1994,gorgas,coal,ton,2978683.0,24.130,39.72,38.12,1.650
3,f1_fuel_1994_12_2_0_7,2,1994,barry,coal,ton,3739484.0,23.950,47.21,45.99,1.970
4,f1_fuel_1994_12_2_0_10,2,1994,chickasaw,gas,mcf,40533.0,1.000,2.77,2.77,2.570
...,...,...,...,...,...,...,...,...,...,...,...
29518,f1_fuel_2018_12_12_0_13,12,2018,neil simpson ct #1,gas,mcf,18799.0,1.059,4.78,4.78,9.030
29519,f1_fuel_2018_12_12_1_1,12,2018,cheyenne prairie 58%,gas,mcf,806730.0,1.050,3.65,3.65,6.950
29520,f1_fuel_2018_12_12_1_10,12,2018,lange ct facility,gas,mcf,104554.0,1.060,4.77,4.77,8.990
29521,f1_fuel_2018_12_12_1_13,12,2018,wygen 3 bhp 52%,coal,ton,315945.0,16.108,3.06,14.76,1.110


Extracting the observations with no missing values which would be subsequently used to train the model

`pandas.notnull(obj)`
> Detect non-missing values for an array-like object.

In [4]:
fuel_interest = fuel_data[fuel_data.fuel_unit.notnull()][['fuel_type_code_pudl', 'fuel_unit']]

In [5]:
fuel_interest

Unnamed: 0,fuel_type_code_pudl,fuel_unit
0,coal,ton
1,coal,ton
2,coal,ton
3,coal,ton
4,gas,mcf
...,...,...
29518,gas,mcf
29519,gas,mcf
29520,gas,mcf
29521,coal,ton


In [6]:
fuel_type = {fuel: category for category, fuel in enumerate(fuel_interest.fuel_type_code_pudl.unique())}

In [7]:
fuel_type

{'coal': 0, 'gas': 1, 'nuclear': 2, 'oil': 3, 'waste': 4, 'other': 5}

In [8]:
def fuel_categorical(data):
    """
    This function converts the entries of the `fuel_type_code_pudl` variable from strings to nominal data
    
    :param
    data: array_like 
    `fuel_type_code_pudl` entries in the `fuel_interest` DataFrame
    
    :return:
    nominal equivalent of entry as defined in the `fuel_type` dictionary
    """
    if data in fuel_type.keys():
        data = fuel_type[data]
    return data

In [9]:
fuel_unit = {unit: category for category, unit in enumerate(fuel_interest.fuel_unit.unique())}

In [10]:
fuel_unit

{'ton': 0,
 'mcf': 1,
 'kgU': 2,
 'bbl': 3,
 'gramsU': 4,
 'mwdth': 5,
 'mmbtu': 6,
 'mwhth': 7,
 'gal': 8}

In [11]:
def unit_categorical(data):
    """
    This function converts the entries of the `fuel_unit` variable from strings to nominal data
    
    :param
    data: array_like 
    `fuel_unit` entries in the `fuel_interest` DataFrame
    
    :return:
    nominal equivalent of entry as defined in the `fuel_unit` dictionary
    """
    if data in fuel_unit.keys():
        data = fuel_unit[data]
    return data

In [12]:
fuel_interest.fuel_type_code_pudl = list(map(fuel_categorical, fuel_interest.fuel_type_code_pudl))

In [13]:
fuel_interest.fuel_unit = list(map(unit_categorical, fuel_interest.fuel_unit))

In [14]:
fuel_interest.nunique()

fuel_type_code_pudl    6
fuel_unit              9
dtype: int64

Converting the columns of interest into NumPy arrays as this is the base for the `sklearn` module

$X$ = ``fuel_interest.fuel_type_code_pudl`` is the predictor variable\
$y$ = ``fuel_interest.fuel_unit`` is the outcome variable

In [15]:
X = np.array(fuel_interest.fuel_type_code_pudl)

In [16]:
X.shape

(29343,)

In [17]:
y = np.array(fuel_interest.fuel_unit)

In [18]:
y.shape

(29343,)

In [19]:
from sklearn.model_selection import train_test_split

The data is split into training and testing sets to be able to measure the accuracy of our model with the test set

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0, shuffle = True)

In [21]:
knn = KNeighborsClassifier(n_neighbors=3)

In [22]:
knn.fit(X_train.reshape(-1, 1), y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [23]:
y_pred = knn.predict(X_test.reshape(-1, 1))

In [24]:
y_pred

array([1, 1, 3, ..., 3, 1, 0], dtype=int64)

In [25]:
print('Test set score: {:.2f}'.format(np.mean(y_pred == y_test)))

Test set score: 0.98


___
Our model has a score of 0.98 which is quite accurate
___

Predict the missing values in the fuel_data dataset

In [26]:
# Extract the observations from the `fuel_data` with missing values

fuel_missing = fuel_data[~fuel_data.index.isin(fuel_interest.index)][['fuel_type_code_pudl', 'fuel_unit']]

In [27]:
fuel_missing

Unnamed: 0,fuel_type_code_pudl,fuel_unit
66,oil,
152,nuclear,
277,other,
626,nuclear,
627,nuclear,
...,...,...
28413,gas,
28828,oil,
28830,oil,
29052,coal,


In [28]:
fuel_missing.fuel_type_code_pudl = list(map(fuel_categorical, fuel_missing.fuel_type_code_pudl))

In [29]:
fuel_missing

Unnamed: 0,fuel_type_code_pudl,fuel_unit
66,3,
152,2,
277,5,
626,2,
627,2,
...,...,...
28413,1,
28828,3,
28830,3,
29052,0,


In [30]:
X_missing = np.array(fuel_missing.fuel_type_code_pudl).reshape(-1, 1)

In [31]:
X_missing

array([[3],
       [2],
       [5],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [3],
       [2],
       [2],
       [2],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [5],
       [1],
       [4],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [4],
       [1],
       [4],
       [4],
       [0],
       [0],
       [0],
       [5],
       [1],
       [5],
       [2],
       [1],
       [5],
       [2],
       [1],
       [0],
       [1],
       [5],
       [5],
       [1],
       [0],
       [2],
       [1],
       [5],
       [2],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [5],
       [0],
       [5],
    

In [32]:
unit_pred = knn.predict(X_missing)

In [33]:
unit_pred

array([3, 4, 3, 4, 4, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 3, 4, 4, 4,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 1, 3, 4, 1, 3, 4, 1, 0, 1, 3, 3, 1,
       0, 4, 1, 3, 4, 1, 0, 0, 1, 0, 1, 1, 1, 1, 3, 0, 3, 0, 1, 0, 1, 1,
       4, 1, 3, 0, 1, 3, 4, 1, 3, 0, 3, 4, 1, 1, 3, 4, 0, 3, 0, 1, 0, 1,
       1, 1, 1, 3, 3, 1, 1, 3, 1, 3, 1, 3, 1, 3, 3, 1, 3, 3, 0, 3, 3, 3,
       3, 3, 3, 3, 0, 3, 1, 3, 3, 0, 3, 3, 3, 1, 0, 3, 3, 0, 3, 3, 3, 3,
       0, 3, 3, 0, 3, 3, 3, 1, 1, 1, 0, 3, 0, 4, 0, 3, 1, 1, 1, 0, 3, 1,
       3, 3, 0, 3], dtype=int64)

In [34]:
fuel_missing.fuel_unit = unit_pred.astype(str)

In [35]:
fuel_missing

Unnamed: 0,fuel_type_code_pudl,fuel_unit
66,3,3
152,2,4
277,5,3
626,2,4
627,2,4
...,...,...
28413,1,1
28828,3,3
28830,3,3
29052,0,0


In [36]:
rev_fuel_type = {category: fuel for category, fuel in enumerate(fuel_data.fuel_type_code_pudl.unique())}

In [37]:
rev_fuel_type

{0: 'coal', 1: 'gas', 2: 'nuclear', 3: 'oil', 4: 'waste', 5: 'other'}

In [38]:
rev_fuel_unit = {category: unit for category, unit in enumerate(fuel_data.fuel_unit.unique())}

In [39]:
rev_fuel_unit

{0: 'ton',
 1: 'mcf',
 2: 'kgU',
 3: 'bbl',
 4: 'gramsU',
 5: nan,
 6: 'mwdth',
 7: 'mmbtu',
 8: 'mwhth',
 9: 'gal'}

The subsequent funtions convert the nominal data in both columns of our DataFrame back to their appropriate values based on data contained in the `rev_fuel_type` and `rev_fuel_unit` __dicts__

In [40]:
def rev_fuel_categorical(data):
    output = str()
    if data in rev_fuel_type.keys():
        output = rev_fuel_type[data]
    return output

In [41]:
def rev_unit_categorical(data):
    output = str()
    if data in rev_fuel_unit.keys():
        output = rev_fuel_unit[data]
    return output

In [42]:
fuel_missing.fuel_type_code_pudl = list(map(rev_fuel_categorical, fuel_missing.fuel_type_code_pudl))

In [43]:
fuel_missing.fuel_unit = list(map(rev_unit_categorical, unit_pred))

In [44]:
fuel_missing

Unnamed: 0,fuel_type_code_pudl,fuel_unit
66,oil,bbl
152,nuclear,gramsU
277,other,bbl
626,nuclear,gramsU
627,nuclear,gramsU
...,...,...
28413,gas,mcf
28828,oil,bbl
28830,oil,bbl
29052,coal,ton
