In [2]:
from statsmodels.regression.linear_model import OLS
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import sqlite3
import pandas as pd
# Raise the column display limit so wide frames are shown in full.
pd.options.display.max_columns = 500

# Import Data

In [3]:
# Open the SQLite database file "eia" (a path relative to the working directory).
conn = sqlite3.connect("eia")
# Cursor on that connection; the cells below use it to run SQL.
cursor = conn.cursor()


In [4]:
# Load every row of the combined2 table into a DataFrame.
# Column names come from the cursor metadata and are handed to the constructor
# directly: assigning df.columns afterwards fails with a length mismatch when
# the query returns no rows, because the empty frame then has zero columns.
cursor.execute("""select * from combined2""")
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[description[0] for description in cursor.description],
)

In [5]:
# Discard the column named "index" that came back from the table.
df = df.drop("index", axis="columns")

In [6]:
# Build a "<year>-<month>" key with the month left-padded to two characters.
# The values are read straight from the two columns instead of going through
# DataFrame.apply(axis=1), which constructs a Series for every row and is very
# slow on a frame of this size.
df["year_month"] = [
    str(year) + "-" + str(month).zfill(2)
    for year, month in zip(df["year"], df["month"])
]

# Cleaning new fields

In [7]:
# Strip thousands separators from total_fuel_consumption_mmbtu and store it as int.
# This column is also listed in fields_to_clean, so the loops further down
# process it again.
df["total_fuel_consumption_mmbtu"] = (
    df["total_fuel_consumption_mmbtu"]
    .map(lambda raw: str(raw).replace(",", ""))
    .astype(int)
)

In [8]:
def none_to_zero(x):
    """Map the placeholder strings "None" and "." to 0.

    Any other value is returned unchanged. Only the *strings* are matched;
    an actual ``None`` object passes through untouched.
    """
    return 0 if x in ("None", ".") else x

In [9]:
# Columns that the following cells strip of commas, zero-fill and cast to int.
fields_to_clean = [
    "total_fuel_consumption_quantity",
    "electric_fuel_consumption_quantity",
    "total_fuel_consumption_mmbtu",
    "elec_fuel_consumption_mmbtu",
    "net_generation_megawatthours",
    "quantity_gen",
    "elec_quantity_gen",
    "tot_mmbtu_gen",
    "elec_mmbtu_gen",
    "netgen_gen",
]


In [10]:
# Turn every value in the listed columns into text with the commas removed.
for fld in fields_to_clean:
    df[fld] = df[fld].map(lambda raw: str(raw).replace(",", ""))

In [11]:
# Replace the "None" / "." placeholder strings with 0 in each listed column.
for fld in fields_to_clean:
    df[fld] = df[fld].map(none_to_zero)


In [12]:
# Second stringify-and-strip pass over the listed columns. Besides removing
# commas it turns the integer 0 written by none_to_zero back into the text "0".
# NOTE(review): this repeats the comma-stripping loop from cell In [10]; the
# int cast in the next cell may not need it -- confirm before removing.
for fld in fields_to_clean:
    df[fld] = df[fld].map(lambda raw: str(raw).replace(",", ""))


In [13]:
# Cast all listed columns to int in a single astype call.
df = df.astype({name: int for name in fields_to_clean})

In [14]:
# Rows without a fuel_cost are of no use to us, so keep only rows that have one.
df = df.loc[df["fuel_cost"].notna()].copy()

In [15]:
def try_float(x):
    """Convert a raw value to float, treating blank placeholders as 0.

    Returns 0 when ``x`` is ``None``, the empty string or ".". Every other
    value is passed to ``float()``, so input that ``float()`` cannot parse
    still raises ``ValueError`` or ``TypeError``.
    """
    # Identity test for None: `x == None` defers to the operand's __eq__,
    # which is not guaranteed to return a plain bool.
    if x is None or x in ("", "."):
        return 0
    return float(x)
# Run try_float over every value of mmbtu_per_unit_gen.
df["mmbtu_per_unit_gen"] = df["mmbtu_per_unit_gen"].map(try_float)

In [16]:
# Make sure mmbtu_per_unit_gen is stored with a float dtype.
df = df.astype({"mmbtu_per_unit_gen": float})

In [17]:
# Remove the literal "operator_" text from each operator_id string.
df["operator_id"] = df["operator_id"].map(
    lambda op_id: op_id.replace("operator_", "")
)

In [18]:
# Parse each operator_id as a float, round it, and keep the integer result.
df["operator_id"] = [int(round(float(op_id))) for op_id in df["operator_id"]]

In [20]:
# Try to cast every column whose name contains "_id" to int.
# A column that cannot be cast is left as it is and its name is printed so it
# can be handled separately.
for col in df.columns:
    if "_id" not in col:
        continue
    try:
        df[col] = df[col].astype(int)
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit are not swallowed by this loop.
        print(col)

coalmine_msha_id
nuclear_unit_id
operator_id_gen


In [23]:
# Fill missing values with 0 in the three id columns printed by the int-cast cell.
for col in ("coalmine_msha_id", "nuclear_unit_id", "operator_id_gen"):
    df[col] = df[col].fillna(value=0)



In [28]:
# For category columns, switch None values to N/A
def none_to_na(x):
    """Return the string "N/A" when ``x`` is ``None``; otherwise return ``x`` unchanged."""
    # Identity test: `x == None` defers to the operand's __eq__, which can
    # misreport objects with a permissive or element-wise equality.
    return "N/A" if x is None else x
for col in (
    "contract_type",
    "natural_gas_delivery_contract_type",
    "natural_gas_supply_contract_type",
    "natural_gas_transportation_service",
    "primary_transportation_mode",
    "secondary_transportation_mode",
    "nerc_region",
    "purchase_type",
):
    # Swap None for the "N/A" label in this category column.
    df[col] = df[col].map(none_to_na)

In [29]:
# Store every column whose name contains "_id" as text; the name of any column
# that cannot be converted is printed.
# NOTE(review): columns that were not successfully cast to int earlier keep
# their previous dtype, so float ids would be rendered like "123.0" -- confirm
# that is intended.
for col in df.columns:
    if "_id" not in col:
        continue
    try:
        df[col] = df[col].astype(str)
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit are not swallowed by this loop.
        print(col)

In [30]:
# Print the dtype and non-null count of every column in the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440666 entries, 0 to 532619
Data columns (total 65 columns):
average_ash_content                   440666 non-null float64
average_heat_content                  440666 non-null float64
average_mercury_content               193405 non-null float64
average_sulfur_content                440666 non-null float64
chlorine_content                      9106 non-null float64
coalmine_county                       137172 non-null object
coalmine_msha_id                      440666 non-null object
coalmine_name                         136322 non-null object
coalmine_state                        138337 non-null object
coalmine_type                         138122 non-null object
contract_expiration_date              340240 non-null object
contract_type                         440666 non-null object
energy_source                         440666 non-null object
fuel_cost                             440666 non-null float64
fuel_group                     

In [33]:
# Remove any previous copy of the output table. IF EXISTS keeps this cell from
# raising on a database where combined4 has not been created yet.
cursor.execute("""drop table if exists combined4""")

<sqlite3.Cursor at 0x1a1e2dd340>

In [32]:
# Write the cleaned frame to the combined4 table. if_exists="replace" lets the
# cell be re-run when the table is already present (the default, "fail", raises).
# The DataFrame index is still written as a column, as before.
df.to_sql("combined4", conn, if_exists="replace")

In [25]:
# Number of rows in df at the time this cell was run.
len(df)

440666