In [1]:
# Import dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the gas production table from CSV and preview its first rows.
gas_prod_csv = '../data/clean_data/Gas Production - EJ-YearFixed-Python.csv'
gas_prod = pd.read_csv(gas_prod_csv)  # index_col=0 intentionally not applied
gas_prod.head()

Unnamed: 0,Year,Algeria,Argentina,Australia,Azerbaijan,Bahrain,Bangladesh,Bolivia,Brazil,Brunei,...,Turkmenistan,Ukraine,United Arab Emirates,United Kingdom,US,USSR,Uzbekistan,Venezuela,Vietnam,Yemen
0,1970,0.09,0.21,0.06,,0.02,,,,0.01,...,,,0.03,0.39,20.57,6.75,,0.31,,
1,1971,0.09,0.23,0.09,,0.03,,,,0.01,...,,,0.05,0.66,21.16,7.24,,0.3,,
2,1972,0.12,0.22,0.13,,0.04,0.02,0.04,0.01,0.02,...,,,0.05,0.95,21.09,7.55,,0.3,,
3,1973,0.16,0.24,0.17,,0.06,0.02,0.06,0.01,0.07,...,,,0.06,1.03,21.07,8.06,,0.37,,
4,1974,0.18,0.25,0.19,,0.07,0.02,0.06,0.01,0.15,...,,,0.06,1.24,20.14,8.89,,0.39,,


In [3]:
# Fill every missing (NaN) cell with 0.
# After this step a 0 may stand for "no data" rather than a measured zero.
clean_gas_prod = gas_prod.replace(to_replace=np.nan, value=0)
clean_gas_prod.tail()

Unnamed: 0,Year,Algeria,Argentina,Australia,Azerbaijan,Bahrain,Bangladesh,Bolivia,Brazil,Brunei,...,Turkmenistan,Ukraine,United Arab Emirates,United Kingdom,US,USSR,Uzbekistan,Venezuela,Vietnam,Yemen
45,2015,2.93,1.28,2.74,0.68,0.53,0.93,0.7,0.86,0.48,...,2.37,0.68,2.11,1.46,26.65,0.0,1.93,1.3,0.37,0.1
46,2016,3.29,1.34,3.47,0.66,0.52,0.95,0.68,0.87,0.47,...,2.28,0.68,2.17,1.5,26.18,0.0,1.91,1.34,0.37,0.02
47,2017,3.35,1.34,4.06,0.64,0.52,0.96,0.66,0.98,0.46,...,2.11,0.7,2.24,1.51,26.86,0.0,1.92,1.39,0.34,0.02
48,2018,3.38,1.42,4.68,0.69,0.53,0.96,0.61,0.91,0.45,...,2.21,0.71,2.21,1.46,30.09,0.0,2.06,1.14,0.35,0.02
49,2019,3.1,1.5,5.52,0.88,0.61,1.03,0.54,0.93,0.47,...,2.27,0.71,2.25,1.43,33.15,0.0,2.03,0.95,0.35,0.02


In [4]:
# Load world population by year; this supplies the 'Population' feature.
global_pop_csv = '../data/clean_data/WorldPopulationbyYear.csv'
global_pop = pd.read_csv(global_pop_csv)
global_pop.tail()

Unnamed: 0,Year,World
55,2015,7338964960
56,2016,7424282488
57,2017,7509065705
58,2018,7591932907
59,2019,7673533972


In [5]:
# Keep only 1970 onward so the population rows line up with gas_prod.
# (The later merge on "Year" would discard the earlier years anyway.)
from_1970 = global_pop["Year"] >= 1970
g_pop = global_pop.loc[from_1970]
g_pop.head()

Unnamed: 0,Year,World
10,1970,3682911039
11,1971,3760509002
12,1972,3836892580
13,1973,3912347640
14,1974,3988478324


In [6]:
# Join world population and gas production on their shared "Year" column.
gas_pop = pd.merge(g_pop, clean_gas_prod, on="Year")
gas_pop.head()

Unnamed: 0,Year,World,Algeria,Argentina,Australia,Azerbaijan,Bahrain,Bangladesh,Bolivia,Brazil,...,Turkmenistan,Ukraine,United Arab Emirates,United Kingdom,US,USSR,Uzbekistan,Venezuela,Vietnam,Yemen
0,1970,3682911039,0.09,0.21,0.06,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.03,0.39,20.57,6.75,0.0,0.31,0.0,0.0
1,1971,3760509002,0.09,0.23,0.09,0.0,0.03,0.0,0.0,0.0,...,0.0,0.0,0.05,0.66,21.16,7.24,0.0,0.3,0.0,0.0
2,1972,3836892580,0.12,0.22,0.13,0.0,0.04,0.02,0.04,0.01,...,0.0,0.0,0.05,0.95,21.09,7.55,0.0,0.3,0.0,0.0
3,1973,3912347640,0.16,0.24,0.17,0.0,0.06,0.02,0.06,0.01,...,0.0,0.0,0.06,1.03,21.07,8.06,0.0,0.37,0.0,0.0
4,1974,3988478324,0.18,0.25,0.19,0.0,0.07,0.02,0.06,0.01,...,0.0,0.0,0.06,1.24,20.14,8.89,0.0,0.39,0.0,0.0


In [7]:
# Give the population and world-total columns descriptive names.
# rename() leaves the frame unchanged for mapping keys that are not present.
column_labels = {"World": "Total Population", "Total World": "Total Exajoules"}
gas_pop = gas_pop.rename(columns=column_labels)
gas_pop.tail()

Unnamed: 0,Year,Total Population,Algeria,Argentina,Australia,Azerbaijan,Bahrain,Bangladesh,Bolivia,Brazil,...,Turkmenistan,Ukraine,United Arab Emirates,United Kingdom,US,USSR,Uzbekistan,Venezuela,Vietnam,Yemen
45,2015,7338964960,2.93,1.28,2.74,0.68,0.53,0.93,0.7,0.86,...,2.37,0.68,2.11,1.46,26.65,0.0,1.93,1.3,0.37,0.1
46,2016,7424282488,3.29,1.34,3.47,0.66,0.52,0.95,0.68,0.87,...,2.28,0.68,2.17,1.5,26.18,0.0,1.91,1.34,0.37,0.02
47,2017,7509065705,3.35,1.34,4.06,0.64,0.52,0.96,0.66,0.98,...,2.11,0.7,2.24,1.51,26.86,0.0,1.92,1.39,0.34,0.02
48,2018,7591932907,3.38,1.42,4.68,0.69,0.53,0.96,0.61,0.91,...,2.21,0.71,2.21,1.46,30.09,0.0,2.06,1.14,0.35,0.02
49,2019,7673533972,3.1,1.5,5.52,0.88,0.61,1.03,0.54,0.93,...,2.27,0.71,2.25,1.43,33.15,0.0,2.03,0.95,0.35,0.02


In [8]:
# 'GDP' feature: load the GDP (%) table from CSV.
# The file is decoded as ISO-8859-1.
gdp_csv = '../data/clean_data/GDP%-YearFixed-Python.csv'
gdp_percent = pd.read_csv(gdp_csv, encoding='ISO-8859-1')
gdp_percent.head()

Unnamed: 0,Year,Afghanistan,Albania,Algeria,American Samoa,Andorra,Angola,Antigua and Barbuda,Arab World,Argentina,...,Uzbekistan,Vanuatu,"Venezuela, RB",Vietnam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
0,1965,,,,,,,,,,...,,,,,,,,,,
1,1966,,,,,,,,,,...,,,,,,,,,,
2,1967,,,,,,,,,,...,,,,,,,,,,
3,1968,,,,,,,,,,...,,,,,,,,,,
4,1969,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Fill every missing (NaN) cell with 0.
# A 0 in the result may therefore mean "no data", not a measured 0 %.
clean_gdp_percent = gdp_percent.replace(to_replace=np.nan, value=0)
clean_gdp_percent.tail()

Unnamed: 0,Year,Afghanistan,Albania,Algeria,American Samoa,Andorra,Angola,Antigua and Barbuda,Arab World,Argentina,...,Uzbekistan,Vanuatu,"Venezuela, RB",Vietnam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
50,2015,-0.661709,1.896174,4.784447,0.0,0.0,-21.531694,0.968993,1.814077,0.0,...,0.0,2.483269,121.738085,0.631201,0.0,1.431611,1.39333,0.0,10.110593,-2.4095
51,2016,4.383892,1.275432,6.397695,0.0,0.0,32.377734,-0.489438,2.06884,0.0,...,0.0,0.842069,254.948535,2.668248,0.0,-0.219107,1.486007,0.0,17.86973,-1.566413
52,2017,4.975952,1.986661,5.591116,0.0,0.0,31.691686,2.432488,1.966826,0.0,...,0.0,3.084526,0.0,3.520257,0.0,0.212571,2.233522,0.0,6.577312,0.909733
53,2018,0.626149,2.02806,4.26999,0.0,0.0,20.190374,1.207158,2.458142,34.277224,...,0.0,2.330964,0.0,3.539628,0.0,-0.195108,2.458142,0.0,7.494572,0.0
54,2019,2.302373,1.411091,1.951768,0.0,0.0,17.14532,0.0,1.336016,53.548304,...,0.0,2.76252,0.0,2.795824,0.0,1.580183,2.318441,0.0,9.150316,0.0


In [10]:
# Keep Year plus the United States and World series, under clearer labels.
filtered_gdp = (
    clean_gdp_percent[['Year', 'United States', 'World']]
    .rename(columns={'United States': 'US GDP (%)', 'World': 'World GDP (%)'})
)
filtered_gdp.head()

Unnamed: 0,Year,US GDP %,World GDP %
0,1965,1.585169,0.0
1,1966,3.015075,0.0
2,1967,2.772786,0.0
3,1968,4.271796,0.0
4,1969,5.462386,0.0


In [11]:
# Attach the GDP columns to the population + gas frame, matching on "Year".
gas_prod_v1 = pd.merge(filtered_gdp, gas_pop, on="Year")
gas_prod_v1.head()

Unnamed: 0,Year,US GDP %,World GDP %,Total Population,Algeria,Argentina,Australia,Azerbaijan,Bahrain,Bangladesh,...,Turkmenistan,Ukraine,United Arab Emirates,United Kingdom,US,USSR,Uzbekistan,Venezuela,Vietnam,Yemen
0,1970,5.838255,0.0,3682911039,0.09,0.21,0.06,0.0,0.02,0.0,...,0.0,0.0,0.03,0.39,20.57,6.75,0.0,0.31,0.0,0.0
1,1971,4.292767,0.0,3760509002,0.09,0.23,0.09,0.0,0.03,0.0,...,0.0,0.0,0.05,0.66,21.16,7.24,0.0,0.3,0.0,0.0
2,1972,3.272278,0.0,3836892580,0.12,0.22,0.13,0.0,0.04,0.02,...,0.0,0.0,0.05,0.95,21.09,7.55,0.0,0.3,0.0,0.0
3,1973,6.17776,0.0,3912347640,0.16,0.24,0.17,0.0,0.06,0.02,...,0.0,0.0,0.06,1.03,21.07,8.06,0.0,0.37,0.0,0.0
4,1974,11.054805,0.0,3988478324,0.18,0.25,0.19,0.0,0.07,0.02,...,0.0,0.0,0.06,1.24,20.14,8.89,0.0,0.39,0.0,0.0


In [12]:
# 'Inflation' feature: load the annual inflation (%) table from CSV.
# NOTE(review): the recorded tail() output shows the same values as the GDP
# table's tail() -- confirm this CSV really holds inflation data.
inflation_csv = '../data/clean_data/InflationAnnual%-YearFixed-Python.csv'
inflation = pd.read_csv(inflation_csv, encoding='ISO-8859-1')
inflation.tail()

Unnamed: 0,Year,Afghanistan,Albania,Algeria,American Samoa,Andorra,Angola,Antigua and Barbuda,Arab World,Argentina,...,Uzbekistan,Vanuatu,"Venezuela, RB",Vietnam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
50,2015,-0.661709,1.896174,4.784447,,,-21.531694,0.968993,1.814077,,...,,2.483269,121.738085,0.631201,,1.431611,1.39333,,10.110593,-2.4095
51,2016,4.383892,1.275432,6.397695,,,32.377734,-0.489438,2.06884,,...,,0.842069,254.948535,2.668248,,-0.219107,1.486007,,17.86973,-1.566413
52,2017,4.975952,1.986661,5.591116,,,31.691686,2.432488,1.966826,,...,,3.084526,,3.520257,,0.212571,2.233522,,6.577312,0.909733
53,2018,0.626149,2.02806,4.26999,,,20.190374,1.207158,2.458142,34.277224,...,,2.330964,,3.539628,,-0.195108,2.458142,,7.494572,
54,2019,2.302373,1.411091,1.951768,,,17.14532,,1.336016,53.548304,...,,2.76252,,2.795824,,1.580183,2.318441,,9.150316,


In [16]:
# Fill every missing (NaN) cell with 0.
# A 0 in the result may therefore mean "no data", not a measured 0 %.
clean_inf = inflation.replace(to_replace=np.nan, value=0)
clean_inf.tail()

Unnamed: 0,Year,Afghanistan,Albania,Algeria,American Samoa,Andorra,Angola,Antigua and Barbuda,Arab World,Argentina,...,Uzbekistan,Vanuatu,"Venezuela, RB",Vietnam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
50,2015,-0.661709,1.896174,4.784447,0.0,0.0,-21.531694,0.968993,1.814077,0.0,...,0.0,2.483269,121.738085,0.631201,0.0,1.431611,1.39333,0.0,10.110593,-2.4095
51,2016,4.383892,1.275432,6.397695,0.0,0.0,32.377734,-0.489438,2.06884,0.0,...,0.0,0.842069,254.948535,2.668248,0.0,-0.219107,1.486007,0.0,17.86973,-1.566413
52,2017,4.975952,1.986661,5.591116,0.0,0.0,31.691686,2.432488,1.966826,0.0,...,0.0,3.084526,0.0,3.520257,0.0,0.212571,2.233522,0.0,6.577312,0.909733
53,2018,0.626149,2.02806,4.26999,0.0,0.0,20.190374,1.207158,2.458142,34.277224,...,0.0,2.330964,0.0,3.539628,0.0,-0.195108,2.458142,0.0,7.494572,0.0
54,2019,2.302373,1.411091,1.951768,0.0,0.0,17.14532,0.0,1.336016,53.548304,...,0.0,2.76252,0.0,2.795824,0.0,1.580183,2.318441,0.0,9.150316,0.0


In [17]:
# Keep Year plus the United States and World series, under clearer labels.
filtered_inf = (
    clean_inf[['Year', 'United States', 'World']]
    .rename(columns={'United States': 'US Inflation (Annual %)', 'World': 'World Inflation (Annual %)'})
)
filtered_inf.head()

Unnamed: 0,Year,US Inflation Annual %,World Inflation Annual %
0,1965,1.585169,0.0
1,1966,3.015075,0.0
2,1967,2.772786,0.0
3,1968,4.271796,0.0
4,1969,5.462386,0.0


In [19]:
# Attach the inflation columns to the running feature frame, matching on "Year".
gas_prod_v2 = pd.merge(filtered_inf, gas_prod_v1, on="Year")
gas_prod_v2.head()

Unnamed: 0,Year,US Inflation Annual %,World Inflation Annual %,US GDP %,World GDP %,Total Population,Algeria,Argentina,Australia,Azerbaijan,...,Turkmenistan,Ukraine,United Arab Emirates,United Kingdom,US,USSR,Uzbekistan,Venezuela,Vietnam,Yemen
0,1970,5.838255,0.0,5.838255,0.0,3682911039,0.09,0.21,0.06,0.0,...,0.0,0.0,0.03,0.39,20.57,6.75,0.0,0.31,0.0,0.0
1,1971,4.292767,0.0,4.292767,0.0,3760509002,0.09,0.23,0.09,0.0,...,0.0,0.0,0.05,0.66,21.16,7.24,0.0,0.3,0.0,0.0
2,1972,3.272278,0.0,3.272278,0.0,3836892580,0.12,0.22,0.13,0.0,...,0.0,0.0,0.05,0.95,21.09,7.55,0.0,0.3,0.0,0.0
3,1973,6.17776,0.0,6.17776,0.0,3912347640,0.16,0.24,0.17,0.0,...,0.0,0.0,0.06,1.03,21.07,8.06,0.0,0.37,0.0,0.0
4,1974,11.054805,0.0,11.054805,0.0,3988478324,0.18,0.25,0.19,0.0,...,0.0,0.0,0.06,1.24,20.14,8.89,0.0,0.39,0.0,0.0


In [22]:
# 'Taxes on goods & services' feature: load the taxes (%) table from CSV.
# The file is decoded as ISO-8859-1.
taxes_csv = '../data/clean_data/Taxes%-YearFixed-Python.csv'
taxes = pd.read_csv(taxes_csv, encoding='ISO-8859-1')
taxes.tail()

Unnamed: 0,Year,Afghanistan,Albania,Algeria,American Samoa,Andorra,Angola,Antigua and Barbuda,Arab World,Argentina,...,Uzbekistan,Vanuatu,"Venezuela, RB",Vietnam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
50,2015,2.654241,54.627694,,,,6.740552,,,28.000035,...,35.128015,37.645983,,,,,33.724915,,32.436842,46.878657
51,2016,7.055293,55.926813,,,,8.06309,,,27.241612,...,32.963595,36.204418,,,,,34.248831,,27.856806,46.781836
52,2017,8.598224,53.294789,,,,7.445482,,,30.252266,...,38.608093,37.694728,,,,,33.333664,,36.894517,47.064242
53,2018,,53.132719,,,,,,,33.34822,...,44.812047,35.406407,,,,,34.011405,,38.004259,41.234977
54,2019,,,,,,,,,,...,,,,,,,,,,


In [24]:
# Fill every missing (NaN) cell with 0.
# A 0 in the result may therefore mean "no data", not a measured 0 %.
clean_tax = taxes.replace(to_replace=np.nan, value=0)
clean_tax.tail()

Unnamed: 0,Year,Afghanistan,Albania,Algeria,American Samoa,Andorra,Angola,Antigua and Barbuda,Arab World,Argentina,...,Uzbekistan,Vanuatu,"Venezuela, RB",Vietnam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
50,2015,2.654241,54.627694,0.0,0.0,0.0,6.740552,0.0,0.0,28.000035,...,35.128015,37.645983,0.0,0.0,0.0,0.0,33.724915,0.0,32.436842,46.878657
51,2016,7.055293,55.926813,0.0,0.0,0.0,8.06309,0.0,0.0,27.241612,...,32.963595,36.204418,0.0,0.0,0.0,0.0,34.248831,0.0,27.856806,46.781836
52,2017,8.598224,53.294789,0.0,0.0,0.0,7.445482,0.0,0.0,30.252266,...,38.608093,37.694728,0.0,0.0,0.0,0.0,33.333664,0.0,36.894517,47.064242
53,2018,0.0,53.132719,0.0,0.0,0.0,0.0,0.0,0.0,33.34822,...,44.812047,35.406407,0.0,0.0,0.0,0.0,34.011405,0.0,38.004259,41.234977
54,2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Keep Year plus the United States and World series, under clearer labels.
filtered_tax = (
    clean_tax[['Year', 'United States', 'World']]
    .rename(columns={'United States': 'US Tax (% on Goods & Services)', 'World': 'World Tax (% on Goods & Services)'})
)
filtered_tax.head()

Unnamed: 0,Year,US Tax (% on Goods & Services),World Tax (% on Goods & Services)
0,1965,0.0,0.0
1,1966,0.0,0.0
2,1967,0.0,0.0
3,1968,0.0,0.0
4,1969,0.0,0.0


In [26]:
# Attach the tax columns to the running feature frame, matching on "Year".
final_gas_prod = pd.merge(filtered_tax, gas_prod_v2, on="Year")
final_gas_prod.head()

Unnamed: 0,Year,US Tax (% on Goods & Services),World Tax (% on Goods & Services),US Inflation Annual %,World Inflation Annual %,US GDP %,World GDP %,Total Population,Algeria,Argentina,...,Turkmenistan,Ukraine,United Arab Emirates,United Kingdom,US,USSR,Uzbekistan,Venezuela,Vietnam,Yemen
0,1970,0.0,0.0,5.838255,0.0,5.838255,0.0,3682911039,0.09,0.21,...,0.0,0.0,0.03,0.39,20.57,6.75,0.0,0.31,0.0,0.0
1,1971,0.0,0.0,4.292767,0.0,4.292767,0.0,3760509002,0.09,0.23,...,0.0,0.0,0.05,0.66,21.16,7.24,0.0,0.3,0.0,0.0
2,1972,7.143859,0.0,3.272278,0.0,3.272278,0.0,3836892580,0.12,0.22,...,0.0,0.0,0.05,0.95,21.09,7.55,0.0,0.3,0.0,0.0
3,1973,6.579487,0.0,6.17776,0.0,6.17776,0.0,3912347640,0.16,0.24,...,0.0,0.0,0.06,1.03,21.07,8.06,0.0,0.37,0.0,0.0
4,1974,5.990202,0.0,11.054805,0.0,11.054805,0.0,3988478324,0.18,0.25,...,0.0,0.0,0.06,1.24,20.14,8.89,0.0,0.39,0.0,0.0


In [None]:
# Build the feature (X) and target (y) arrays for the regression.
# scikit-learn expects 2-D input, hence the reshape into a single column.
#
# Rows 10..29 are taken (1980 through 1999, given the frame starts at 1970
# with one row per year); row 30 is left out so it can be predicted later.

X = gas_pop["Total Population"].iloc[10:30].to_numpy().reshape(-1, 1)
y = gas_pop["Total Exajoules"].iloc[10:30].to_numpy().reshape(-1, 1)

print("Shape: ", X.shape, y.shape)

In [None]:
# Side-by-side line charts of total gas production and world population
# against Year. A 1x2 grid is used so both panels fill the figure, and each
# panel carries a title and axis labels so it can be read on its own.
x = gas_pop['Year']
y1 = gas_pop['Total Exajoules']
y2 = gas_pop['Total Population']

fig, (ax_gas, ax_pop) = plt.subplots(1, 2, figsize=(20, 5))

ax_gas.plot(x, y1, color='b')
ax_gas.set(title='Gas Production by Year', xlabel='Year', ylabel='Total Exajoules')

ax_pop.plot(x, y2, color='g')
ax_pop.set(title='World Population by Year', xlabel='Year', ylabel='Total Population')

plt.show()

In [None]:
# Line chart of total gas production per year, to eyeball whether the trend
# looks linear. Title and axis labels are set on the returned Axes; the
# trailing semicolon suppresses the repr output.
ax = gas_pop.plot(kind="line", color="blue", x="Year", y="Total Exajoules")
ax.set(title="Gas Production by Year", xlabel="Year", ylabel="Total Exajoules");

In [None]:
# Line chart of world population per year, to eyeball whether the trend
# looks linear. Title and axis labels are set on the returned Axes; the
# trailing semicolon suppresses the repr output.
ax = gas_pop.plot(kind="line", color="green", x="Year", y="Total Population")
ax.set(title="World Population by Year", xlabel="Year", ylabel="Total Population");

In [None]:
# Use sklearn's `train_test_split` to split the data into training and testing.
# random_state=5 fixes the shuffle so the split is repeatable.
# NOTE(review): the fitting cell further down fits on the full X and y (its
# model.fit(X_train, y_train) line is commented out), so the split produced
# here is not used there.
from sklearn.model_selection import train_test_split

### BEGIN SOLUTION
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
### END SOLUTION

In [None]:
# Create the model
# `model` starts as an unfitted LinearRegression; later cells call model.fit()
# on it.

### BEGIN SOLUTION
from sklearn.linear_model import LinearRegression
model = LinearRegression()
### END SOLUTION

In [None]:
# World population at row label 30 (year 2000, given the frame starts at 1970
# with one row per year), reshaped to (1, 1) so it can be passed to predict().
world_gas_2000 = gas_pop.loc[30, 'Total Population'].reshape(-1, 1)
world_gas_2000.shape

In [None]:
# Observed total production at row label 30 (the same row as world_gas_2000),
# reshaped to (1, 1) to match the shape of the model's prediction.
exajoules_2000 = gas_pop.loc[30, 'Total Exajoules'].reshape(-1, 1)
exajoules_2000.shape

In [None]:
# Fit the regression on the full X / y arrays, then predict the held-out
# row (world_gas_2000) and show it next to the observed value.
from sklearn.metrics import mean_squared_error, r2_score

fitment = model.fit(X, y)
predicted = fitment.predict(world_gas_2000)

print(f"Prediction: {predicted}", f"Actual: {exajoules_2000}", sep="\n")

# The train/test-split variant is left disabled:
# model.fit(X_train, y_train)

In [None]:
# Run the fitted model over the total world population of every year in the frame.
all_population = gas_pop['Total Population'].to_numpy().reshape(-1, 1)
fitment.predict(all_population)

In [None]:
# Preview the 1980-2000 window (both end years included by between()).
in_window = gas_pop['Year'].between(1980, 2000)
gas_pop.loc[in_window, ['Total Population', 'Total Exajoules']]

In [None]:
# Rolling one-step-ahead forecasts of gas production from world population.
#
# Each pass selects a 21-year window [start, end] (between() includes both
# ends), fits on the first 20 years only and predicts the 21st (year `end`),
# so the year being predicted is never part of the training data. This
# mirrors the single-year check, which trains on rows 10..29 and predicts
# row 30.
#
# The window moves one year ahead per pass; with range(20) the target years
# run from 2000 through 2019.
# NOTE(review): the list names suggest 2001-2010 -- confirm the intended span.

predict0110 = []  # model prediction for each target year, each of shape (1, 1)
actual0110 = []   # observed value for the same year, each of shape (1, 1)

for year in range(20):
    start = 1980 + year
    end = 2000 + year

    xy = gas_pop.loc[gas_pop['Year'].between(start, end), ['Total Population', 'Total Exajoules']]

    # Rows 0..19 of the window are the training years.
    X = xy["Total Population"].iloc[:20].values.reshape(-1, 1)
    y = xy["Total Exajoules"].iloc[:20].values.reshape(-1, 1)
    fitment = model.fit(X, y)

    # Row 20 is the held-out target year; iloc[20] raises IndexError when the
    # window holds fewer than 21 rows.
    gas_prod_predict = fitment.predict(xy['Total Population'].iloc[20].reshape(-1, 1))
    print(gas_prod_predict)
    predict0110.append(gas_prod_predict)
    actual0110.append(xy['Total Exajoules'].iloc[20].reshape(-1, 1))

In [None]:
# Adding historical predictions to view comparison w/ actual values


In [None]:
# --------------- Divider between GAS PRODUCTION & GAS CONSUMPTION ---------------

In [None]:
# Load the gas consumption table from CSV and preview its first rows.
gas_csmp_csv = '../data/clean_data/Gas Consumption - EJ-YearFixed-Python.csv'
gas_csmp = pd.read_csv(gas_csmp_csv)  # index_col=0 intentionally not applied
gas_csmp.head()

In [None]:
# Fill every missing (NaN) cell with 0.
# After this step a 0 may stand for "no data" rather than a measured zero.
clean_gas_csmp = gas_csmp.replace(to_replace=np.nan, value=0)
clean_gas_csmp.tail()

In [None]:
# Join world population (the 'Population' feature) and gas consumption on
# their shared "Year" column, then display the merged frame.
gas_pop2 = pd.merge(g_pop, clean_gas_csmp, on="Year")
gas_pop2

In [None]:
# Give the population and world-total columns descriptive names.
# rename() leaves the frame unchanged for mapping keys that are not present.
column_labels = {"World": "Total Population", "Total World": "Total Exajoules"}
gas_pop2 = gas_pop2.rename(columns=column_labels)
gas_pop2

In [None]:
# Build the feature (X) and target (y) arrays for the consumption regression.
# scikit-learn expects 2-D input, hence the reshape into a single column.
# Rows 10..29 of gas_pop2 are taken.
# NOTE(review): presumably 1980 through 1999, as in the production frame --
# confirm that gas_pop2 also starts at 1970.

X = gas_pop2["Total Population"].iloc[10:30].to_numpy().reshape(-1, 1)
y = gas_pop2["Total Exajoules"].iloc[10:30].to_numpy().reshape(-1, 1)

print("Shape: ", X.shape, y.shape)

In [None]:
# Side-by-side line charts of total gas consumption and world population
# against Year. A 1x2 grid is used so both panels fill the figure, and each
# panel carries a title and axis labels so it can be read on its own.
x = gas_pop2['Year']
y1 = gas_pop2['Total Exajoules']
y2 = gas_pop2['Total Population']

fig, (ax_gas, ax_pop) = plt.subplots(1, 2, figsize=(20, 5))

ax_gas.plot(x, y1, color='b')
ax_gas.set(title='Gas Consumption by Year', xlabel='Year', ylabel='Total Exajoules')

ax_pop.plot(x, y2, color='g')
ax_pop.set(title='World Population by Year', xlabel='Year', ylabel='Total Population')

plt.show()

In [None]:
# Use sklearn's `train_test_split` to split the data into training and testing.
# random_state=5 fixes the shuffle so the split is repeatable.
# NOTE(review): the fitting cell further down fits on the full X and y (its
# model.fit(X_train, y_train) line is commented out), so the split produced
# here is not used there.
from sklearn.model_selection import train_test_split

### BEGIN SOLUTION
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
### END SOLUTION

In [None]:
# Create the model
# Rebinds `model` to a fresh, unfitted LinearRegression, replacing whatever
# `model` referred to before this cell ran.

### BEGIN SOLUTION
from sklearn.linear_model import LinearRegression
model = LinearRegression()
### END SOLUTION

In [None]:
# World population at row label 30 of gas_pop2, reshaped to (1, 1) so it can
# be passed to predict().
# NOTE(review): presumably year 2000 -- confirm gas_pop2 starts at 1970.
world_gas_2000 = gas_pop2.loc[30, 'Total Population'].reshape(-1, 1)
world_gas_2000.shape

In [None]:
# Observed total consumption at row label 30, reshaped to (1, 1) to match the
# shape of the model's prediction.
# This is read from gas_pop2 (the consumption frame) -- the same frame
# world_gas_2000 comes from -- so the consumption model's prediction is
# compared against a consumption value rather than a production value.
exajoules_2000 = gas_pop2['Total Exajoules'][30].reshape(-1, 1)
exajoules_2000.shape

In [None]:
# Fit the regression on the full X / y arrays, then predict the held-out
# row (world_gas_2000) and show it next to the observed value.
from sklearn.metrics import mean_squared_error, r2_score

fitment = model.fit(X, y)
predicted = fitment.predict(world_gas_2000)

print(f"Prediction: {predicted}", f"Actual: {exajoules_2000}", sep="\n")

# The train/test-split variant is left disabled:
# model.fit(X_train, y_train)

In [None]:
# Run the fitted model over the total world population of every year in the frame.
all_population = gas_pop2['Total Population'].to_numpy().reshape(-1, 1)
fitment.predict(all_population)

In [None]:
# Preview the 1980-2000 window (both end years included by between()).
in_window = gas_pop2['Year'].between(1980, 2000)
gas_pop2.loc[in_window, ['Total Population', 'Total Exajoules']]

In [None]:
# Rolling one-step-ahead forecasts of gas consumption from world population.
#
# Each pass selects a 21-year window [start, end] (between() includes both
# ends), fits on the first 20 years only and predicts the 21st (year `end`),
# so the year being predicted is never part of the training data. This
# mirrors the single-year check, which trains on rows 10..29 and predicts
# row 30.
#
# The window moves one year ahead per pass; with range(20) the target years
# run from 2000 through 2019.
# NOTE(review): the list names suggest 2001-2010 -- confirm the intended span.

predict0110 = []  # model prediction for each target year, each of shape (1, 1)
actual0110 = []   # observed value for the same year, each of shape (1, 1)

for year in range(20):
    start = 1980 + year
    end = 2000 + year

    xy = gas_pop2.loc[gas_pop2['Year'].between(start, end), ['Total Population', 'Total Exajoules']]

    # Rows 0..19 of the window are the training years.
    X = xy["Total Population"].iloc[:20].values.reshape(-1, 1)
    y = xy["Total Exajoules"].iloc[:20].values.reshape(-1, 1)
    fitment = model.fit(X, y)

    # Row 20 is the held-out target year; iloc[20] raises IndexError when the
    # window holds fewer than 21 rows.
    gas_csmp_predict = fitment.predict(xy['Total Population'].iloc[20].reshape(-1, 1))
    print(gas_csmp_predict)
    predict0110.append(gas_csmp_predict)
    actual0110.append(xy['Total Exajoules'].iloc[20].reshape(-1, 1))

In [None]:
# Adding historical predictions to view comparison w/ actual values
