In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn

## Notes for group:

- For regression, do we only want to use 'response' and year, or include other variables (i.e. predict differently for the sources in power generated/consumed, or combine all into one prediction)? Answer: keep as is.

- For dataframes with month do we include it? or just summarize to a yearly avg (will do both)

- Some correlation matrices are pretty useless... do we keep all? (I would say yes, just to show how year correlates with what we are predicting.) Answer: yes.

- On correlation: some of the correlations between year and the variable being predicted are ~0, so the regression model is ineffective for those datasets. Anything we want to do about that? (We could skip the regression for that dataset, or include other features.) Answer: no.

- Any other plots to include? Could be something just exploratory (histogram of count, etc.) or something related to regression (residuals, CI, etc.). Answer: histogram.

- What do we want to display to summarize regression models? I was thinking R^2 score and the plot of predictions vs true values.

### Reading in and viewing all data frames:

In [None]:
# Load the earth temperature CSV from clean_data/, using its first column as the index,
# then preview the first rows.
earthtemp = pd.read_csv('clean_data/earthtemp.csv', index_col = 0)
earthtemp.head()

In [None]:
# Load the population CSV from clean_data/, using its first column as the index.
# Need to deal with the total column here
population = pd.read_csv("clean_data/population.csv", index_col = 0)
# TODO (pending): convert 'Total' to an integer dtype once the column is cleaned, e.g.
# population['Total'] = population['Total'].astype(int)

# Only the last expression of a cell is rendered automatically, so the preview
# has to be shown explicitly; the dtypes are then displayed as the cell output.
display(population.head())
population.dtypes

In [None]:
# Load the power consumption CSV from clean_data/, using its first column as the index,
# then preview the first rows.
powercons=pd.read_csv('clean_data/powercon.csv', index_col = 0)
powercons.head()

In [None]:
# Load the power generation CSV from clean_data/, using its first column as the index,
# then preview the first rows.
powergen = pd.read_csv('clean_data/powergen.csv', index_col = 0)
powergen.head()

In [None]:
# Load the precipitation CSV from clean_data/, using its first column as the index,
# then preview the first rows.
precipitation = pd.read_csv('clean_data/precipitation.csv',index_col = 0)
precipitation.head()

## Exploratory Data Analysis:
For our exploratory data analysis we will view the correlations of the features within each dataset.

In [None]:
# some code from: https://seaborn.pydata.org/generated/seaborn.heatmap.html
def plot_corr(df):
    """Draw an annotated heatmap of the pairwise correlations between columns of ``df``.

    Parameters:
    df: pd.DataFrame -> the dataframe whose columns are correlated and plotted

    Only numeric and boolean columns take part in the correlation. Text columns
    cannot be correlated, and recent pandas versions raise an error from
    ``DataFrame.corr`` when they are present instead of skipping them.

    Returns None; the heatmap is drawn on the current matplotlib axes.
    """
    # Drop non-numeric columns up front so the result is the same on every pandas version.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    sn.heatmap(numeric_df.corr(), cmap = 'magma', annot = True)

In [None]:
# Correlation heatmap for the earth temperature data
plot_corr(earthtemp)

We see a correlation between temperature and month; however, there is a near-zero correlation between year and temperature. This is about as expected, since we know temperature changes greatly with month.

In [None]:
# Correlation heatmap for the population data
plot_corr(population)

In [None]:
# Correlation heatmap for the power consumption data
plot_corr(powercons)

In [None]:
# Correlation heatmap for the power generation data
plot_corr(powergen)

In [None]:
# Correlation heatmap for the precipitation data
plot_corr(precipitation)

Here we see a meaningful correlation between minimum temperature and maximum temperature. Besides this, the rest of the dataset shows only weak correlations.

In [None]:
def plot_bar(data, cat_column, num_column):
    """Draw a bar chart of the mean of ``num_column`` for each value of ``cat_column``.

    Parameters:
    data: pd.DataFrame -> data frame to plot
    cat_column: String -> name of the (categorical) column to group by
    num_column: String -> name of the (numerical) column to average per group

    Returns None; the bars are drawn on the current matplotlib axes, with the
    two column names used as the axis labels.
    """
    # One mean per category, indexed by the category value.
    category_means = data.groupby(cat_column)[num_column].mean()

    plt.bar(category_means.index, category_means)
    plt.xlabel(cat_column)
    plt.ylabel(num_column)
    
    

In [None]:
# Plotting power consumed by source (mean of 'value' for each 'source')
plot_bar(powercons, 'source', 'value')

In [None]:
# Plotting power generated by source (mean of 'value' for each 'source')
plot_bar(powergen, 'source', 'value')

In [None]:
# Plotting average precipitation by station (mean of 'ONE_DAY_PRECIPITATION' for each 'Station')
plot_bar(precipitation, 'Station', 'ONE_DAY_PRECIPITATION')

# Analysis:
We perform linear regression on each of the data frames and their response variables, predicting each up to the year 2030. We can then compare the future results to see whether the power generated and consumed is truly sustainable.

In [None]:
# some code from https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
def fit_year_regression_model(data, response):
    """Fit an ordinary least-squares line of ``response`` against ``Year``.

    Params:
    data: pd.DataFrame -> dataframe to regress on; must contain a 'Year' column
    response: string -> the name of the response variable (a column of ``data``)

    Returns a tuple (model, score, preds):
    model: the fitted LinearRegression instance
    score: the R^2 score of the model on the same data it was fitted to
    preds: np.array of predictions, one per year from 2014 to 2034 inclusive
    """
    model = LinearRegression()
    X = data[['Year']]
    y = data[response]
    model.fit(X, y)

    # In-sample R^2: measures fit quality on the training data, not predictive accuracy.
    score = model.score(X, y)

    # Predict from a DataFrame carrying the same 'Year' column name used for fitting;
    # passing a bare array to a model fitted on named columns makes scikit-learn
    # warn about missing feature names.
    new_years = pd.DataFrame({'Year': np.arange(2014, 2035, 1)})
    preds = model.predict(new_years)

    return model, score, preds

In [None]:
def plot_old_and_preds(data, preds, model, response):
    """Plot the model's fitted/predicted line together with the observed data.

    Params:
    data: pd.DataFrame -> observed data; must contain 'Year' and ``response`` columns
    preds: np.array -> not used; kept only so existing calls keep working. The
        plotted line is always recomputed from ``model`` over the full year range.
    model: LinearRegression instance -> the model fit to the data. Assumed to have
        been fitted on a single column named 'Year', as fit_year_regression_model does.
    response: String -> name of response variable

    Predicts values for each year from 2005 to 2034 inclusive using the fitted
    model, then plots that line (blue) against the observed values (red).

    Returns None; the figure is displayed with plt.show().
    """
    years = np.arange(2005, 2035, 1)
    # Use a named 'Year' column so the input matches what the model was fitted on
    # and scikit-learn does not warn about missing feature names.
    fitted_line = model.predict(pd.DataFrame({'Year': years}))

    data_years = data['Year'].values
    data_vals = data[response].values

    plt.plot(years, fitted_line, color = 'blue', label = 'Prediction')
    plt.plot(data_years, data_vals, color = 'red', label = 'Real Data')
    plt.xlabel("Year")
    plt.ylabel(response)
    plt.legend()

    plt.show()
    

In [None]:
def group_df_by(df, column):
    """Average ``column`` within each year.

    Parameters:
    df: pd.DataFrame -> dataframe to group; rows are grouped on "Year"
    column: String -> name of the column to average

    Returns a dataframe with one row per year and two columns: "Year" and
    the per-year mean of ``column``.
    """
    yearly_mean = df.groupby("Year")[column].mean()
    # Move "Year" from the index back into an ordinary column.
    return yearly_mean.reset_index()


In [None]:
# Earth temperature: mean of 'AverageTemperature' within each Year
et_grouped = group_df_by(earthtemp, 'AverageTemperature')

In [None]:
# Fitting temperature model on the yearly averages; returns the fitted model,
# its R^2 score and the predictions, which are displayed below
et_model, et_score, et_preds = fit_year_regression_model(et_grouped, 'AverageTemperature')
et_preds

In [None]:
# Plotting temperature model: fitted line against the yearly average temperatures
plot_old_and_preds(et_grouped, et_preds, et_model, 'AverageTemperature')

In [None]:
# Population dataset: average 'Total' per Year, fit the year regression,
# and display the resulting predictions
pop_grouped = group_df_by(population, 'Total')
pop_model, pop_score, pop_preds = fit_year_regression_model(pop_grouped, 'Total')
pop_preds

In [None]:
# Plotting population model: fitted line against the yearly 'Total' values
plot_old_and_preds(pop_grouped, pop_preds, pop_model, 'Total')

In [None]:
# Power Generated dataset: average 'value' per Year, fit the year regression,
# and display the resulting predictions
pg_grouped = group_df_by(powergen, 'value')
pg_model, pg_score, pg_preds = fit_year_regression_model(pg_grouped, 'value')
pg_preds

In [None]:
# Plotting power generated model: fitted line against the yearly average 'value'
plot_old_and_preds(pg_grouped, pg_preds, pg_model, 'value')

In [None]:
# Power Consumed dataset: average 'value' per Year, fit the year regression,
# and display the resulting predictions
pc_grouped = group_df_by(powercons, 'value')
pc_model, pc_score, pc_preds = fit_year_regression_model(pc_grouped, 'value')
pc_preds

In [None]:
# Plotting power consumed model: fitted line against the yearly average 'value'
plot_old_and_preds(pc_grouped, pc_preds, pc_model, 'value')

In [None]:
# Combining predictions and true data in order to compare the trends over time. Here we look at power generated/
# consumed vs population to see if it is 'sustainable'

# Give each dataset's value column a distinct name so the three can sit side by side.
pop_year_total = pop_grouped[['Year', 'Total']].rename(columns = {'Total':'pop_total'})
pg_year_total = pg_grouped[['Year', 'value']].rename(columns = {'value':'pg_value'})
pc_year_total = pc_grouped[['Year', 'value']].rename(columns = {'value':'pc_value'})

# merge() defaults to an inner join, so only years present in all three datasets are kept.
pop_pg = pop_year_total.merge(pg_year_total, on = 'Year')
pop_pg_pc = pop_pg.merge(pc_year_total, on = 'Year')

# Must match the range of years the regression predictions were generated for.
years = np.arange(2014,2035,1)

new_preds_df = pd.DataFrame({'pc_value':pc_preds, 'pg_value':pg_preds, 'pop_total':pop_preds, 'Year':years})

# ignore_index gives the combined frame a fresh 0..n-1 index; without it the
# observed rows and the predicted rows would share the same index labels.
full_df = pd.concat([pop_pg_pc, new_preds_df], ignore_index = True)

